From 7a215285464eb818fd340643fe2ef67ecaf59808 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 4 Dec 2019 22:28:32 +0100 Subject: [PATCH 01/29] Go back to forward declare WaitingTask --- FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h index 44f7b1ca14944..ccd99247e501e 100644 --- a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h +++ b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h @@ -24,9 +24,8 @@ #include "tbb/task_arena.h" -#include "FWCore/Concurrency/interface/WaitingTask.h" - namespace edm { + class WaitingTask; class WaitingTaskHolder; class WaitingTaskWithArenaHolder { From 0f42a7e63c208cc47b91a188cfd9e84335310ecb Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 4 Dec 2019 22:30:04 +0100 Subject: [PATCH 02/29] Fix comment --- HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 758218bb958a2..70539006d2563 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -31,7 +31,7 @@ namespace impl { const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } protected: - // The constructors set the current device device, but the device + // The constructors set the current device, but the device // is not set back to the previous value at the destructor. This // should be sufficient (and tiny bit faster) as all CUDA API // functions relying on the current device should be called from From 6e5543ce1bf6992076198c16671fb6d22148e0d8 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 5 Dec 2019 18:32:46 +0100 Subject: [PATCH 03/29] Enable CUDA for compute capability 3.5 --- HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp index d901e1850bceb..eb21f22cd0c5c 100644 --- a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -22,7 +22,7 @@ int main() { cudaDeviceProp properties; cudaGetDeviceProperties(&properties, i); - if (properties.major < minimumMajor) { + if ((not(properties.major == 3 and properties.minor == 5)) and properties.major < minimumMajor) { return EXIT_FAILURE; } } From 66effb194873a74554fc6e31764b5517c70d146b Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 5 Dec 2019 20:26:48 +0100 Subject: [PATCH 04/29] Add CUDA existence protections to BuildFiles --- CUDADataFormats/Common/BuildFile.xml | 4 ++++ CUDADataFormats/Common/test/BuildFile.xml | 2 ++ HeterogeneousCore/CUDACore/BuildFile.xml | 4 ++++ HeterogeneousCore/CUDACore/test/BuildFile.xml | 2 ++ HeterogeneousCore/CUDAServices/BuildFile.xml | 4 ++++ HeterogeneousCore/CUDAServices/bin/BuildFile.xml | 2 ++ HeterogeneousCore/CUDAServices/plugins/BuildFile.xml | 3 +++ HeterogeneousCore/CUDAServices/test/BuildFile.xml | 6 ++++-- HeterogeneousCore/CUDATest/BuildFile.xml | 4 ++++ HeterogeneousCore/CUDATest/plugins/BuildFile.xml | 3 +++ HeterogeneousCore/CUDATest/test/BuildFile.xml | 2 ++ HeterogeneousCore/CUDAUtilities/BuildFile.xml | 4 ++++ 
HeterogeneousCore/CUDAUtilities/test/BuildFile.xml | 2 ++ 13 files changed, 40 insertions(+), 2 deletions(-) diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml index 98033aab4d99d..b867319b8276f 100644 --- a/CUDADataFormats/Common/BuildFile.xml +++ b/CUDADataFormats/Common/BuildFile.xml @@ -1,3 +1,7 @@ + + + + diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml index 5e804fe80a736..30b84c9ce2f22 100644 --- a/CUDADataFormats/Common/test/BuildFile.xml +++ b/CUDADataFormats/Common/test/BuildFile.xml @@ -1,5 +1,7 @@ + + diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml index d78c8a28f0470..15233466d447b 100644 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -1,3 +1,7 @@ + + + + diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml index a6f34c70e8822..474f3cee1b3e8 100644 --- a/HeterogeneousCore/CUDACore/test/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -1,3 +1,4 @@ + @@ -14,3 +15,4 @@ + diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml index 9320cad14f285..e1ddaff2ada4c 100644 --- a/HeterogeneousCore/CUDAServices/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -1,3 +1,7 @@ + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml index 041ed25ba134a..68e32b64b4032 100644 --- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml @@ -1,3 +1,4 @@ + @@ -5,3 +6,4 @@ + diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml index 81d4f20331ce3..188edb442d0cc 100644 --- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml @@ -1,3 +1,6 @@ + + + diff --git a/HeterogeneousCore/CUDAServices/test/BuildFile.xml b/HeterogeneousCore/CUDAServices/test/BuildFile.xml index 8697cb61fb40a..020b3736617c5 100644 --- a/HeterogeneousCore/CUDAServices/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/test/BuildFile.xml @@ -1,5 +1,7 @@ - - + + + + diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml index 112c200812d98..eb6a1cf31e2e6 100644 --- a/HeterogeneousCore/CUDATest/BuildFile.xml +++ b/HeterogeneousCore/CUDATest/BuildFile.xml @@ -1,3 +1,7 @@ + + + + diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml index b53d247aa6129..e0b63df02befa 100644 --- a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml @@ -1,3 +1,6 @@ + + + diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml index 3287d65c14470..424eb6862be79 100644 --- a/HeterogeneousCore/CUDATest/test/BuildFile.xml +++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml @@ -1,3 +1,4 @@ + @@ -8,3 +9,4 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 4528e0288d64f..b0abbab26f103 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1,3 +1,7 @@ + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml 
b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index a700c0865f0f2..60961d42999d4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -1,5 +1,6 @@ + @@ -81,3 +82,4 @@ + From 417d558d43636734440d3dd6031c58050e258c94 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 5 Dec 2019 21:36:30 +0100 Subject: [PATCH 05/29] Clean up CUDAService - Remove obsolete comment - Make computeCapability() const - Remove unnecessary ActivityRegistry argument from constructor - Replace C-style cast with static_cast - Remove test_main.cc as unnecessary --- .../CUDAServices/interface/CUDAService.h | 12 ++---------- .../CUDAServices/plugins/plugins.cc | 2 +- .../CUDAServices/src/CUDAService.cc | 6 ++++-- .../CUDAServices/test/testCUDAService.cpp | 16 +++++----------- HeterogeneousCore/CUDAServices/test/test_main.cc | 2 -- 5 files changed, 12 insertions(+), 26 deletions(-) delete mode 100644 HeterogeneousCore/CUDAServices/test/test_main.cc diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 625ce40fdcdc9..5295af75513b0 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -12,17 +12,9 @@ namespace edm { class ConfigurationDescriptions; } // namespace edm -/** - * TODO: - * - CUDA stream management? - * * Not really needed until we want to pass CUDA stream objects from one module to another - * * Which is not really needed until we want to go for "streaming mode" - * * Until that framework's inter-module synchronization is safe (but not necessarily optimal) - * - Management of (preallocated) memory? - */ class CUDAService { public: - CUDAService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + CUDAService(edm::ParameterSet const& iConfig); ~CUDAService(); static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -32,7 +24,7 @@ class CUDAService { int numberOfDevices() const { return numberOfDevices_; } // major, minor - std::pair computeCapability(int device) { return computeCapabilities_.at(device); } + std::pair computeCapability(int device) const { return computeCapabilities_.at(device); } // Returns the id of device with most free memory. If none is found, returns -1. 
int deviceWithMostFreeMemory() const; diff --git a/HeterogeneousCore/CUDAServices/plugins/plugins.cc b/HeterogeneousCore/CUDAServices/plugins/plugins.cc index d8aefa42e9c99..120ca10cb13ce 100644 --- a/HeterogeneousCore/CUDAServices/plugins/plugins.cc +++ b/HeterogeneousCore/CUDAServices/plugins/plugins.cc @@ -1,4 +1,4 @@ #include "FWCore/ServiceRegistry/interface/ServiceMaker.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -DEFINE_FWK_SERVICE(CUDAService); +DEFINE_FWK_SERVICE_MAKER(CUDAService, edm::serviceregistry::ParameterSetMaker); diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 1568e5bb508eb..dbc32c6911564 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -119,7 +119,7 @@ namespace { } // namespace /// Constructor -CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) { +CUDAService::CUDAService(edm::ParameterSet const& config) { bool configEnabled = config.getUntrackedParameter("enabled"); if (not configEnabled) { edm::LogInfo("CUDAService") << "CUDAService disabled by configuration"; @@ -167,7 +167,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& "exclusive (single process)", // cudaComputeModeExclusiveProcess "unknown"}; log << " compute mode:" << std::right << std::setw(27) - << computeModeDescription[std::min(properties.computeMode, (int)std::size(computeModeDescription) - 1)] << '\n'; + << computeModeDescription[std::min(properties.computeMode, + static_cast(std::size(computeModeDescription)) - 1)] + << '\n'; // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort() cudaCheck(cudaSetDevice(i)); diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index 88e9508b7206c..572e077606b0b 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -10,23 +10,20 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" namespace { - CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { + CUDAService makeCUDAService(edm::ParameterSet ps) { auto desc = edm::ConfigurationDescriptions("Service", "CUDAService"); CUDAService::fillDescriptions(desc); desc.validate(ps, "CUDAService"); - return CUDAService(ps, ar); + return CUDAService(ps); } } // namespace TEST_CASE("Tests of CUDAService", "[CUDAService]") { - edm::ActivityRegistry ar; - // Test setup: check if a simple CUDA runtime API call fails: // if so, skip the test with the CUDAService enabled int deviceCount = 0; @@ -41,7 +38,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { edm::ParameterSet ps; ps.addUntrackedParameter("enabled", true); SECTION("Enabled only if there are CUDA capable GPUs") { - auto cs = makeCUDAService(ps, ar); + auto cs = makeCUDAService(ps); if (deviceCount <= 0) { REQUIRE(cs.enabled() == false); WARN("CUDAService is disabled as there are no CUDA GPU devices"); @@ -55,7 +52,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { 
return; } - auto cs = makeCUDAService(ps, ar); + auto cs = makeCUDAService(ps); SECTION("CUDA Queries") { int driverVersion = 0, runtimeVersion = 0; @@ -116,11 +113,8 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { SECTION("Force to be disabled") { edm::ParameterSet ps; ps.addUntrackedParameter("enabled", false); - auto cs = makeCUDAService(ps, ar); + auto cs = makeCUDAService(ps); REQUIRE(cs.enabled() == false); REQUIRE(cs.numberOfDevices() == 0); } - - //Fake the end-of-job signal. - ar.postEndJobSignal_(); } diff --git a/HeterogeneousCore/CUDAServices/test/test_main.cc b/HeterogeneousCore/CUDAServices/test/test_main.cc deleted file mode 100644 index 0c7c351f437f5..0000000000000 --- a/HeterogeneousCore/CUDAServices/test/test_main.cc +++ /dev/null @@ -1,2 +0,0 @@ -#define CATCH_CONFIG_MAIN -#include "catch.hpp" From 2decd6ab52d168eb99e65df0a156f0ebfa017dfd Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 5 Dec 2019 22:45:06 +0100 Subject: [PATCH 06/29] Use iftool instead of ifarchitecture --- CUDADataFormats/Common/BuildFile.xml | 6 ++---- HeterogeneousCore/CUDACore/BuildFile.xml | 6 ++---- HeterogeneousCore/CUDAServices/BuildFile.xml | 6 ++---- HeterogeneousCore/CUDAServices/plugins/BuildFile.xml | 5 ++--- HeterogeneousCore/CUDATest/BuildFile.xml | 6 ++---- HeterogeneousCore/CUDATest/plugins/BuildFile.xml | 5 ++--- HeterogeneousCore/CUDAUtilities/BuildFile.xml | 6 ++---- 7 files changed, 14 insertions(+), 26 deletions(-) diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml index b867319b8276f..e7a5ba74d80be 100644 --- a/CUDADataFormats/Common/BuildFile.xml +++ b/CUDADataFormats/Common/BuildFile.xml @@ -1,9 +1,7 @@ - - - - + + diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml index 15233466d447b..a10567db16edb 100644 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -1,7 +1,4 @@ - - - - + @@ -14,3 +11,4 @@ + diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml index e1ddaff2ada4c..4983b36f38e83 100644 --- a/HeterogeneousCore/CUDAServices/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -1,7 +1,4 @@ - - - - + @@ -13,3 +10,4 @@ + diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml index 188edb442d0cc..95857d74e7dfa 100644 --- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml @@ -1,6 +1,4 @@ - - - + @@ -16,3 +14,4 @@ + diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml index eb6a1cf31e2e6..84905be5e2db8 100644 --- a/HeterogeneousCore/CUDATest/BuildFile.xml +++ b/HeterogeneousCore/CUDATest/BuildFile.xml @@ -1,7 +1,5 @@ - - - - + + diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml index e0b63df02befa..13f9ef6c06cd2 100644 --- a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml @@ -1,6 +1,4 @@ - - - + @@ -10,3 +8,4 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index b0abbab26f103..4a8e644a231f6 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1,7 +1,4 @@ - - - - + @@ -10,3 +7,4 @@ + From 29430c4fc7640cebe87f27abe08c8d00b1068a89 Mon Sep 17 00:00:00 2001 From: 
Matti Kortelainen Date: Thu, 5 Dec 2019 22:46:19 +0100 Subject: [PATCH 07/29] CUDAServices does not directly depend on cub --- HeterogeneousCore/CUDAServices/BuildFile.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml index 4983b36f38e83..c2d566baf5c1b 100644 --- a/HeterogeneousCore/CUDAServices/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -5,7 +5,6 @@ - From fc90c2c929b4a59077e39a5436accabf528a3d32 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 13 Dec 2019 18:32:01 +0100 Subject: [PATCH 08/29] Clean up CUDATest - explicit CUDAThing constructor - Make EDModule member variables const when possible - Add descriptions.setComment() where one was missing - Use EDPutToken and ctx.emplace() in TestCUDAProducerGPUFirst - Simplify test config file and the runner script --- .../CUDATest/interface/CUDAThing.h | 2 +- .../CUDATest/plugins/TestCUDAAnalyzerGPU.cc | 20 ++++++------ .../CUDATest/plugins/TestCUDAProducerCPU.cc | 14 ++++----- .../CUDATest/plugins/TestCUDAProducerGPU.cc | 18 +++++------ .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 31 +++++++++++-------- .../plugins/TestCUDAProducerGPUEWTask.cc | 31 ++++++++++++------- .../plugins/TestCUDAProducerGPUFirst.cc | 19 ++++++------ .../plugins/TestCUDAProducerGPUtoCPU.cc | 24 +++++++------- HeterogeneousCore/CUDATest/test/BuildFile.xml | 5 +-- .../CUDATest/test/TestCUDATest.cc | 3 -- HeterogeneousCore/CUDATest/test/runtests.sh | 10 +++--- .../CUDATest/test/testCUDASwitch_cfg.py | 8 ++--- 12 files changed, 93 insertions(+), 92 deletions(-) delete mode 100644 HeterogeneousCore/CUDATest/test/TestCUDATest.cc diff --git a/HeterogeneousCore/CUDATest/interface/CUDAThing.h b/HeterogeneousCore/CUDATest/interface/CUDAThing.h index 1ef6c2a7238cc..f8559a4f86b41 100644 --- a/HeterogeneousCore/CUDATest/interface/CUDAThing.h +++ b/HeterogeneousCore/CUDATest/interface/CUDAThing.h @@ -6,7 +6,7 @@ class CUDAThing { public: CUDAThing() = default; - CUDAThing(cudautils::device::unique_ptr ptr) : ptr_(std::move(ptr)) {} + explicit CUDAThing(cudautils::device::unique_ptr ptr) : ptr_(std::move(ptr)) {} const float *get() const { return ptr_.get(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc index 6d708cb0833af..e38c596fbe2f5 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc @@ -16,23 +16,23 @@ class TestCUDAAnalyzerGPU : public edm::global::EDAnalyzer<> { public: - explicit TestCUDAAnalyzerGPU(const edm::ParameterSet& iConfig); + explicit TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig); ~TestCUDAAnalyzerGPU() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void analyze(edm::StreamID, const edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + void analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const override; void endJob() override; private: - std::string label_; - edm::EDGetTokenT> srcToken_; - double minValue_; - double maxValue_; + std::string const label_; + edm::EDGetTokenT> const srcToken_; + double const minValue_; + double const maxValue_; std::unique_ptr gpuAlgo_; }; -TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(const edm::ParameterSet& iConfig) +TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig) : 
label_(iConfig.getParameter("@module_label")), srcToken_(consumes>(iConfig.getParameter("src"))), minValue_(iConfig.getParameter("minValue")), @@ -53,13 +53,13 @@ void TestCUDAAnalyzerGPU::fillDescriptions(edm::ConfigurationDescriptions& descr descriptions.setComment("This EDAnalyzer is part of the TestCUDAProducer* family. It models a GPU analyzer."); } -void TestCUDAAnalyzerGPU::analyze(edm::StreamID, const edm::Event& iEvent, const edm::EventSetup& iSetup) const { +void TestCUDAAnalyzerGPU::analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const { edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::analyze begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - const auto& in = iEvent.get(srcToken_); + auto const& in = iEvent.get(srcToken_); CUDAScopedContextAnalyze ctx{in}; - const CUDAThing& input = ctx.get(in); + CUDAThing const& input = ctx.get(in); gpuAlgo_->analyzeAsync(input.get(), ctx.stream()); edm::LogVerbatim("TestCUDAAnalyzerGPU") diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc index bb19e2a3d7807..c25a44023ebc0 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc @@ -11,20 +11,20 @@ class TestCUDAProducerCPU : public edm::global::EDProducer<> { public: - explicit TestCUDAProducerCPU(const edm::ParameterSet& iConfig); + explicit TestCUDAProducerCPU(edm::ParameterSet const& iConfig); ~TestCUDAProducerCPU() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + void produce(edm::StreamID id, edm::Event& iEvent, edm::EventSetup const& iSetup) const override; private: - std::string label_; + std::string const label_; edm::EDGetTokenT srcToken_; - edm::EDPutTokenT dstToken_; + edm::EDPutTokenT const dstToken_; }; -TestCUDAProducerCPU::TestCUDAProducerCPU(const edm::ParameterSet& iConfig) +TestCUDAProducerCPU::TestCUDAProducerCPU(edm::ParameterSet const& iConfig) : label_{iConfig.getParameter("@module_label")}, dstToken_{produces()} { auto srcTag = iConfig.getParameter("src"); if (!srcTag.label().empty()) { @@ -39,7 +39,7 @@ void TestCUDAProducerCPU::fillDescriptions(edm::ConfigurationDescriptions& descr descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. 
It models a CPU algorithm."); } -void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, edm::EventSetup const& iSetup) const { edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce begin event " << iEvent.id().event() << " stream " << id; @@ -56,7 +56,7 @@ void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const ed << " Task (CPU) for event " << iEvent.id().event() << " in stream " << id << " will take " << dur << " seconds"; std::this_thread::sleep_for(std::chrono::seconds(1) * dur); - const unsigned int output = input + id * 100 + iEvent.id().event(); + unsigned int const output = input + id * 100 + iEvent.id().event(); iEvent.emplace(dstToken_, output); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index e66bd3080c1e7..8f8979a25a273 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -18,16 +18,16 @@ class TestCUDAProducerGPU : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const override; private: - std::string label_; - edm::EDGetTokenT> srcToken_; - edm::EDPutTokenT> dstToken_; - TestCUDAProducerGPUKernel gpuAlgo_; + std::string const label_; + edm::EDGetTokenT> const srcToken_; + edm::EDPutTokenT> const dstToken_; + TestCUDAProducerGPUKernel const gpuAlgo_; }; -TestCUDAProducerGPU::TestCUDAProducerGPU(const edm::ParameterSet& iConfig) +TestCUDAProducerGPU::TestCUDAProducerGPU(edm::ParameterSet const& iConfig) : label_(iConfig.getParameter("@module_label")), srcToken_(consumes>(iConfig.getParameter("src"))), dstToken_(produces>()) {} @@ -41,13 +41,13 @@ void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descr "algorithm in the chain of the GPU EDProducers. 
Produces CUDAProduct."); } -void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const { edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - const auto& in = iEvent.get(srcToken_); + auto const& in = iEvent.get(srcToken_); CUDAScopedContextProduce ctx{in}; - const CUDAThing& input = ctx.get(in); + CUDAThing const& input = ctx.get(in); ctx.emplace(iEvent, dstToken_, CUDAThing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())}); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 74e5af7c46baf..383e15d0a96f3 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -18,27 +18,27 @@ class TestCUDAProducerGPUEW : public edm::stream::EDProducer { public: - explicit TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig); + explicit TestCUDAProducerGPUEW(edm::ParameterSet const& iConfig); ~TestCUDAProducerGPUEW() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; private: - std::string label_; - edm::EDGetTokenT> srcToken_; - edm::EDPutTokenT> dstToken_; + std::string const label_; + edm::EDGetTokenT> const srcToken_; + edm::EDPutTokenT> const dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; CUDAContextState ctxState_; cudautils::device::unique_ptr devicePtr_; cudautils::host::noncached::unique_ptr hostData_; }; -TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig) +TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(edm::ParameterSet const& iConfig) : label_{iConfig.getParameter("@module_label")}, srcToken_{consumes>(iConfig.getParameter("src"))}, dstToken_{produces>()} { @@ -52,17 +52,22 @@ void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& des edm::ParameterSetDescription desc; desc.add("src", edm::InputTag()); descriptions.addWithDefaultLabel(desc); + descriptions.setComment( + "This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first " + "algorithm in the chain of the GPU EDProducers, and that transfers some data from GPU to CPU and thus needs to " + "synchronize GPU and CPU. The synchronization is implemented with the ExternalWork extension. 
Produces " + "CUDAProduct."); } -void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, +void TestCUDAProducerGPUEW::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - const auto& in = iEvent.get(srcToken_); + auto const& in = iEvent.get(srcToken_); CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_}; - const CUDAThing& input = ctx.get(in); + CUDAThing const& input = ctx.get(in); devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); // Mimick the need to transfer some of the GPU data back to CPU to @@ -74,7 +79,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, << iEvent.id().event() << " stream " << iEvent.streamID(); } -void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << *hostData_; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc index 0c8aad0931f15..f3010c94b3d9c 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc @@ -23,29 +23,29 @@ class TestCUDAProducerGPUEWTask : public edm::stream::EDProducer { public: - explicit TestCUDAProducerGPUEWTask(const edm::ParameterSet& iConfig); + explicit TestCUDAProducerGPUEWTask(edm::ParameterSet const& iConfig); ~TestCUDAProducerGPUEWTask() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; private: void addSimpleWork(edm::EventNumber_t eventID, edm::StreamID streamID, CUDAScopedContextTask& ctx); - std::string label_; - edm::EDGetTokenT> srcToken_; - edm::EDPutTokenT> dstToken_; + std::string const label_; + edm::EDGetTokenT> const srcToken_; + edm::EDPutTokenT> const dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; CUDAContextState ctxState_; cudautils::device::unique_ptr devicePtr_; cudautils::host::noncached::unique_ptr hostData_; }; -TestCUDAProducerGPUEWTask::TestCUDAProducerGPUEWTask(const edm::ParameterSet& iConfig) +TestCUDAProducerGPUEWTask::TestCUDAProducerGPUEWTask(edm::ParameterSet const& iConfig) : label_{iConfig.getParameter("@module_label")}, srcToken_{consumes>(iConfig.getParameter("src"))}, dstToken_{produces>()} { @@ -59,18 +59,25 @@ void TestCUDAProducerGPUEWTask::fillDescriptions(edm::ConfigurationDescriptions& edm::ParameterSetDescription desc; desc.add("src", edm::InputTag()); descriptions.addWithDefaultLabel(desc); + descriptions.setComment( + "This EDProducer is part of the TestCUDAProducer* family. 
It models a GPU algorithm this is not the first " + "algorithm in the chain of the GPU EDProducers, and that transfers some data from GPU to CPU multiple times " + "alternating the transfers and kernel executions (e.g. to decide which kernel to run next based on a value from " + "GPU). A synchronization between GPU and CPU is needed after each transfer. The synchronizations are implemented " + "with the ExternalWork extension and explicit TBB tasks within the module. Produces " + "CUDAProduct."); } -void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, +void TestCUDAProducerGPUEWTask::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - const auto& in = iEvent.get(srcToken_); + auto const& in = iEvent.get(srcToken_); CUDAScopedContextAcquire ctx{in, waitingTaskHolder, ctxState_}; - const CUDAThing& input = ctx.get(in); + CUDAThing const& input = ctx.get(in); devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); // Mimick the need to transfer some of the GPU data back to CPU to @@ -109,7 +116,7 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID, } } -void TestCUDAProducerGPUEWTask::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void TestCUDAProducerGPUEWTask::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << *hostData_; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index 12f4f4530e84f..25fad0abe9438 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -13,22 +13,21 @@ class TestCUDAProducerGPUFirst : public edm::global::EDProducer<> { public: - explicit TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig); + explicit TestCUDAProducerGPUFirst(edm::ParameterSet const& iConfig); ~TestCUDAProducerGPUFirst() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void produce(edm::StreamID stream, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + void produce(edm::StreamID stream, edm::Event& iEvent, edm::EventSetup const& iSetup) const override; private: - std::string label_; - TestCUDAProducerGPUKernel gpuAlgo_; + std::string const label_; + edm::EDPutTokenT> const dstToken_; + TestCUDAProducerGPUKernel const gpuAlgo_; }; -TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig) - : label_(iConfig.getParameter("@module_label")) { - produces>(); -} +TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(edm::ParameterSet const& iConfig) + : label_(iConfig.getParameter("@module_label")), dstToken_{produces>()} {} void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -40,14 +39,14 @@ void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEvent, - const edm::EventSetup& iSetup) const { + edm::EventSetup const& iSetup) 
const { edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); CUDAScopedContextProduce ctx{streamID}; cudautils::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); - iEvent.put(ctx.wrap(CUDAThing(std::move(output)))); + ctx.emplace(iEvent, dstToken_, std::move(output)); edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 168ac1daa14b9..1d5456f329e0f 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -15,25 +15,25 @@ class TestCUDAProducerGPUtoCPU : public edm::stream::EDProducer { public: - explicit TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig); + explicit TestCUDAProducerGPUtoCPU(edm::ParameterSet const& iConfig); ~TestCUDAProducerGPUtoCPU() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; private: - std::string label_; - edm::EDGetTokenT> srcToken_; - edm::EDPutTokenT dstToken_; + std::string const label_; + edm::EDGetTokenT> const srcToken_; + edm::EDPutTokenT const dstToken_; cudautils::host::unique_ptr buffer_; }; -TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig) +TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(edm::ParameterSet const& iConfig) : label_{iConfig.getParameter("@module_label")}, srcToken_{consumes>(iConfig.getParameter("src"))}, dstToken_{produces()} {} @@ -47,15 +47,15 @@ void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& "the data to legacy data format. 
Produces int, to be compatible with TestCUDAProducerCPU."); } -void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, +void TestCUDAProducerGPUtoCPU::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - const auto& in = iEvent.get(srcToken_); + auto const& in = iEvent.get(srcToken_); CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder)}; - const CUDAThing& device = ctx.get(in); + CUDAThing const& device = ctx.get(in); buffer_ = cudautils::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue in produce once finished @@ -69,7 +69,7 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, << iEvent.id().event() << " stream " << iEvent.streamID(); } -void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml index 424eb6862be79..23f20762c629c 100644 --- a/HeterogeneousCore/CUDATest/test/BuildFile.xml +++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml @@ -5,8 +5,5 @@ - - - - + diff --git a/HeterogeneousCore/CUDATest/test/TestCUDATest.cc b/HeterogeneousCore/CUDATest/test/TestCUDATest.cc deleted file mode 100644 index b2991bd18ae57..0000000000000 --- a/HeterogeneousCore/CUDATest/test/TestCUDATest.cc +++ /dev/null @@ -1,3 +0,0 @@ -#include "FWCore/Utilities/interface/TestHelper.h" - -RUNTEST() diff --git a/HeterogeneousCore/CUDATest/test/runtests.sh b/HeterogeneousCore/CUDATest/test/runtests.sh index 6817aa8d7ffab..6a9050388ea2e 100755 --- a/HeterogeneousCore/CUDATest/test/runtests.sh +++ b/HeterogeneousCore/CUDATest/test/runtests.sh @@ -2,10 +2,8 @@ function die { echo Failure $1: status $2 ; exit $2 ; } -pushd ${LOCAL_TMP_DIR} +TEST_DIR=src/HeterogeneousCore/CUDATest/test - echo "*************************************************" - echo "CUDA producer configuration with SwitchProducer" - cmsRun ${LOCAL_TEST_DIR}/testCUDASwitch_cfg.py || die "cmsRun testCUDASwitch_cfg.py 1" $? - -popd +echo "*************************************************" +echo "CUDA producer configuration with SwitchProducer" +cmsRun ${TEST_DIR}/testCUDASwitch_cfg.py || die "cmsRun testCUDASwitch_cfg.py 1" $? 
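One item in this patch ("explicit CUDAThing constructor") is easy to miss in the hunks above. The point of marking the converting constructor explicit is to stop a device unique_ptr from silently turning into a CUDAThing at call sites. Below is a minimal standalone illustration, using std::unique_ptr<float[]> as a stand-in for cudautils::device::unique_ptr<float[]> and a hypothetical consume() sink rather than the real CMSSW types:

    #include <memory>
    #include <utility>

    // Stand-in for CUDAThing after this patch: the converting constructor is explicit.
    class Thing {
    public:
      Thing() = default;
      explicit Thing(std::unique_ptr<float[]> ptr) : ptr_(std::move(ptr)) {}
      const float* get() const { return ptr_.get(); }

    private:
      std::unique_ptr<float[]> ptr_;
    };

    void consume(Thing) {}  // hypothetical sink taking the product type by value

    int main() {
      auto buffer = std::make_unique<float[]>(10);
      // consume(std::move(buffer));      // would no longer compile: the conversion must be spelled out
      consume(Thing{std::move(buffer)});  // the wrapping is now visible at the call site
      return 0;
    }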
diff --git a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py index 2e213c8a03ede..805617091686b 100644 --- a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py @@ -12,17 +12,15 @@ process.source = cms.Source("EmptySource") -process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(3) ) +process.maxEvents.input = 3 if not silent: process.maxEvents.input = 10 process.MessageLogger.cerr.threshold = cms.untracked.string("INFO") process.MessageLogger.cerr.INFO.limit = process.MessageLogger.cerr.default.limit -process.options = cms.untracked.PSet( -# numberOfThreads = cms.untracked.uint32(4), - numberOfStreams = cms.untracked.uint32(0) -) +#process.options.numberOfThreads = 4 +process.options.numberOfStreams = 0 #process.Tracer = cms.Service("Tracer") # Flow diagram of the modules From 14a992efe5544810ef394fba4ffc336a2a0a564a Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 13 Dec 2019 18:41:40 +0100 Subject: [PATCH 09/29] Mark thread-safe static variables with CMS_THREAD_SAFE --- .../CUDAUtilities/interface/CUDAEventCache.h | 2 +- .../CUDAUtilities/interface/CUDAStreamCache.h | 2 +- .../CUDAUtilities/src/CUDAEventCache.cc | 4 +++- .../CUDAUtilities/src/CUDAStreamCache.cc | 4 +++- .../CUDAUtilities/src/getCachingDeviceAllocator.h | 14 ++++++++------ .../CUDAUtilities/src/getCachingHostAllocator.h | 14 ++++++++------ 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h b/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h index cc5b73b58f601..53045b59c0a98 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h @@ -24,7 +24,7 @@ namespace cudautils { private: friend class ::CUDAService; - // intended to be called only from CUDAService destructor + // not thread safe, intended to be called only from CUDAService destructor void clear(); class Deleter { diff --git a/HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h b/HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h index c11cf399fb574..032e3f8745b14 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h @@ -24,7 +24,7 @@ namespace cudautils { private: friend class ::CUDAService; - // intended to be called only from CUDAService destructor + // not thread safe, intended to be called only from CUDAService destructor void clear(); class Deleter { diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc index ffc881879466c..119e79dc29149 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc @@ -1,3 +1,4 @@ +#include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" @@ -38,7 +39,8 @@ namespace cudautils { } CUDAEventCache& getCUDAEventCache() { - static CUDAEventCache cache; + // the public interface is thread safe + CMS_THREAD_SAFE static CUDAEventCache cache; return cache; } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc 
b/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc index adf0f6c092f34..a77e490169c4d 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc @@ -1,3 +1,4 @@ +#include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" @@ -36,7 +37,8 @@ namespace cudautils { } CUDAStreamCache& getCUDAStreamCache() { - static CUDAStreamCache cache; + // the public interface is thread safe + CMS_THREAD_SAFE static CUDAStreamCache cache; return cache; } } // namespace cudautils diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h index e545a6a7839a1..a0917a320c28d 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator #include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" #include "CachingDeviceAllocator.h" @@ -64,12 +65,13 @@ namespace cudautils { log << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; }); - static notcub::CachingDeviceAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; + // the public interface is thread safe + CMS_THREAD_SAFE static notcub::CachingDeviceAllocator allocator{binGrowth, + minBin, + maxBin, + minCachedBytes(), + false, // do not skip cleanup + debug}; return allocator; } } // namespace allocator diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h index 865e4c677d547..b9e31a78176f9 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_CUDACore_src_getCachingHostAllocator #include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "CachingHostAllocator.h" @@ -33,12 +34,13 @@ namespace cudautils { log << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; }); - static notcub::CachingHostAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; + // the public interface is thread safe + CMS_THREAD_SAFE static notcub::CachingHostAllocator allocator{binGrowth, + minBin, + maxBin, + minCachedBytes(), + false, // do not skip cleanup + debug}; return allocator; } } // namespace allocator From cc0991b53f506a9c4f48b959de98555187b24f31 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 13 Dec 2019 20:47:46 +0100 Subject: [PATCH 10/29] Move mutability of a member from TestCUDAAnalyzerGPUKernel to TestCUDAAnalyzerGPU The point is that the member functions mutating the visible state should not be const even if they are thread safe. 
TestCUDAAnalyzerGPUKernel::analyzeAsync() mutates the visible state (by "filling a histogram"), so it should not be const. Declaring TestCUDAAnalyzerGPU::gpuAlgo_ as mutable is an improvement since filling the histogram does not really change the visible state of the EDAnalyzer (towards the framework). --- HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc | 3 ++- .../CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu | 2 +- .../CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc index e38c596fbe2f5..8fe5688018728 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc @@ -29,7 +29,8 @@ class TestCUDAAnalyzerGPU : public edm::global::EDAnalyzer<> { edm::EDGetTokenT> const srcToken_; double const minValue_; double const maxValue_; - std::unique_ptr gpuAlgo_; + // the public interface is thread safe + CMS_THREAD_SAFE mutable std::unique_ptr gpuAlgo_; }; TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig) diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu index 4d4cca09e4668..01ded40c6d7ff 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu @@ -31,7 +31,7 @@ TestCUDAAnalyzerGPUKernel::TestCUDAAnalyzerGPUKernel(cudaStream_t stream) { cudaCheck(cudaStreamSynchronize(stream)); } -void TestCUDAAnalyzerGPUKernel::analyzeAsync(const float *d_input, cudaStream_t stream) const { +void TestCUDAAnalyzerGPUKernel::analyzeAsync(const float *d_input, cudaStream_t stream) { analyze<<>>(d_input, sum_.get(), NUM_VALUES); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h index 6854ba8d61af7..612e617c67c8c 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h @@ -12,12 +12,12 @@ class TestCUDAAnalyzerGPUKernel { TestCUDAAnalyzerGPUKernel(cudaStream_t stream); ~TestCUDAAnalyzerGPUKernel() = default; - // returns (owning) pointer to device memory - void analyzeAsync(const float* d_input, cudaStream_t stream) const; + // thread safe + void analyzeAsync(const float* d_input, cudaStream_t stream); float value(cudaStream_t stream) const; private: - mutable cudautils::device::unique_ptr sum_; // all writes are atomic in CUDA + cudautils::device::unique_ptr sum_; // all writes are atomic in CUDA }; #endif From 3cdc5cc8e8c17cc01b3772c2d53620123c6d64d6 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 16 Dec 2019 22:31:58 +0100 Subject: [PATCH 11/29] Guarantee that cache returns only occurred events --- .../CUDAUtilities/interface/CUDAEventCache.h | 9 +++++- .../CUDAUtilities/src/CUDAEventCache.cc | 30 ++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h b/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h index 53045b59c0a98..8e5001b525351 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h @@ -18,12 +18,19 @@ namespace cudautils { CUDAEventCache(); // Gets a (cached) CUDA event for 
the current device. The event - // will be returned to the cache by the shared_ptr destructor. + // will be returned to the cache by the shared_ptr destructor. The + // returned event is guaranteed to be "occurred", i.e. + // cudaEventQuery() == cudaSuccess. + // // This function is thread safe SharedEventPtr getCUDAEvent(); private: friend class ::CUDAService; + + // thread safe + SharedEventPtr makeOrGet(int dev); + // not thread safe, intended to be called only from CUDAService destructor void clear(); diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc index 119e79dc29149..bf79d0bb54568 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc @@ -19,8 +19,36 @@ namespace cudautils { SharedEventPtr CUDAEventCache::getCUDAEvent() { const auto dev = cudautils::currentDevice(); + auto event = makeOrGet(dev); + auto ret = cudaEventQuery(event.get()); + // event is occurred, return immediately + if (ret == cudaSuccess) { + return event; + } + // return code is something else than "recorded", throw exception + if (ret != cudaErrorNotReady) { + cudaCheck(ret); + } + + // Got recorded, but not yet occurred event. Try until we get an + // occurred event. Need to keep all recorded events until an + // occurred event is found in order to avoid ping-pong with a + // recorded event. + std::vector ptrs{std::move(event)}; + do { + event = makeOrGet(dev); + ret = cudaEventQuery(event.get()); + if (ret == cudaErrorNotReady) { + ptrs.emplace_back(std::move(event)); + } else if (ret != cudaSuccess) { + cudaCheck(ret); + } + } while (ret != cudaSuccess); + return event; + } + + SharedEventPtr CUDAEventCache::makeOrGet(int dev) { return cache_[dev].makeOrGet([dev]() { - // TODO(?): We should not return a recorded, but not-yet-occurred event cudaEvent_t event; // it should be a bit faster to ignore timings cudaCheck(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); From e3b3cfb0f8d410f322f160bfee65d4febe098bb8 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 16 Dec 2019 23:23:41 +0100 Subject: [PATCH 12/29] Always record and query the CUDA event, to minimize need for error checking in CUDAScopedContextProduce destructor --- .../Common/interface/CUDAProduct.h | 8 +++--- .../Common/interface/CUDAProductBase.h | 9 +++---- CUDADataFormats/Common/src/CUDAProductBase.cc | 18 ++++++------- .../CUDACore/interface/CUDAScopedContext.h | 21 +++++----------- .../CUDACore/src/CUDAScopedContext.cc | 25 +++---------------- 5 files changed, 27 insertions(+), 54 deletions(-) diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h index 75c9c80e7f206..9862d9c79bcd0 100644 --- a/CUDADataFormats/Common/interface/CUDAProduct.h +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -42,12 +42,12 @@ class CUDAProduct : public CUDAProductBase { friend class CUDAScopedContextProduce; friend class edm::Wrapper>; - explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data) - : CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {} + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, T data) + : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} template - explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... 
args) - : CUDAProductBase(device, std::move(stream)), data_(std::forward(args)...) {} + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, Args&&... args) + : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! }; diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index 219b7e619de7f..6887c50f7751a 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -50,18 +50,17 @@ class CUDAProductBase { // mutable access is needed even if the CUDAScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - cudaEvent_t event() const { return event_ ? event_.get() : nullptr; } + cudaEvent_t event() const { return event_.get(); } protected: - explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream) - : stream_{std::move(stream)}, device_{device} {} + explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) + : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: friend class impl::CUDAScopedContextBase; friend class CUDAScopedContextProduce; - // The following functions are intended to be used only from CUDAScopedContext - void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); } + // The following function is intended to be used only from CUDAScopedContext const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } bool mayReuseStream() const { diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc index 72302d3165676..eda6ee99d13f5 100644 --- a/CUDADataFormats/Common/src/CUDAProductBase.cc +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -2,10 +2,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" bool CUDAProductBase::isAvailable() const { - // In absence of event, the product was available already at the end - // of produce() of the producer. + // if default-constructed, the product is not available if (not event_) { - return true; + return false; } return cudautils::eventIsOccurred(event_.get()); } @@ -15,13 +14,14 @@ CUDAProductBase::~CUDAProductBase() { // complete before destructing the product. This is to make sure // that the EDM stream does not move to the next event before all // asynchronous processing of the current is complete. + + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. if (event_) { - // TODO: a callback notifying a WaitingTaskHolder (or similar) - // would avoid blocking the CPU, but would also require more work. - // - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. 
cudaEventSynchronize(event_.get()); } } diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 70539006d2563..f5dc53b785a05 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -10,6 +10,7 @@ #include "FWCore/Utilities/interface/EDPutToken.h" #include "FWCore/Utilities/interface/StreamID.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" #include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" @@ -154,27 +155,18 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { explicit CUDAScopedContextProduce(CUDAContextState& state) : CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + /// Record the CUDA event, all asynchronous work must have been queued before the destructor ~CUDAScopedContextProduce(); template std::unique_ptr> wrap(T data) { // make_unique doesn't work because of private constructor - // - // CUDAProduct constructor records CUDA event to the CUDA - // stream. The event will become "occurred" after all work queued - // to the stream before this point has been finished. - std::unique_ptr> ret(new CUDAProduct(device(), streamPtr(), std::move(data))); - createEventIfStreamBusy(); - ret->setEvent(event_); - return ret; + return std::unique_ptr>(new CUDAProduct(device(), streamPtr(), event_, std::move(data))); } template auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - auto ret = iEvent.emplace(token, device(), streamPtr(), std::forward(args)...); - createEventIfStreamBusy(); - const_cast(*ret).setEvent(event_); - return ret; + return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); } private: @@ -184,9 +176,8 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} - void createEventIfStreamBusy(); - - cudautils::SharedEventPtr event_; + // create the CUDA Event upfront to catch possible errors from its creation + cudautils::SharedEventPtr event_ = cudautils::getCUDAEventCache().getCUDAEvent(); }; /** diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index df56c318e22fa..2d2a155a5bc11 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -3,7 +3,6 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/Exception.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -107,26 +106,10 @@ void CUDAScopedContextAcquire::throwNoState() { //////////////////// CUDAScopedContextProduce::~CUDAScopedContextProduce() { - if (event_) { - cudaCheck(cudaEventRecord(event_.get(), stream())); - } -} - -void CUDAScopedContextProduce::createEventIfStreamBusy() { - if (event_) { - return; - } - auto ret = cudaStreamQuery(stream()); - if (ret == cudaSuccess) { - 
return; - } - if (ret != cudaErrorNotReady) { - // cudaErrorNotReady indicates that the stream is busy, and thus - // is not an error - cudaCheck(ret); - } - - event_ = cudautils::getCUDAEventCache().getCUDAEvent(); + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + cudaEventRecord(event_.get(), stream()); } //////////////////// From 47bdfdcc0354b31f7cefd0c7636d815e83bda108 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 6 Jan 2020 23:04:44 +0100 Subject: [PATCH 13/29] Add comment motivating cudautils::MessageLogger --- HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h b/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h index 5299181929fd5..ba098f687f846 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h +++ b/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h @@ -6,6 +6,12 @@ namespace cudautils { + /** + * This class is a temporary measure to hide C++17 constructs in + * MessageLogger from .cu files (those are mainly files that launch + * kernels). It will be removed once we are able to compile .cu + * files with a C++17-capable compiler. + */ class MessageLogger { public: MessageLogger(std::string const& category) : category_(category) {} From 541c91616815961d02a83c0685df3f2784700be1 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 6 Jan 2020 23:55:52 +0100 Subject: [PATCH 14/29] Use hasCUDADevices() for host_noncached_unique_ptr_t as well --- .../CUDAUtilities/test/host_noncached_unique_ptr_t.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp index 12f7bb239023b..a3b8cf63949c4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp @@ -4,7 +4,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") { - requireCUDADevices(); + if (not hasCUDADevices()) { + return; + } SECTION("Single element") { auto ptr1 = cudautils::make_host_noncached_unique(); From a9b026c12f005233d4a8bf5a3aa69f6ebc47f8db Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 18:49:45 +0100 Subject: [PATCH 15/29] Test reset of multiple elements --- HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp | 4 ++++ HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp index b3decf337cfa0..e08876ff8614c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp @@ -29,6 +29,10 @@ TEST_CASE("device_unique_ptr", "[cudaMemTools]") { SECTION("Multiple elements") { auto ptr = cudautils::make_device_unique(10, stream); REQUIRE(ptr != nullptr); + cudaCheck(cudaStreamSynchronize(stream)); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); } SECTION("Allocating too much") { diff --git a/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp index
2ba9fd5aefc1c..477ba35f88bac 100644 --- a/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp @@ -20,6 +20,7 @@ TEST_CASE("host_unique_ptr", "[cudaMemTools]") { SECTION("Reset") { auto ptr = cudautils::make_host_unique(stream); REQUIRE(ptr != nullptr); + cudaCheck(cudaStreamSynchronize(stream)); ptr.reset(); REQUIRE(ptr.get() == nullptr); @@ -28,6 +29,10 @@ TEST_CASE("host_unique_ptr", "[cudaMemTools]") { SECTION("Multiple elements") { auto ptr = cudautils::make_host_unique(10, stream); REQUIRE(ptr != nullptr); + cudaCheck(cudaStreamSynchronize(stream)); + + ptr.reset(); + REQUIRE(ptr.get() == nullptr); } SECTION("Allocating too much") { From 072a823b3d75320e972f9b65e6ceab4bb0ef33e2 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 6 Jan 2020 23:48:01 +0100 Subject: [PATCH 16/29] Rename {hasCUDA,requireCUDA}Devices to cms::cudatest::{test,require}Devices Also add comment that requireDevices() is meant for unit tests only --- .../Common/test/test_CUDAProduct.cc | 4 +-- .../CUDACore/test/testStreamEvent.cu | 4 +-- .../CUDACore/test/test_CUDAScopedContext.cc | 4 +-- .../test/test_TestCUDAProducerGPUFirst.cc | 6 ++-- .../interface/requireCUDADevices.h | 8 ----- .../CUDAUtilities/interface/requireDevices.h | 17 +++++++++++ .../CUDAUtilities/src/requireCUDADevices.cc | 28 ----------------- .../CUDAUtilities/src/requireDevices.cc | 30 +++++++++++++++++++ .../CUDAUtilities/test/assert_t.cu | 4 +-- .../CUDAUtilities/test/copyAsync_t.cpp | 4 +-- .../test/device_unique_ptr_t.cpp | 4 +-- .../test/host_noncached_unique_ptr_t.cpp | 4 +-- .../CUDAUtilities/test/host_unique_ptr_t.cpp | 4 +-- .../CUDAUtilities/test/memsetAsync_t.cpp | 4 +-- 14 files changed, 68 insertions(+), 57 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/requireDevices.h delete mode 100644 HeterogeneousCore/CUDAUtilities/src/requireCUDADevices.cc create mode 100644 HeterogeneousCore/CUDAUtilities/src/requireDevices.cc diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc index 3eb3115571813..8e334d336db88 100644 --- a/CUDADataFormats/Common/test/test_CUDAProduct.cc +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -3,7 +3,7 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" @@ -30,7 +30,7 @@ TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { auto bar = std::move(foo); } - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu index f819a78f698e4..deeb444dc255b 100644 --- a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -13,7 +13,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" 
namespace { constexpr int ARRAY_SIZE = 20000000; @@ -31,7 +31,7 @@ __global__ void kernel_looping(float *point, unsigned int num) { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); constexpr bool debug = false; diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 219e4dfb20103..2a7d066e1d5e3 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -8,7 +8,7 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" @@ -39,7 +39,7 @@ namespace { } // namespace TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index e52b8e82ca9da..a8d2e6ba21564 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -5,7 +5,7 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include @@ -26,7 +26,7 @@ process.moduleToTest(process.toTest) SECTION("No event data") { // Calls produce(), so don't call without a GPU - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } edm::test::TestProcessor tester(config); @@ -63,7 +63,7 @@ process.moduleToTest(process.toTest) )_"}; edm::test::TestProcessor::Config config{baseConfig}; - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h b/HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h deleted file mode 100644 index adb919015d79c..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_requireCUDADevices_h -#define HeterogeneousCore_CUDAUtilities_requireCUDADevices_h - -bool hasCUDADevices(); - -void requireCUDADevices(); - -#endif // HeterogeneousCore_CUDAUtilities_requireCUDADevices_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/requireDevices.h b/HeterogeneousCore/CUDAUtilities/interface/requireDevices.h new file mode 100644 index 0000000000000..0795175b31048 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/requireDevices.h @@ -0,0 +1,17 @@ +#ifndef HeterogeneousCore_CUDAUtilities_requireDevices_h +#define HeterogeneousCore_CUDAUtilities_requireDevices_h + +/** + * These functions are meant to be called only from unit tests. 
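+ * + * A minimal usage sketch (mirroring how the unit tests updated in this patch call these helpers): a standalone GPU test calls requireDevices() at the top of main(), while a Catch2 test can call testDevices() and return early to skip its GPU sections: + * + * int main() { + * cms::cudatest::requireDevices(); // exits with EXIT_SUCCESS when no CUDA device is available + * // ... CUDA-dependent test code ... + * return 0; + * }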
+ */ +namespace cms { + namespace cudatest { + /// In presence of CUDA devices, return true; otherwise print message and return false + bool testDevices(); + + /// Print message and exit if there are no CUDA devices + void requireDevices(); + } // namespace cudatest +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_requireDevices_h diff --git a/HeterogeneousCore/CUDAUtilities/src/requireCUDADevices.cc b/HeterogeneousCore/CUDAUtilities/src/requireCUDADevices.cc deleted file mode 100644 index a2e9949003a65..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/src/requireCUDADevices.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include -#include - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" - -bool hasCUDADevices() { - int devices = 0; - auto status = cudaGetDeviceCount(&devices); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - return false; - } - if (devices == 0) { - std::cerr << "No CUDA devices available, the test will be skipped." - << "\n"; - return false; - } - return true; -} - -void requireCUDADevices() { - if (not hasCUDADevices()) { - exit(EXIT_SUCCESS); - } -} diff --git a/HeterogeneousCore/CUDAUtilities/src/requireDevices.cc b/HeterogeneousCore/CUDAUtilities/src/requireDevices.cc new file mode 100644 index 0000000000000..8140578250d2c --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/src/requireDevices.cc @@ -0,0 +1,30 @@ +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +namespace cms::cudatest { + bool testDevices() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess) { + std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." + << "\n"; + return false; + } + if (devices == 0) { + std::cerr << "No CUDA devices available, the test will be skipped." 
+ << "\n"; + return false; + } + return true; + } + + void requireDevices() { + if (not testDevices()) { + exit(EXIT_SUCCESS); + } + } +} // namespace cms::cudatest diff --git a/HeterogeneousCore/CUDAUtilities/test/assert_t.cu b/HeterogeneousCore/CUDAUtilities/test/assert_t.cu index c7f6ca5faf9da..324c9aba46f98 100644 --- a/HeterogeneousCore/CUDAUtilities/test/assert_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/assert_t.cu @@ -1,10 +1,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void testIt(int one) { assert(one == 1); } int main(int argc, char* argv[]) { - requireCUDADevices(); + cms::cudatest::requireDevices(); testIt<<<1, 1>>>(argc); cudaDeviceSynchronize(); diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index 0dc6e5d4528f1..3dbf853ce43a6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -4,10 +4,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" TEST_CASE("copyAsync", "[cudaMemTools]") { - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } diff --git a/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp index e08876ff8614c..27c5bd3b23f3d 100644 --- a/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp @@ -2,10 +2,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" TEST_CASE("device_unique_ptr", "[cudaMemTools]") { - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } diff --git a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp index a3b8cf63949c4..5111936a07c90 100644 --- a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp @@ -1,10 +1,10 @@ #include "catch.hpp" #include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") { - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } diff --git a/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp index 477ba35f88bac..230ad48945d69 100644 --- a/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp @@ -2,10 +2,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" TEST_CASE("host_unique_ptr", "[cudaMemTools]") { - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } diff --git a/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp index df4fbf52adb3d..ec30f4badea3e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp @@ -5,10 +5,10 @@ #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" TEST_CASE("memsetAsync", "[cudaMemTools]") { - if (not hasCUDADevices()) { + if (not cms::cudatest::testDevices()) { return; } From 8c47b5d27ae323d9499c50d54cd6a6f15bdd4cba Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 8 Jan 2020 15:22:09 +0100 Subject: [PATCH 17/29] Propagate {hasCUDA,requireCUDA}Devices -> cms::cudatest::{test,require}Devices rename --- CUDADataFormats/Track/test/TrajectoryStateSOA_t.h | 4 ++-- .../TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp | 4 ++-- DataFormats/CaloRecHit/test/test_calo_rechit.cu | 4 ++-- DataFormats/DetId/test/test_detid.cu | 4 ++-- DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp | 4 ++-- DataFormats/HcalDetId/test/test_hcal_detid.cu | 4 ++-- DataFormats/HcalDigi/test/test_hcal_digi.cu | 4 ++-- DataFormats/HcalRecHit/test/test_hcal_reco.cu | 4 ++-- DataFormats/Math/test/CholeskyInvert_t.cu | 4 ++-- DataFormats/Math/test/cudaAtan2Test.cu | 4 ++-- DataFormats/Math/test/cudaMathTest.cu | 4 ++-- HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp | 4 ++-- HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu | 4 ++-- HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu | 4 ++-- HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h | 4 ++-- HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu | 4 ++-- HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h | 4 ++-- HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu | 4 ++-- HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu | 4 ++-- HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu | 4 ++-- RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h | 4 ++-- RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu | 4 ++-- .../PixelTrackFitting/test/testEigenGPUNoFit.cu | 4 ++-- RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h | 4 ++-- 24 files changed, 48 insertions(+), 48 deletions(-) diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h index 1fbe6a73da910..c8e92aca2628f 100644 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -51,13 +51,13 @@ __global__ void testTSSoA(TS* pts, int n) { } #ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #endif int main() { #ifdef __CUDACC__ - requireCUDADevices(); + cms::cudatest::requireDevices(); #endif TS ts; diff --git 
a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp index 592f0267c2f7d..32af6c181ae68 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp @@ -1,6 +1,6 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" namespace testTrackingRecHit2D { @@ -10,7 +10,7 @@ namespace testTrackingRecHit2D { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); cudaStream_t stream; cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); diff --git a/DataFormats/CaloRecHit/test/test_calo_rechit.cu b/DataFormats/CaloRecHit/test/test_calo_rechit.cu index a22fb77dc7d06..21b53aeeca94f 100644 --- a/DataFormats/CaloRecHit/test/test_calo_rechit.cu +++ b/DataFormats/CaloRecHit/test/test_calo_rechit.cu @@ -5,7 +5,7 @@ #include #include "DataFormats/CaloRecHit/interface/CaloRecHit.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void kernel_test_calo_rechit(CaloRecHit* other) { CaloRecHit rh{DetId(0), 10, 1, 0, 0}; @@ -43,7 +43,7 @@ void test_calo_rechit() { } int main(int argc, char** argv) { - requireCUDADevices(); + cms::cudatest::requireDevices(); test_calo_rechit(); diff --git a/DataFormats/DetId/test/test_detid.cu b/DataFormats/DetId/test/test_detid.cu index a7c44ba1372a1..ed3960b652fc2 100644 --- a/DataFormats/DetId/test/test_detid.cu +++ b/DataFormats/DetId/test/test_detid.cu @@ -6,7 +6,7 @@ #include "DataFormats/DetId/interface/DetId.h" #include "DataFormats/HcalDetId/interface/HcalDetId.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void test_gen_detid(DetId* id, uint32_t const rawid) { DetId did{rawid}; @@ -29,7 +29,7 @@ void test_detid() { } int main(int argc, char** argv) { - requireCUDADevices(); + cms::cudatest::requireDevices(); // test det id functionality test_detid(); diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp index 0282b4998f9e1..e0d305964cc65 100644 --- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -13,7 +13,7 @@ #include "DataFormats/GeometrySurface/interface/GloballyPositioned.h" #include "DataFormats/GeometrySurface/interface/SOARotation.h" #include "DataFormats/GeometrySurface/interface/TkRotation.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" void toGlobalWrapper(SOAFrame const *frame, float const *xl, @@ -26,7 +26,7 @@ void toGlobalWrapper(SOAFrame const *frame, uint32_t n); int main(void) { - requireCUDADevices(); + cms::cudatest::requireDevices(); typedef float T; typedef TkRotation Rotation; diff --git a/DataFormats/HcalDetId/test/test_hcal_detid.cu b/DataFormats/HcalDetId/test/test_hcal_detid.cu index 1b859e5497514..a10b19adbe0df 100644 --- a/DataFormats/HcalDetId/test/test_hcal_detid.cu +++ 
b/DataFormats/HcalDetId/test/test_hcal_detid.cu @@ -6,7 +6,7 @@ #include "DataFormats/DetId/interface/DetId.h" #include "DataFormats/HcalDetId/interface/HcalDetId.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void test_gen_detid(DetId *id) { DetId did; @@ -65,7 +65,7 @@ void test_hcal_detid() { } int main(int argc, char **argv) { - requireCUDADevices(); + cms::cudatest::requireDevices(); // test det id functionality test_detid(); diff --git a/DataFormats/HcalDigi/test/test_hcal_digi.cu b/DataFormats/HcalDigi/test/test_hcal_digi.cu index d0dd3fc874ba3..907167ac6ad10 100644 --- a/DataFormats/HcalDigi/test/test_hcal_digi.cu +++ b/DataFormats/HcalDigi/test/test_hcal_digi.cu @@ -11,7 +11,7 @@ #include "DataFormats/HcalDigi/interface/HcalDigiCollections.h" #include "DataFormats/HcalDigi/interface/QIE10DataFrame.h" #include "DataFormats/HcalDigi/interface/QIE11DataFrame.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void kernel_test_hcal_qiesample(HcalQIESample *sample, uint16_t value) { printf("kernel: testing hcal qie sampel\n"); @@ -163,7 +163,7 @@ void test_hcal_qie8_hbhedf() { } int main(int argc, char **argv) { - requireCUDADevices(); + cms::cudatest::requireDevices(); // qie8 test_hcal_qiesample(); diff --git a/DataFormats/HcalRecHit/test/test_hcal_reco.cu b/DataFormats/HcalRecHit/test/test_hcal_reco.cu index 70d5a4b0c3501..5f5d39fc562a5 100644 --- a/DataFormats/HcalRecHit/test/test_hcal_reco.cu +++ b/DataFormats/HcalRecHit/test/test_hcal_reco.cu @@ -10,7 +10,7 @@ #include "DataFormats/HcalRecHit/interface/HORecHit.h" #include "DataFormats/HcalRecHit/interface/HFQIE10Info.h" #include "DataFormats/HcalRecHit/interface/HBHEChannelInfo.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" template __global__ void kernel_test_hcal_rechits(T *other) { @@ -110,7 +110,7 @@ void test_hcal_hbhechinfo() { } int main(int argc, char **argv) { - requireCUDADevices(); + cms::cudatest::requireDevices(); test_hcal_rechits(); test_hcal_rechits(); diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index f493a7602307a..73bc3de897c8d 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -16,7 +16,7 @@ #include "DataFormats/Math/interface/choleskyInversion.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" @@ -197,7 +197,7 @@ void go(bool soa) { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); go<2>(false); go<4>(false); diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index e678c9208a9ae..70a818021ed53 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -30,7 +30,7 @@ end #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" constexpr float xmin = -100.001; // avoid 0 @@ -96,7 +96,7 @@ void go() { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); try { go<3>(); diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index d557456e4726c..f19be00100c7f 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -40,7 +40,7 @@ end #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" std::mt19937 eng; @@ -181,7 +181,7 @@ void go() { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); try { go(); diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index d4cf710e0b9af..cc5541f58ad60 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -5,7 +5,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" template void go() { @@ -136,7 +136,7 @@ void go() { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); go(); go(); diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 6a6eb1d63adab..8dc1abbe51cf3 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -7,7 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" template @@ -152,7 +152,7 @@ void go() { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); go(); go(); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index f57a5275d1500..3343370ef45fb 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -7,7 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" @@ -137,7 +137,7 @@ void go() { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); go(); go(); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 25dcc01b6244c..2cdafd0a876cb 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -9,7 +9,7 @@ #ifdef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #endif @@ -99,7 +99,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co int main() { #ifdef __CUDACC__ - requireCUDADevices(); + cms::cudatest::requireDevices(); auto current_device = cudautils::currentDevice(); #else // make sure cuda emulation is working diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu index 06f255c0409d7..7fcd26f78194d 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu @@ -2,7 +2,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" @@ -33,7 +33,7 @@ void wrapper() { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); wrapper(); } diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h index ae2975f3df469..e9fef92e68083 100644 --- a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h @@ -59,13 +59,13 @@ __global__ void testBasicSoA(float* p) { #include #ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #endif int main() { #ifdef __CUDACC__ - requireCUDADevices(); + cms::cudatest::requireDevices(); #endif float p[1024]; diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index a4db0727a9802..99936a1f291da 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -4,7 +4,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" template __global__ void testPrefixScan(uint32_t size) { @@ -72,7 +72,7 @@ __global__ void verify(uint32_t const *v, uint32_t n) { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); std::cout << "warp level" << std::endl; // std::cout << "warp 32" << std::endl; diff --git 
a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index f031bce33a9b8..ff808c5dfb48f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -10,7 +10,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" @@ -169,7 +169,7 @@ void go(bool useShared) { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); bool useShared = false; diff --git a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu index e6e1a94a1377e..2811e3b34598e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu +++ b/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu @@ -8,7 +8,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void vector_pushback(GPU::SimpleVector *foo) { auto index = threadIdx.x + blockIdx.x * blockDim.x; @@ -23,7 +23,7 @@ __global__ void vector_emplace_back(GPU::SimpleVector *foo) { } int main() { - requireCUDADevices(); + cms::cudatest::requireDevices(); auto maxN = 10000; GPU::SimpleVector *obj_ptr = nullptr; diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index b22e7a35a6ac7..345ae820f59e1 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -13,7 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" #endif @@ -23,7 +23,7 @@ int main(void) { #ifdef __CUDACC__ - requireCUDADevices(); + cms::cudatest::requireDevices(); if (cudautils::cudaDeviceCount() == 0) { std::cerr << "No CUDA devices on this system" diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu index e1606ab54c9c6..f0b641361aee4 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu @@ -4,7 +4,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #ifdef USE_BL #include "RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h" @@ -329,7 +329,7 @@ void testFit() { } int main(int argc, char* argv[]) { - requireCUDADevices(); + cms::cudatest::requireDevices(); 
testFit<4>(); testFit<3>(); diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index 7ef3f572603b0..6ac1088943305 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -4,7 +4,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "test_common.h" using namespace Eigen; @@ -215,7 +215,7 @@ void testEigenvalues() { } int main(int argc, char *argv[]) { - requireCUDADevices(); + cms::cudatest::requireDevices(); testEigenvalues(); testInverse3x3(); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5261069a6b283..de3a9a2316238 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -5,7 +5,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracksDBSCAN.h" @@ -114,7 +114,7 @@ __global__ void print(ZVertices const* pdata, WorkSpace const* pws) { int main() { #ifdef __CUDACC__ - requireCUDADevices(); + cms::cudatest::requireDevices(); auto onGPU_d = cudautils::make_device_unique(1, nullptr); auto ws_d = cudautils::make_device_unique(1, nullptr); From 83a3ac822cbf5fb7ee77cb6c43b56293a8a1f7ac Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 20:32:44 +0100 Subject: [PATCH 18/29] Added comments to highlight the pieces in CachingDeviceAllocator that have been changed wrt. cub --- HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h index 6e07fb6c4a8ed..075d568f21039 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/CachingDeviceAllocator.h @@ -378,6 +378,7 @@ namespace notcub { cached_bytes[device].live += search_key.bytes; if (debug) + // CMS: improved debug message _CubLog( "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously " "associated with stream %lld, event %lld).\n", @@ -483,6 +484,7 @@ namespace notcub { mutex.Unlock(); if (debug) + // CMS: improved debug message _CubLog( "\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n", device, @@ -564,6 +566,7 @@ namespace notcub { cached_bytes[device].free += search_key.bytes; if (debug) + // CMS: improved debug message _CubLog( "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks outstanding. 
(%lld bytes)\n", @@ -604,6 +607,7 @@ namespace notcub { return error; if (debug) + // CMS: improved debug message _CubLog( "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", From 4816af03cbf68e866d2cf203f97ee14bb982c385 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 20:45:12 +0100 Subject: [PATCH 19/29] Add a comment motivating assert_t.cu --- HeterogeneousCore/CUDAUtilities/test/assert_t.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/test/assert_t.cu b/HeterogeneousCore/CUDAUtilities/test/assert_t.cu index 324c9aba46f98..2e74f4f69bec2 100644 --- a/HeterogeneousCore/CUDAUtilities/test/assert_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/assert_t.cu @@ -1,6 +1,12 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +/** + * This file tests that the assert() and #include ".../cuda_assert.h" + * compiles and runs when compiled with and without -DGPU_DEBUG (see + * also BuildFile.xml). + */ + __global__ void testIt(int one) { assert(one == 1); } int main(int argc, char* argv[]) { From 36ae7e89f8065621e501d3b5303eb1ef123756bc Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 20:48:43 +0100 Subject: [PATCH 20/29] Rename cudautils::cudaDeviceCount() to cudautils::deviceCount() --- HeterogeneousCore/CUDACore/interface/CUDAESProduct.h | 5 ++--- .../interface/{cudaDeviceCount.h => deviceCount.h} | 6 +++--- HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc | 6 +++--- HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc | 6 +++--- .../CUDAUtilities/src/getCachingDeviceAllocator.h | 4 ++-- 5 files changed, 13 insertions(+), 14 deletions(-) rename HeterogeneousCore/CUDAUtilities/interface/{cudaDeviceCount.h => deviceCount.h} (61%) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h index b8b230e510fa3..a6c54e0a2c8ce 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h @@ -6,18 +6,17 @@ #include #include -#include "FWCore/Concurrency/interface/hardware_pause.h" #include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" template class CUDAESProduct { public: - CUDAESProduct() : gpuDataPerDevice_(cudautils::cudaDeviceCount()) { + CUDAESProduct() : gpuDataPerDevice_(cudautils::deviceCount()) { for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { gpuDataPerDevice_[i].m_event = cudautils::getCUDAEventCache().getCUDAEvent(); } diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h b/HeterogeneousCore/CUDAUtilities/interface/deviceCount.h similarity index 61% rename from HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h rename to HeterogeneousCore/CUDAUtilities/interface/deviceCount.h index 37be02714747b..fb6b741225f8f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h +++ 
b/HeterogeneousCore/CUDAUtilities/interface/deviceCount.h @@ -1,12 +1,12 @@ -#ifndef HeterogenousCore_CUDAUtilities_cudaDeviceCount_h -#define HeterogenousCore_CUDAUtilities_cudaDeviceCount_h +#ifndef HeterogenousCore_CUDAUtilities_deviceCount_h +#define HeterogenousCore_CUDAUtilities_deviceCount_h #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include namespace cudautils { - inline int cudaDeviceCount() { + inline int deviceCount() { int ndevices; cudaCheck(cudaGetDeviceCount(&ndevices)); return ndevices; diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc index bf79d0bb54568..6c46f9456cafe 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc @@ -2,8 +2,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" namespace cudautils { void CUDAEventCache::Deleter::operator()(cudaEvent_t event) const { @@ -15,7 +15,7 @@ namespace cudautils { // CUDAEventCache should be constructed by the first call to // getCUDAEventCache() only if we have CUDA devices present - CUDAEventCache::CUDAEventCache() : cache_(cudautils::cudaDeviceCount()) {} + CUDAEventCache::CUDAEventCache() : cache_(cudautils::deviceCount()) {} SharedEventPtr CUDAEventCache::getCUDAEvent() { const auto dev = cudautils::currentDevice(); @@ -63,7 +63,7 @@ namespace cudautils { // CUDAEventCache lives through multiple tests (and go through // multiple shutdowns of the framework). cache_.clear(); - cache_.resize(cudautils::cudaDeviceCount()); + cache_.resize(cudautils::deviceCount()); } CUDAEventCache& getCUDAEventCache() { diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc b/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc index a77e490169c4d..a4455e5733481 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc @@ -2,8 +2,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" namespace cudautils { void CUDAStreamCache::Deleter::operator()(cudaStream_t stream) const { @@ -15,7 +15,7 @@ namespace cudautils { // CUDAStreamCache should be constructed by the first call to // getCUDAStreamCache() only if we have CUDA devices present - CUDAStreamCache::CUDAStreamCache() : cache_(cudautils::cudaDeviceCount()) {} + CUDAStreamCache::CUDAStreamCache() : cache_(cudautils::deviceCount()) {} SharedStreamPtr CUDAStreamCache::getCUDAStream() { const auto dev = cudautils::currentDevice(); @@ -33,7 +33,7 @@ namespace cudautils { // CUDAStreamCache lives through multiple tests (and go through // multiple shutdowns of the framework). 
cache_.clear(); - cache_.resize(cudautils::cudaDeviceCount()); + cache_.resize(cudautils::deviceCount()); } CUDAStreamCache& getCUDAStreamCache() { diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h index a0917a320c28d..ad329b1168ec7 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h @@ -4,7 +4,7 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "CachingDeviceAllocator.h" #include @@ -29,7 +29,7 @@ namespace cudautils { size_t ret = std::numeric_limits::max(); int currentDevice; cudaCheck(cudaGetDevice(¤tDevice)); - const int numberOfDevices = cudaDeviceCount(); + const int numberOfDevices = deviceCount(); for (int i = 0; i < numberOfDevices; ++i) { size_t freeMemory, totalMemory; cudaCheck(cudaSetDevice(i)); From 0080fc0f7adc5f8a0d741ba30f196be01192dc17 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 8 Jan 2020 16:35:27 +0100 Subject: [PATCH 21/29] Remove redundant calls to cudaDeviceCount() --- DataFormats/Math/test/CholeskyInvert_t.cu | 7 ------- .../CUDAUtilities/test/HistoContainer_t.cu | 7 ------- .../CUDAUtilities/test/OneHistoContainer_t.cu | 7 ------- .../CUDAUtilities/test/cudastdAlgorithm_t.cu | 11 +---------- HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu | 7 ------- .../SiPixelClusterizer/test/gpuClustering_t.h | 7 ------- 6 files changed, 1 insertion(+), 45 deletions(-) diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 73bc3de897c8d..ae7116ddf09ce 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -18,7 +18,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" constexpr int stride() { return 5 * 1024; } template @@ -93,12 +92,6 @@ void go(bool soa) { auto delta1 = delta; auto delta2 = delta; - if (cudautils::cudaDeviceCount() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - constexpr unsigned int SIZE = 4 * 1024; MX mm[stride()]; // just storage in case of SOA diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 8dc1abbe51cf3..772c0b64bd892 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -8,16 +8,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" template void go() { - if (cudautils::cudaDeviceCount() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - std::mt19937 eng; std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); diff --git 
a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 3343370ef45fb..940de878709d1 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -9,7 +9,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" template __global__ void mykernel(T const* __restrict__ v, uint32_t N) { @@ -93,12 +92,6 @@ __global__ void mykernel(T const* __restrict__ v, uint32_t N) { template void go() { - if (cudautils::cudaDeviceCount() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - std::mt19937 eng; int rmin = std::numeric_limits::min(); diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu index 7fcd26f78194d..834ac9446a4c5 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu @@ -4,7 +4,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" __global__ void testBinaryFind() { int data[] = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; @@ -22,15 +21,7 @@ __global__ void testBinaryFind() { assert(data2 + 6 == cuda_std::binary_find(data2, data2 + 6, 5)); } -void wrapper() { - if (cudautils::cudaDeviceCount() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - - cudautils::launch(testBinaryFind, {32, 64}); -} +void wrapper() { cudautils::launch(testBinaryFind, {32, 64}); } int main() { cms::cudatest::requireDevices(); diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index ff808c5dfb48f..2b5b439c85598 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -13,7 +13,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" template struct RS { @@ -40,12 +39,6 @@ void go(bool useShared) { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - if (cudautils::cudaDeviceCount() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } - constexpr int blocks = 10; constexpr int blockSize = 256 * 32; constexpr int N = blockSize * blocks; diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index 345ae820f59e1..4db03da324ada 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -15,7 +15,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" -#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" #endif #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" @@ -24,12 +23,6 @@ int main(void) { #ifdef __CUDACC__ cms::cudatest::requireDevices(); - - if (cudautils::cudaDeviceCount() == 0) { - std::cerr << "No CUDA devices on this system" - << "\n"; - exit(EXIT_FAILURE); - } #endif using namespace gpuClustering; From 978ad6451f5e20152463b30c960b4746c7e7a8b5 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 21:46:23 +0100 Subject: [PATCH 22/29] Rename cudautils::CUDAEventCache -> cudautils::EventCache --- .../Common/interface/CUDAProductBase.h | 2 +- .../Common/test/test_CUDAProduct.cc | 4 ++-- .../CUDACore/interface/CUDAESProduct.h | 4 ++-- .../CUDACore/interface/CUDAScopedContext.h | 4 ++-- .../CUDACore/test/test_CUDAScopedContext.cc | 4 ++-- .../CUDAServices/src/CUDAService.cc | 6 ++--- .../{CUDAEventCache.h => EventCache.h} | 14 ++++++------ .../src/{CUDAEventCache.cc => EventCache.cc} | 22 +++++++++---------- 8 files changed, 30 insertions(+), 30 deletions(-) rename HeterogeneousCore/CUDAUtilities/interface/{CUDAEventCache.h => EventCache.h} (79%) rename HeterogeneousCore/CUDAUtilities/src/{CUDAEventCache.cc => EventCache.cc} (76%) diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index 6887c50f7751a..f01b508f94f8a 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -74,7 +74,7 @@ class CUDAProductBase { // The cudaStream_t is really shared among edm::Event products, so // using shared_ptr also here cudautils::SharedStreamPtr stream_; //! - // shared_ptr because of caching in CUDAEventCache + // shared_ptr because of caching in cudautils::EventCache cudautils::SharedEventPtr event_; //! 
// This flag tells whether the CUDA stream may be reused by a diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc index 8e334d336db88..0813577a70afc 100644 --- a/CUDADataFormats/Common/test/test_CUDAProduct.cc +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -5,7 +5,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include @@ -15,7 +15,7 @@ namespace cudatest { static CUDAScopedContextProduce make(int dev, bool createEvent) { cudautils::SharedEventPtr event; if (createEvent) { - event = cudautils::getCUDAEventCache().getCUDAEvent(); + event = cudautils::getEventCache().get(); } return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); } diff --git a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h index a6c54e0a2c8ce..2378bfec196cc 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h @@ -7,7 +7,7 @@ #include #include "FWCore/Utilities/interface/thread_safety_macros.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" @@ -18,7 +18,7 @@ class CUDAESProduct { public: CUDAESProduct() : gpuDataPerDevice_(cudautils::deviceCount()) { for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { - gpuDataPerDevice_[i].m_event = cudautils::getCUDAEventCache().getCUDAEvent(); + gpuDataPerDevice_[i].m_event = cudautils::getEventCache().get(); } } ~CUDAESProduct() = default; diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index f5dc53b785a05..01533c1a5d222 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -10,7 +10,7 @@ #include "FWCore/Utilities/interface/EDPutToken.h" #include "FWCore/Utilities/interface/StreamID.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" #include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" @@ -177,7 +177,7 @@ class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} // create the CUDA Event upfront to catch possible errors from its creation - cudautils::SharedEventPtr event_ = cudautils::getCUDAEventCache().getCUDAEvent(); + cudautils::SharedEventPtr event_ = cudautils::getEventCache().get(); }; /** diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 2a7d066e1d5e3..af16bc7dabf3e 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ 
b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -10,7 +10,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" @@ -22,7 +22,7 @@ namespace cudatest { static CUDAScopedContextProduce make(int dev, bool createEvent) { cudautils::SharedEventPtr event; if (createEvent) { - event = cudautils::getCUDAEventCache().getCUDAEvent(); + event = cudautils::getEventCache().get(); } return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); } diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index dbc32c6911564..51dff6cb8079b 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -11,7 +11,7 @@ #include "FWCore/Utilities/interface/ReusableObjectHolder.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" @@ -303,7 +303,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config) { cudautils::allocator::getCachingDeviceAllocator(); cudautils::allocator::getCachingHostAllocator(); } - cudautils::getCUDAEventCache().clear(); + cudautils::getEventCache().clear(); cudautils::getCUDAStreamCache().clear(); log << "CUDAService fully initialized"; @@ -322,7 +322,7 @@ CUDAService::~CUDAService() { cudautils::allocator::getCachingDeviceAllocator().FreeAllCached(); cudautils::allocator::getCachingHostAllocator().FreeAllCached(); } - cudautils::getCUDAEventCache().clear(); + cudautils::getEventCache().clear(); cudautils::getCUDAStreamCache().clear(); for (int i = 0; i < numberOfDevices_; ++i) { diff --git a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h b/HeterogeneousCore/CUDAUtilities/interface/EventCache.h similarity index 79% rename from HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h rename to HeterogeneousCore/CUDAUtilities/interface/EventCache.h index 8e5001b525351..5cf05b1146641 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/EventCache.h @@ -1,5 +1,5 @@ -#ifndef HeterogeneousCore_CUDAUtilities_CUDAEventCache_h -#define HeterogeneousCore_CUDAUtilities_CUDAEventCache_h +#ifndef HeterogeneousCore_CUDAUtilities_EventCache_h +#define HeterogeneousCore_CUDAUtilities_EventCache_h #include @@ -11,11 +11,11 @@ class CUDAService; namespace cudautils { - class CUDAEventCache { + class EventCache { public: using BareEvent = SharedEventPtr::element_type; - CUDAEventCache(); + EventCache(); // Gets a (cached) CUDA event for the current device. The event // will be returned to the cache by the shared_ptr destructor. 
The @@ -23,7 +23,7 @@ namespace cudautils { // cudaEventQuery() == cudaSuccess. // // This function is thread safe - SharedEventPtr getCUDAEvent(); + SharedEventPtr get(); private: friend class ::CUDAService; @@ -47,9 +47,9 @@ namespace cudautils { std::vector> cache_; }; - // Gets the global instance of a CUDAEventCache + // Gets the global instance of a EventCache // This function is thread safe - CUDAEventCache& getCUDAEventCache(); + EventCache& getEventCache(); } // namespace cudautils #endif diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc b/HeterogeneousCore/CUDAUtilities/src/EventCache.cc similarity index 76% rename from HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc rename to HeterogeneousCore/CUDAUtilities/src/EventCache.cc index 6c46f9456cafe..8903b385e8fe6 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CUDAEventCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/EventCache.cc @@ -1,23 +1,23 @@ #include "FWCore/Utilities/interface/thread_safety_macros.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" namespace cudautils { - void CUDAEventCache::Deleter::operator()(cudaEvent_t event) const { + void EventCache::Deleter::operator()(cudaEvent_t event) const { if (device_ != -1) { ScopedSetDevice deviceGuard{device_}; cudaCheck(cudaEventDestroy(event)); } } - // CUDAEventCache should be constructed by the first call to - // getCUDAEventCache() only if we have CUDA devices present - CUDAEventCache::CUDAEventCache() : cache_(cudautils::deviceCount()) {} + // EventCache should be constructed by the first call to + // getEventCache() only if we have CUDA devices present + EventCache::EventCache() : cache_(cudautils::deviceCount()) {} - SharedEventPtr CUDAEventCache::getCUDAEvent() { + SharedEventPtr EventCache::get() { const auto dev = cudautils::currentDevice(); auto event = makeOrGet(dev); auto ret = cudaEventQuery(event.get()); @@ -47,7 +47,7 @@ namespace cudautils { return event; } - SharedEventPtr CUDAEventCache::makeOrGet(int dev) { + SharedEventPtr EventCache::makeOrGet(int dev) { return cache_[dev].makeOrGet([dev]() { cudaEvent_t event; // it should be a bit faster to ignore timings @@ -56,19 +56,19 @@ namespace cudautils { }); } - void CUDAEventCache::clear() { + void EventCache::clear() { // Reset the contents of the caches, but leave an // edm::ReusableObjectHolder alive for each device. This is needed // mostly for the unit tests, where the function-static - // CUDAEventCache lives through multiple tests (and go through + // EventCache lives through multiple tests (and go through // multiple shutdowns of the framework). 
cache_.clear(); cache_.resize(cudautils::deviceCount()); } - CUDAEventCache& getCUDAEventCache() { + EventCache& getEventCache() { // the public interface is thread safe - CMS_THREAD_SAFE static CUDAEventCache cache; + CMS_THREAD_SAFE static EventCache cache; return cache; } } // namespace cudautils From 1d441e2cc720d0740d9c36472da9205c542935af Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 22:19:00 +0100 Subject: [PATCH 23/29] Rename cudautils::CUDAStreamCache -> cudautils::StreamCache --- .../Common/test/test_CUDAProduct.cc | 4 ++-- .../CUDACore/src/CUDAScopedContext.cc | 6 +++--- .../CUDACore/test/test_CUDAScopedContext.cc | 4 ++-- .../CUDAServices/src/CUDAService.cc | 8 ++++---- .../CUDAServices/test/testCUDAService.cpp | 1 - .../CUDATest/plugins/TestCUDAAnalyzerGPU.cc | 6 +++--- .../{CUDAStreamCache.h => StreamCache.h} | 14 ++++++------- .../{CUDAStreamCache.cc => StreamCache.cc} | 20 +++++++++---------- .../plugins/SiPixelRawToClusterGPUKernel.h | 2 +- 9 files changed, 32 insertions(+), 33 deletions(-) rename HeterogeneousCore/CUDAUtilities/interface/{CUDAStreamCache.h => StreamCache.h} (76%) rename HeterogeneousCore/CUDAUtilities/src/{CUDAStreamCache.cc => StreamCache.cc} (66%) diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc index 0813577a70afc..c86e1bea10d91 100644 --- a/CUDADataFormats/Common/test/test_CUDAProduct.cc +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -4,7 +4,7 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include @@ -17,7 +17,7 @@ namespace cudatest { if (createEvent) { event = cudautils::getEventCache().get(); } - return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + return CUDAScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); } }; } // namespace cudatest diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index 2d2a155a5bc11..54dcdfe7548b6 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -3,7 +3,7 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/Exception.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "chooseCUDADevice.h" @@ -40,7 +40,7 @@ namespace impl { CUDAScopedContextBase::CUDAScopedContextBase(edm::StreamID streamID) : currentDevice_(cudacore::chooseCUDADevice(streamID)) { cudaCheck(cudaSetDevice(currentDevice_)); - stream_ = cudautils::getCUDAStreamCache().getCUDAStream(); + stream_ = cudautils::getStreamCache().get(); } CUDAScopedContextBase::CUDAScopedContextBase(const CUDAProductBase& data) : currentDevice_(data.device()) { @@ -48,7 +48,7 @@ namespace impl { if (data.mayReuseStream()) { stream_ = data.streamPtr(); } else { - stream_ = cudautils::getCUDAStreamCache().getCUDAStream(); + stream_ = 
cudautils::getStreamCache().get(); } } diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index af16bc7dabf3e..f654b4d9551df 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -9,7 +9,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" @@ -24,7 +24,7 @@ namespace cudatest { if (createEvent) { event = cudautils::getEventCache().get(); } - return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + return CUDAScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); } }; } // namespace cudatest diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 51dff6cb8079b..74d6f6b79ec8b 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -12,7 +12,7 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" @@ -90,7 +90,7 @@ namespace { if (bufferSizes.empty()) return; - auto streamPtr = cudautils::getCUDAStreamCache().getCUDAStream(); + auto streamPtr = cudautils::getStreamCache().get(); std::vector > buffers; buffers.reserve(bufferSizes.size()); @@ -304,7 +304,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config) { cudautils::allocator::getCachingHostAllocator(); } cudautils::getEventCache().clear(); - cudautils::getCUDAStreamCache().clear(); + cudautils::getStreamCache().clear(); log << "CUDAService fully initialized"; enabled_ = true; @@ -323,7 +323,7 @@ CUDAService::~CUDAService() { cudautils::allocator::getCachingHostAllocator().FreeAllCached(); } cudautils::getEventCache().clear(); - cudautils::getCUDAStreamCache().clear(); + cudautils::getStreamCache().clear(); for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index 572e077606b0b..265703ccf5903 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -12,7 +12,6 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" namespace { CUDAService 
makeCUDAService(edm::ParameterSet ps) { diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc index 8fe5688018728..e8d4ade41be01 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc @@ -10,7 +10,7 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "TestCUDAAnalyzerGPUKernel.h" @@ -40,7 +40,7 @@ TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig) maxValue_(iConfig.getParameter("maxValue")) { edm::Service cs; if (cs->enabled()) { - auto streamPtr = cudautils::getCUDAStreamCache().getCUDAStream(); + auto streamPtr = cudautils::getStreamCache().get(); gpuAlgo_ = std::make_unique(streamPtr.get()); } } @@ -70,7 +70,7 @@ void TestCUDAAnalyzerGPU::analyze(edm::StreamID, edm::Event const& iEvent, edm:: void TestCUDAAnalyzerGPU::endJob() { edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob begin"; - auto streamPtr = cudautils::getCUDAStreamCache().getCUDAStream(); + auto streamPtr = cudautils::getStreamCache().get(); auto value = gpuAlgo_->value(streamPtr.get()); edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " accumulated value " << value; assert(minValue_ <= value && value <= maxValue_); diff --git a/HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h b/HeterogeneousCore/CUDAUtilities/interface/StreamCache.h similarity index 76% rename from HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h rename to HeterogeneousCore/CUDAUtilities/interface/StreamCache.h index 032e3f8745b14..92e4be75275c6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/StreamCache.h @@ -1,5 +1,5 @@ -#ifndef HeterogeneousCore_CUDAUtilities_CUDAStreamCache_h -#define HeterogeneousCore_CUDAUtilities_CUDAStreamCache_h +#ifndef HeterogeneousCore_CUDAUtilities_StreamCache_h +#define HeterogeneousCore_CUDAUtilities_StreamCache_h #include @@ -11,16 +11,16 @@ class CUDAService; namespace cudautils { - class CUDAStreamCache { + class StreamCache { public: using BareStream = SharedStreamPtr::element_type; - CUDAStreamCache(); + StreamCache(); // Gets a (cached) CUDA stream for the current device. The stream // will be returned to the cache by the shared_ptr destructor. 
// This function is thread safe - SharedStreamPtr getCUDAStream(); + SharedStreamPtr get(); private: friend class ::CUDAService; @@ -40,9 +40,9 @@ namespace cudautils { std::vector> cache_; }; - // Gets the global instance of a CUDAStreamCache + // Gets the global instance of a StreamCache // This function is thread safe - CUDAStreamCache& getCUDAStreamCache(); + StreamCache& getStreamCache(); } // namespace cudautils #endif diff --git a/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc b/HeterogeneousCore/CUDAUtilities/src/StreamCache.cc similarity index 66% rename from HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc rename to HeterogeneousCore/CUDAUtilities/src/StreamCache.cc index a4455e5733481..06a908b1cec1d 100644 --- a/HeterogeneousCore/CUDAUtilities/src/CUDAStreamCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/StreamCache.cc @@ -1,23 +1,23 @@ #include "FWCore/Utilities/interface/thread_safety_macros.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" namespace cudautils { - void CUDAStreamCache::Deleter::operator()(cudaStream_t stream) const { + void StreamCache::Deleter::operator()(cudaStream_t stream) const { if (device_ != -1) { ScopedSetDevice deviceGuard{device_}; cudaCheck(cudaStreamDestroy(stream)); } } - // CUDAStreamCache should be constructed by the first call to - // getCUDAStreamCache() only if we have CUDA devices present - CUDAStreamCache::CUDAStreamCache() : cache_(cudautils::deviceCount()) {} + // StreamCache should be constructed by the first call to + // getStreamCache() only if we have CUDA devices present + StreamCache::StreamCache() : cache_(cudautils::deviceCount()) {} - SharedStreamPtr CUDAStreamCache::getCUDAStream() { + SharedStreamPtr StreamCache::get() { const auto dev = cudautils::currentDevice(); return cache_[dev].makeOrGet([dev]() { cudaStream_t stream; @@ -26,19 +26,19 @@ namespace cudautils { }); } - void CUDAStreamCache::clear() { + void StreamCache::clear() { // Reset the contents of the caches, but leave an // edm::ReusableObjectHolder alive for each device. This is needed // mostly for the unit tests, where the function-static - // CUDAStreamCache lives through multiple tests (and go through + // StreamCache lives through multiple tests (and go through // multiple shutdowns of the framework). 
cache_.clear(); cache_.resize(cudautils::deviceCount()); } - CUDAStreamCache& getCUDAStreamCache() { + StreamCache& getStreamCache() { // the public interface is thread safe - CMS_THREAD_SAFE static CUDAStreamCache cache; + CMS_THREAD_SAFE static StreamCache cache; return cache; } } // namespace cudautils diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 888fc07953d9d..8a4e0b6f78696 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -187,7 +187,7 @@ namespace pixelgpudetails { // stream is still alive // // technically the statement above is not true anymore now that - // the CUDA streams are cached within the CUDAStreamCache, but it is + // the CUDA streams are cached within the cudautils::StreamCache, but it is // still better to release as early as possible nModules_Clusters_h.reset(); return std::make_pair(std::move(digis_d), std::move(clusters_d)); From 20f7926790508722d6bfc4f9d31ddb2c7e3d81a6 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 22:23:17 +0100 Subject: [PATCH 24/29] Rename cudautils::eventIsOccurred() -> cudautils::eventWorkHasCompleted() --- CUDADataFormats/Common/src/CUDAProductBase.cc | 4 +-- .../CUDACore/interface/CUDAESProduct.h | 4 +-- .../CUDACore/test/test_CUDAScopedContext.cc | 4 +-- .../CUDAUtilities/interface/EventCache.h | 4 +-- .../CUDAUtilities/interface/eventIsOccurred.h | 22 -------------- .../interface/eventWorkHasCompleted.h | 30 +++++++++++++++++++ 6 files changed, 38 insertions(+), 30 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc index eda6ee99d13f5..9510603eb5d10 100644 --- a/CUDADataFormats/Common/src/CUDAProductBase.cc +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -1,12 +1,12 @@ #include "CUDADataFormats/Common/interface/CUDAProductBase.h" -#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" bool CUDAProductBase::isAvailable() const { // if default-constructed, the product is not available if (not event_) { return false; } - return cudautils::eventIsOccurred(event_.get()); + return cudautils::eventWorkHasCompleted(event_.get()); } CUDAProductBase::~CUDAProductBase() { diff --git a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h index 2378bfec196cc..5ef2399f96ea0 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h @@ -11,7 +11,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" -#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" template class CUDAESProduct { @@ -47,7 +47,7 @@ class CUDAESProduct { // Someone else is filling // Check first if the recorded event has occurred - if (cudautils::eventIsOccurred(data.m_event.get())) { + if 
(cudautils::eventWorkHasCompleted(data.m_event.get())) { // It was, so data is accessible from all CUDA streams on // the device. Set the 'filled' for all subsequent calls and // return the value diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index f654b4d9551df..c0bb7656ba258 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -7,7 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" @@ -112,7 +112,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); cudaCheck(cudaStreamSynchronize(ctx2.stream())); REQUIRE(wprod2->isAvailable()); - REQUIRE(cudautils::eventIsOccurred(wprod2->event())); + REQUIRE(cudautils::eventWorkHasCompleted(wprod2->event())); h_a1 = 0; h_a2 = 0; diff --git a/HeterogeneousCore/CUDAUtilities/interface/EventCache.h b/HeterogeneousCore/CUDAUtilities/interface/EventCache.h index 5cf05b1146641..2828a7ab50417 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/EventCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/EventCache.h @@ -19,8 +19,8 @@ namespace cudautils { // Gets a (cached) CUDA event for the current device. The event // will be returned to the cache by the shared_ptr destructor. The - // returned event is guaranteed to be "occurred", i.e. - // cudaEventQuery() == cudaSuccess. + // returned event is guaranteed to be in the state where all + // captured work has completed, i.e. cudaEventQuery() == cudaSuccess. 
// // This function is thread safe SharedEventPtr get(); diff --git a/HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h b/HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h deleted file mode 100644 index 60be11dd83a6a..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_eventIsOccurred_h -#define HeterogeneousCore_CUDAUtilities_eventIsOccurred_h - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -#include - -namespace cudautils { - inline bool eventIsOccurred(cudaEvent_t event) { - const auto ret = cudaEventQuery(event); - if (ret == cudaSuccess) { - return true; - } else if (ret == cudaErrorNotReady) { - return false; - } - // leave error case handling to cudaCheck - cudaCheck(ret); - return false; // to keep compiler happy - } -} // namespace cudautils - -#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h b/HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h new file mode 100644 index 0000000000000..ef05d9cfbd951 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h @@ -0,0 +1,30 @@ +#ifndef HeterogeneousCore_CUDAUtilities_eventWorkHasCompleted_h +#define HeterogeneousCore_CUDAUtilities_eventWorkHasCompleted_h + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include + +namespace cudautils { + /** + * Returns true if the work captured by the event (=queued to the + * CUDA stream at the point of cudaEventRecord()) has completed. + * + * Returns false if any captured work is incomplete. + * + * In case of errors, throws an exception. + */ + inline bool eventWorkHasCompleted(cudaEvent_t event) { + const auto ret = cudaEventQuery(event); + if (ret == cudaSuccess) { + return true; + } else if (ret == cudaErrorNotReady) { + return false; + } + // leave error case handling to cudaCheck + cudaCheck(ret); + return false; // to keep compiler happy + } +} // namespace cudautils + +#endif From 034b34a27ae7902e2dcc421c52fe4c64ba227b3b Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 7 Jan 2020 23:01:30 +0100 Subject: [PATCH 25/29] Use eventWorkHasCompleted() in EventCache --- .../CUDAUtilities/src/EventCache.cc | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/src/EventCache.cc b/HeterogeneousCore/CUDAUtilities/src/EventCache.cc index 8903b385e8fe6..0a5474d7f4aa0 100644 --- a/HeterogeneousCore/CUDAUtilities/src/EventCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/EventCache.cc @@ -3,6 +3,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" namespace cudautils { @@ -20,30 +21,24 @@ namespace cudautils { SharedEventPtr EventCache::get() { const auto dev = cudautils::currentDevice(); auto event = makeOrGet(dev); - auto ret = cudaEventQuery(event.get()); - // event is occurred, return immediately - if (ret == cudaSuccess) { + // captured work has completed, or a just-created event + if (eventWorkHasCompleted(event.get())) { return event; } - // return code is something else than "recorded", throw exception - if (ret != cudaErrorNotReady) { - cudaCheck(ret); - } - // Got recorded, but not 
yet occurred event. Try until we get an - // occurred event. Need to keep all recorded events until an - // occurred event is found in order to avoid ping-pong with a - // recorded event. + // Got an event with incomplete captured work. Try again until we + // get a completed (or a just-created) event. Need to keep all + // incomplete events until a completed event is found in order to + // avoid ping-pong with an incomplete event. std::vector ptrs{std::move(event)}; + bool completed; do { event = makeOrGet(dev); - ret = cudaEventQuery(event.get()); - if (ret == cudaErrorNotReady) { + completed = eventWorkHasCompleted(event.get()); + if (not completed) { ptrs.emplace_back(std::move(event)); - } else if (ret != cudaSuccess) { - cudaCheck(ret); } - } while (ret != cudaSuccess); + } while (not completed); return event; } From f51beedb13c0de5654fb9a6fc5482ba0bd14cec6 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 8 Jan 2020 17:11:33 +0100 Subject: [PATCH 26/29] Rename all core CUDAX -> cms::cuda::X, and test CUDAX -> cms::cudatest::X --- .../Common/interface/CUDAProduct.h | 55 ---- .../Common/interface/CUDAProductBase.h | 89 ------- CUDADataFormats/Common/interface/Product.h | 60 +++++ .../Common/interface/ProductBase.h | 93 +++++++ CUDADataFormats/Common/src/CUDAProductBase.cc | 27 -- CUDADataFormats/Common/src/ProductBase.cc | 29 +++ .../{test_CUDAProduct.cc => test_Product.cc} | 28 +- HeterogeneousCore/CUDACore/README.md | 240 ++++++++--------- .../CUDACore/interface/CUDAContextState.h | 57 ---- .../CUDACore/interface/CUDAESProduct.h | 99 ------- .../CUDACore/interface/CUDAScopedContext.h | 243 ------------------ .../CUDACore/interface/ContextState.h | 61 +++++ .../CUDACore/interface/ESProduct.h | 103 ++++++++ .../CUDACore/interface/ScopedContext.h | 242 +++++++++++++++++ .../CUDACore/src/CUDAContextState.cc | 14 - .../CUDACore/src/CUDAScopedContext.cc | 117 --------- .../CUDACore/src/ContextState.cc | 16 ++ .../CUDACore/src/ScopedContext.cc | 118 +++++++++ .../CUDACore/src/chooseCUDADevice.h | 10 - .../{chooseCUDADevice.cc => chooseDevice.cc} | 8 +- HeterogeneousCore/CUDACore/src/chooseDevice.h | 10 + .../CUDACore/test/testStreamEvent.cu | 2 +- .../test/test_CUDAScopedContextKernels.cu | 13 - .../test/test_CUDAScopedContextKernels.h | 9 - ...ScopedContext.cc => test_ScopedContext.cc} | 52 ++-- .../test/test_ScopedContextKernels.cu | 17 ++ .../CUDACore/test/test_ScopedContextKernels.h | 13 + .../CUDATest/interface/CUDAThing.h | 18 -- HeterogeneousCore/CUDATest/interface/Thing.h | 21 ++ .../CUDATest/plugins/TestCUDAAnalyzerGPU.cc | 16 +- .../CUDATest/plugins/TestCUDAProducerGPU.cc | 24 +- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 26 +- .../plugins/TestCUDAProducerGPUEWTask.cc | 35 +-- .../plugins/TestCUDAProducerGPUFirst.cc | 15 +- .../plugins/TestCUDAProducerGPUtoCPU.cc | 16 +- HeterogeneousCore/CUDATest/src/classes.h | 4 +- .../CUDATest/src/classes_def.xml | 4 +- .../test/test_TestCUDAProducerGPUFirst.cc | 12 +- 38 files changed, 1025 insertions(+), 991 deletions(-) delete mode 100644 CUDADataFormats/Common/interface/CUDAProduct.h delete mode 100644 CUDADataFormats/Common/interface/CUDAProductBase.h create mode 100644 CUDADataFormats/Common/interface/Product.h create mode 100644 CUDADataFormats/Common/interface/ProductBase.h delete mode 100644 CUDADataFormats/Common/src/CUDAProductBase.cc create mode 100644 CUDADataFormats/Common/src/ProductBase.cc rename CUDADataFormats/Common/test/{test_CUDAProduct.cc => test_Product.cc} (62%) delete mode 100644 
HeterogeneousCore/CUDACore/interface/CUDAContextState.h delete mode 100644 HeterogeneousCore/CUDACore/interface/CUDAESProduct.h delete mode 100644 HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h create mode 100644 HeterogeneousCore/CUDACore/interface/ContextState.h create mode 100644 HeterogeneousCore/CUDACore/interface/ESProduct.h create mode 100644 HeterogeneousCore/CUDACore/interface/ScopedContext.h delete mode 100644 HeterogeneousCore/CUDACore/src/CUDAContextState.cc delete mode 100644 HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc create mode 100644 HeterogeneousCore/CUDACore/src/ContextState.cc create mode 100644 HeterogeneousCore/CUDACore/src/ScopedContext.cc delete mode 100644 HeterogeneousCore/CUDACore/src/chooseCUDADevice.h rename HeterogeneousCore/CUDACore/src/{chooseCUDADevice.cc => chooseDevice.cc} (81%) create mode 100644 HeterogeneousCore/CUDACore/src/chooseDevice.h delete mode 100644 HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu delete mode 100644 HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h rename HeterogeneousCore/CUDACore/test/{test_CUDAScopedContext.cc => test_ScopedContext.cc} (69%) create mode 100644 HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu create mode 100644 HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h delete mode 100644 HeterogeneousCore/CUDATest/interface/CUDAThing.h create mode 100644 HeterogeneousCore/CUDATest/interface/Thing.h diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h deleted file mode 100644 index 9862d9c79bcd0..0000000000000 --- a/CUDADataFormats/Common/interface/CUDAProduct.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef CUDADataFormats_Common_CUDAProduct_h -#define CUDADataFormats_Common_CUDAProduct_h - -#include - -#include "CUDADataFormats/Common/interface/CUDAProductBase.h" - -namespace edm { - template - class Wrapper; -} -namespace impl { - class CUDAScopedContextGetterBase; -} - -/** - * The purpose of this class is to wrap CUDA data to edm::Event in a - * way which forces correct use of various utilities. - * - * The non-default construction has to be done with CUDAScopedContext - * (in order to properly register the CUDA event). - * - * The default constructor is needed only for the ROOT dictionary generation. - * - * The CUDA event is in practice needed only for stream-stream - * synchronization, but someone with long-enough lifetime has to own - * it. Here is a somewhat natural place. If overhead is too much, we - * can use them only where synchronization between streams is needed. - */ -template -class CUDAProduct : public CUDAProductBase { -public: - CUDAProduct() = default; // Needed only for ROOT dictionary generation - - CUDAProduct(const CUDAProduct&) = delete; - CUDAProduct& operator=(const CUDAProduct&) = delete; - CUDAProduct(CUDAProduct&&) = default; - CUDAProduct& operator=(CUDAProduct&&) = default; - -private: - friend class impl::CUDAScopedContextGetterBase; - friend class CUDAScopedContextProduce; - friend class edm::Wrapper>; - - explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, T data) - : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} - - template - explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, Args&&... args) - : CUDAProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} - - T data_; //! 
-}; - -#endif diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h deleted file mode 100644 index f01b508f94f8a..0000000000000 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef CUDADataFormats_Common_CUDAProductBase_h -#define CUDADataFormats_Common_CUDAProductBase_h - -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" - -namespace impl { - class CUDAScopedContextBase; -} - -/** - * Base class for all instantiations of CUDA to hold the - * non-T-dependent members. - */ -class CUDAProductBase { -public: - CUDAProductBase() = default; // Needed only for ROOT dictionary generation - ~CUDAProductBase(); - - CUDAProductBase(const CUDAProductBase&) = delete; - CUDAProductBase& operator=(const CUDAProductBase&) = delete; - CUDAProductBase(CUDAProductBase&& other) - : stream_{std::move(other.stream_)}, - event_{std::move(other.event_)}, - mayReuseStream_{other.mayReuseStream_.load()}, - device_{other.device_} {} - CUDAProductBase& operator=(CUDAProductBase&& other) { - stream_ = std::move(other.stream_); - event_ = std::move(other.event_); - mayReuseStream_ = other.mayReuseStream_.load(); - device_ = other.device_; - return *this; - } - - bool isValid() const { return stream_.get() != nullptr; } - bool isAvailable() const; - - int device() const { return device_; } - - // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the CUDAScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. - cudaStream_t stream() const { return stream_.get(); } - - // cudaEvent_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the CUDAScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. - cudaEvent_t event() const { return event_.get(); } - -protected: - explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) - : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} - -private: - friend class impl::CUDAScopedContextBase; - friend class CUDAScopedContextProduce; - - // The following function is intended to be used only from CUDAScopedContext - const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } - - bool mayReuseStream() const { - bool expected = true; - bool changed = mayReuseStream_.compare_exchange_strong(expected, false); - // If the current thread is the one flipping the flag, it may - // reuse the stream. - return changed; - } - - // The cudaStream_t is really shared among edm::Event products, so - // using shared_ptr also here - cudautils::SharedStreamPtr stream_; //! - // shared_ptr because of caching in cudautils::EventCache - cudautils::SharedEventPtr event_; //! - - // This flag tells whether the CUDA stream may be reused by a - // consumer or not. The goal is to have a "chain" of modules to - // queue their work to the same stream. - mutable std::atomic mayReuseStream_ = true; //! - - // The CUDA device associated with this product - int device_ = -1; //! 
-}; - -#endif diff --git a/CUDADataFormats/Common/interface/Product.h b/CUDADataFormats/Common/interface/Product.h new file mode 100644 index 0000000000000..70eae630b3ce3 --- /dev/null +++ b/CUDADataFormats/Common/interface/Product.h @@ -0,0 +1,60 @@ +#ifndef CUDADataFormats_Common_Product_h +#define CUDADataFormats_Common_Product_h + +#include + +#include "CUDADataFormats/Common/interface/ProductBase.h" + +namespace edm { + template + class Wrapper; +} + +namespace cms { + namespace cuda { + namespace impl { + class ScopedContextGetterBase; + } + + /** + * The purpose of this class is to wrap CUDA data to edm::Event in a + * way which forces correct use of various utilities. + * + * The non-default construction has to be done with cms::cuda::ScopedContext + * (in order to properly register the CUDA event). + * + * The default constructor is needed only for the ROOT dictionary generation. + * + * The CUDA event is in practice needed only for stream-stream + * synchronization, but someone with long-enough lifetime has to own + * it. Here is a somewhat natural place. If overhead is too much, we + * can use them only where synchronization between streams is needed. + */ + template + class Product : public ProductBase { + public: + Product() = default; // Needed only for ROOT dictionary generation + + Product(const Product&) = delete; + Product& operator=(const Product&) = delete; + Product(Product&&) = default; + Product& operator=(Product&&) = default; + + private: + friend class impl::ScopedContextGetterBase; + friend class ScopedContextProduce; + friend class edm::Wrapper>; + + explicit Product(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, T data) + : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} + + template + explicit Product(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, Args&&... args) + : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} + + T data_; //! + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h new file mode 100644 index 0000000000000..69e0770195608 --- /dev/null +++ b/CUDADataFormats/Common/interface/ProductBase.h @@ -0,0 +1,93 @@ +#ifndef CUDADataFormats_Common_ProductBase_h +#define CUDADataFormats_Common_ProductBase_h + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" + +namespace cms { + namespace cuda { + namespace impl { + class ScopedContextBase; + } + + /** + * Base class for all instantiations of CUDA to hold the + * non-T-dependent members. 
+ */ + class ProductBase { + public: + ProductBase() = default; // Needed only for ROOT dictionary generation + ~ProductBase(); + + ProductBase(const ProductBase&) = delete; + ProductBase& operator=(const ProductBase&) = delete; + ProductBase(ProductBase&& other) + : stream_{std::move(other.stream_)}, + event_{std::move(other.event_)}, + mayReuseStream_{other.mayReuseStream_.load()}, + device_{other.device_} {} + ProductBase& operator=(ProductBase&& other) { + stream_ = std::move(other.stream_); + event_ = std::move(other.event_); + mayReuseStream_ = other.mayReuseStream_.load(); + device_ = other.device_; + return *this; + } + + bool isValid() const { return stream_.get() != nullptr; } + bool isAvailable() const; + + int device() const { return device_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the cms::cuda::ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + + // cudaEvent_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the cms::cuda::ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaEvent_t event() const { return event_.get(); } + + protected: + explicit ProductBase(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) + : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} + + private: + friend class impl::ScopedContextBase; + friend class ScopedContextProduce; + + // The following function is intended to be used only from cms::cuda::ScopedContext + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + // The cudaStream_t is really shared among edm::Event products, so + // using shared_ptr also here + cudautils::SharedStreamPtr stream_; //! + // shared_ptr because of caching in cudautils::EventCache + cudautils::SharedEventPtr event_; //! + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; //! + + // The CUDA device associated with this product + int device_ = -1; //! + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc deleted file mode 100644 index 9510603eb5d10..0000000000000 --- a/CUDADataFormats/Common/src/CUDAProductBase.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "CUDADataFormats/Common/interface/CUDAProductBase.h" -#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" - -bool CUDAProductBase::isAvailable() const { - // if default-constructed, the product is not available - if (not event_) { - return false; - } - return cudautils::eventWorkHasCompleted(event_.get()); -} - -CUDAProductBase::~CUDAProductBase() { - // Make sure that the production of the product in the GPU is - // complete before destructing the product. This is to make sure - // that the EDM stream does not move to the next event before all - // asynchronous processing of the current is complete. 
- - // TODO: a callback notifying a WaitingTaskHolder (or similar) - // would avoid blocking the CPU, but would also require more work. - // - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. - if (event_) { - cudaEventSynchronize(event_.get()); - } -} diff --git a/CUDADataFormats/Common/src/ProductBase.cc b/CUDADataFormats/Common/src/ProductBase.cc new file mode 100644 index 0000000000000..653d6a21b4add --- /dev/null +++ b/CUDADataFormats/Common/src/ProductBase.cc @@ -0,0 +1,29 @@ +#include "CUDADataFormats/Common/interface/ProductBase.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" + +namespace cms::cuda { + bool ProductBase::isAvailable() const { + // if default-constructed, the product is not available + if (not event_) { + return false; + } + return cudautils::eventWorkHasCompleted(event_.get()); + } + + ProductBase::~ProductBase() { + // Make sure that the production of the product in the GPU is + // complete before destructing the product. This is to make sure + // that the EDM stream does not move to the next event before all + // asynchronous processing of the current is complete. + + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + if (event_) { + cudaEventSynchronize(event_.get()); + } + } +} // namespace cms::cuda diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_Product.cc similarity index 62% rename from CUDADataFormats/Common/test/test_CUDAProduct.cc rename to CUDADataFormats/Common/test/test_Product.cc index c86e1bea10d91..163373f82871e 100644 --- a/CUDADataFormats/Common/test/test_CUDAProduct.cc +++ b/CUDADataFormats/Common/test/test_Product.cc @@ -1,7 +1,7 @@ #include "catch.hpp" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" @@ -9,22 +9,22 @@ #include -namespace cudatest { - class TestCUDAScopedContext { +namespace cms::cudatest { + class TestScopedContext { public: - static CUDAScopedContextProduce make(int dev, bool createEvent) { + static cuda::ScopedContextProduce make(int dev, bool createEvent) { cudautils::SharedEventPtr event; if (createEvent) { event = cudautils::getEventCache().get(); } - return CUDAScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); + return cuda::ScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); } }; -} // namespace cudatest +} // namespace cms::cudatest -TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { +TEST_CASE("Use of cms::cuda::Product template", "[CUDACore]") { SECTION("Default constructed") { - auto foo = CUDAProduct(); + auto foo = cms::cuda::Product(); REQUIRE(!foo.isValid()); auto bar = std::move(foo); @@ -37,11 +37,11 @@ TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { constexpr int defaultDevice = 0; cudaCheck(cudaSetDevice(defaultDevice)); { 
- auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); - std::unique_ptr> dataPtr = ctx.wrap(10); + auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true); + std::unique_ptr> dataPtr = ctx.wrap(10); auto& data = *dataPtr; - SECTION("Construct from CUDAScopedContext") { + SECTION("Construct from cms::cuda::ScopedContext") { REQUIRE(data.isValid()); REQUIRE(data.device() == defaultDevice); REQUIRE(data.stream() == ctx.stream()); @@ -49,13 +49,13 @@ TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { } SECTION("Move constructor") { - auto data2 = CUDAProduct(std::move(data)); + auto data2 = cms::cuda::Product(std::move(data)); REQUIRE(data2.isValid()); REQUIRE(!data.isValid()); } SECTION("Move assignment") { - CUDAProduct data2; + cms::cuda::Product data2; data2 = std::move(data); REQUIRE(data2.isValid()); REQUIRE(!data.isValid()); diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index 3948ae7e59f79..1e733b2afec71 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -77,7 +77,7 @@ This page documents the CUDA integration within CMSSW * Convert the output SoA to legacy CPU data formats 3. Within `acquire()`/`produce()`, the current CUDA device is set implicitly and the CUDA stream is provided by the system (with - `CUDAScopedContextAcquire`/`CUDAScopedContextProduce`) + `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`) * It is strongly recommended to use the provided CUDA stream for all operations * If that is not feasible for some reason, the provided CUDA stream must synchronize with the work queued on other CUDA @@ -114,12 +114,12 @@ private: ... void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Sets the current device and creates a CUDA stream - CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; auto const& inputData = iEvent.get(inputToken_); // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by CUDAScopedContextAcquire::stream() + // returned by cms::cuda::ScopedContextAcquire::stream() gpuAlgo_.makeAsync(inputData, ctx.stream()); // Destructor of ctx queues a callback to the CUDA stream notifying @@ -148,18 +148,18 @@ private: ... ProducerOutputGPUAlgo gpuAlgo_; edm::EDGetTokenT inputToken_; - edm::EDPutTokenT> outputToken_; - CUDAContextState ctxState_; + edm::EDPutTokenT> outputToken_; + cms::cuda::ContextState ctxState_; }; ... 
void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Sets the current device and creates a CUDA stream - CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; + cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; auto const& inputData = iEvent.get(inputToken_); // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by CUDAScopedContextAcquire::stream() + // returned by cms::cuda::ScopedContextAcquire::stream() gpuAlgo.makeAsync(inputData, ctx.stream()); // Destructor of ctx queues a callback to the CUDA stream notifying @@ -170,11 +170,11 @@ void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const // Called after the asynchronous work has finished void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // Sets again the current device, uses the CUDA stream created in the acquire() - CUDAScopedContextProduce ctx{ctxState_}; + cms::cuda::ScopedContextProduce ctx{ctxState_}; // Now getResult() returns data in GPU memory that is passed to the - // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the - // OutputData to CUDAProduct. CUDAProduct stores also + // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the + // OutputData to cms::cuda::Product. cms::cuda::Product stores also // the current device and the CUDA stream since those will be needed // in the consumer side. ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); @@ -193,21 +193,21 @@ public: private: ... ProducerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDGetTokenT> otherInputToken_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; edm::EDPutTokenT outputToken_; }; ... void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce // InputData, and possibly use the same CUDA stream - CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; // Grab the real input data. Checks that the input data is on the // current device. If the input data was produced in a different CUDA - // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream // synchronization point with CUDA event and cudaStreamWaitEvent() auto const& inputData = ctx.get(inputDataWrapped); @@ -218,7 +218,7 @@ void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetu // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by CUDAScopedContextAcquire::stream() + // returned by cms::cuda::ScopedContextAcquire::stream() gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream()); // Destructor of ctx queues a callback to the CUDA stream notifying @@ -235,8 +235,8 @@ void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { ``` See [further below](#setting-the-current-device) for the conditions -when the `CUDAScopedContextAcquire` constructor reuses the CUDA stream. 
Note -that the `CUDAScopedContextAcquire` constructor taking `edm::StreamID` is +when the `cms::cuda::ScopedContextAcquire` constructor reuses the CUDA stream. Note +that the `cms::cuda::ScopedContextAcquire` constructor taking `edm::StreamID` is allowed, it will just always create a new CUDA stream. @@ -252,25 +252,25 @@ public: private: ... ProducerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDPutTokenT> outputToken_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; }; ... void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce // InputData, and also use the same CUDA stream - CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; // Grab the real input data. Checks that the input data is on the // current device. If the input data was produced in a different CUDA - // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream // synchronization point with CUDA event and cudaStreamWaitEvent() auto const& inputData = ctx.get(inputDataWrapped); // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by CUDAScopedContextAcquire::stream() + // returned by cms::cuda::ScopedContextAcquire::stream() gpuAlgo.makeAsync(inputData, ctx.stream()); // Destructor of ctx queues a callback to the CUDA stream notifying @@ -281,11 +281,11 @@ void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& // Called after the asynchronous work has finished void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { // Sets again the current device, uses the CUDA stream created in the acquire() - CUDAScopedContextProduce ctx{ctxState_}; + cms::cuda::ScopedContextProduce ctx{ctxState_}; // Now getResult() returns data in GPU memory that is passed to the - // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the - // OutputData to CUDAProduct. CUDAProduct stores also + // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the + // OutputData to cms::cuda::Product. cms::cuda::Product stores also // the current device and the CUDA stream since those will be needed // in the consumer side. ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); @@ -309,25 +309,25 @@ private: ... ProducerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDPutTokenT> outputToken_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; }; ... void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce // InputData, and also use the same CUDA stream - CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; // Grab the real input data. 
Checks that the input data is on the // current device. If the input data was produced in a different CUDA - // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream // synchronization point with CUDA event and cudaStreamWaitEvent() auto const& inputData = ctx.get(inputDataWrapped); // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by CUDAScopedContextAcquire::stream() + // returned by cms::cuda::ScopedContextAcquire::stream() gpuAlgo.makeAsync(inputData, ctx.stream()); // Push a functor on top of "a stack of tasks" to be run as a next @@ -336,7 +336,7 @@ void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& // current device and CUDA stream have been already set up. The ctx // internally holds the WaitingTaskWithArenaHolder for the next task. - ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) { addMoreWork(ctx); }); @@ -346,11 +346,11 @@ void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& } // Called after the asynchronous work queued in acquire() has finished -void ProducerInputOutputCUDA::addMoreWork(CUDAScopedContextTask& ctx) { +void ProducerInputOutputCUDA::addMoreWork(cms::cuda::ScopedContextTask& ctx) { // Current device and CUDA stream have already been set // Queues more asynchronous data transfer and kernels to the CUDA - // stream returned by CUDAScopedContextTask::stream() + // stream returned by cms::cuda::ScopedContextTask::stream() gpuAlgo.makeMoreAsync(ctx.stream()); // Destructor of ctx queues a callback to the CUDA stream notifying @@ -360,11 +360,11 @@ void ProducerInputOutputCUDA::addMoreWork(CUDAScopedContextTask& ctx) { // Called after the asynchronous work queued in addMoreWork() has finished void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { // Sets again the current device, uses the CUDA stream created in the acquire() - CUDAScopedContextProduce ctx{ctxState_}; + cms::cuda::ScopedContextProduce ctx{ctxState_}; // Now getResult() returns data in GPU memory that is passed to the - // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the - // OutputData to CUDAProduct. CUDAProduct stores also + // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the + // OutputData to cms::cuda::Product. cms::cuda::Product stores also // the current device and the CUDA stream since those will be needed // in the consumer side. ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); @@ -389,28 +389,28 @@ public: private: ... ProducerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDPutTokenT> outputToken_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; }; ... void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const { - CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce // InputData, and possibly use the same CUDA stream - CUDAScopedContextProduce ctx{inputDataWrapped}; + cms::cuda::ScopedContextProduce ctx{inputDataWrapped}; // Grab the real input data. Checks that the input data is on the // current device. 
If the input data was produced in a different CUDA - // stream than the CUDAScopedContextProduce holds, create an inter-stream + // stream than the cms::cuda::ScopedContextProduce holds, create an inter-stream // synchronization point with CUDA event and cudaStreamWaitEvent() auto const& inputData = ctx.get(inputDataWrapped); // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by CUDAScopedContextProduce::stream(). Here makeAsync() also + // returned by cms::cuda::ScopedContextProduce::stream(). Here makeAsync() also // returns data in GPU memory that is passed to the constructor of - // OutputData. CUDAScopedContextProduce::emplace() wraps the OutputData to - // CUDAProduct. CUDAProduct stores also the current + // OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the OutputData to + // cms::cuda::Product. cms::cuda::Product stores also the current // device and the CUDA stream since those will be needed in the // consumer side. ctx.emplace(iEvent, outputToken, gpuAlgo.makeAsync(inputData, ctx.stream()); @@ -441,20 +441,20 @@ public: private: ... AnalyzerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDGetTokenT> otherInputToken_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; }; ... void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetup) { - CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce // InputData, and possibly use the same CUDA stream - CUDAScopedContextAnalyze ctx{inputDataWrapped}; + cms::cuda::ScopedContextAnalyze ctx{inputDataWrapped}; // Grab the real input data. Checks that the input data is on the // current device. If the input data was produced in a different CUDA - // stream than the CUDAScopedContextAnalyze holds, create an inter-stream + // stream than the cms::cuda::ScopedContextAnalyze holds, create an inter-stream // synchronization point with CUDA event and cudaStreamWaitEvent() auto const& inputData = ctx.get(inputDataWrapped); @@ -465,7 +465,7 @@ void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetu // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by CUDAScopedContextAnalyze::stream() + // returned by cms::cuda::ScopedContextAnalyze::stream() gpuAlgo.analyzeAsync(inputData, otherInputData, ctx.stream()); } ``` @@ -530,13 +530,13 @@ GPU modules is done at the beginning of the job. For multi-GPU setup the device is chosen in the first CUDA module in a chain of modules by one of the constructors of -`CUDAScopedContextAcquire`/`CUDAScopedContextProduce` +`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce` ```cpp // In ExternalWork acquire() -CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; +cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...}; // In normal produce() (or filter()) -CUDAScopedContextProduce ctx{iEvent.streamID()}; +cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; ``` As the choice is still the static EDM stream to device assignment, the EDM stream ID is needed. The logic will likely evolve in the future to @@ -548,18 +548,18 @@ full event. The "GPU data product" should be a class/struct containing smart pointer(s) to device data (see [Memory allocation](#memory-allocation)). 
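For illustration, such a GPU data product could look roughly like the sketch below. The `cudautils::device::unique_ptr` type and the header path are assumptions here, standing in for whichever device smart pointer the memory-allocation utilities provide; the essential points are only that the product owns device memory through smart pointers and is moveable.

```cpp
#include <cstdint>
#include <utility>

// Header path assumed for the device smart pointer utilities
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

class GPUClusters {
public:
  GPUClusters() = default;
  GPUClusters(cudautils::device::unique_ptr<float[]> data, uint32_t nClusters)
      : data_(std::move(data)), nClusters_(nClusters) {}

  // moveable, as needed for wrapping into the product template discussed below
  GPUClusters(GPUClusters&&) = default;
  GPUClusters& operator=(GPUClusters&&) = default;

  float const* data() const { return data_.get(); }  // pointer to device memory
  uint32_t nClusters() const { return nClusters_; }

private:
  cudautils::device::unique_ptr<float[]> data_;  // owns device memory
  uint32_t nClusters_ = 0;
};
```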
When putting the data to event, the data is wrapped to -`CUDAProduct` template, which holds +`cms::cuda::Product` template, which holds * the GPU data product * must be moveable, but no other restrictions * the current device where the data was produced, and the CUDA stream the data was produced with * [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) -Note that the `CUDAProduct` wrapper can be constructed only with -`CUDAScopedContextProduce::wrap()`, and the data `T` can be obtained +Note that the `cms::cuda::Product` wrapper can be constructed only with +`cms::cuda::ScopedContextProduce::wrap()`, and the data `T` can be obtained from it only with -`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, +`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`, as described further below. When putting the data product directly to -`edm::Event`, also `CUDASCopedContextProduce::emplace()` can be used. +`edm::Event`, also `cms::cuda::SCopedContextProduce::emplace()` can be used. The GPU data products that depend on the CUDA runtime should be placed under `CUDADataFormats` package, using the same name for sub-package @@ -624,48 +624,48 @@ synchronization should be explicitly amortized e.g. by caching. #### Setting the current device -A CUDA producer should construct `CUDAScopedContextAcquire` in -`acquire()` (`CUDAScopedContextProduce` `produce()` if not using +A CUDA producer should construct `cms::cuda::ScopedContextAcquire` in +`acquire()` (`cms::cuda::ScopedContextProduce` `produce()` if not using `ExternalWork`) either with `edm::StreamID`, or with a -`CUDAProduct` read as an input. +`cms::cuda::Product` read as an input. ```cpp // From edm::StreamID -CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; +cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...}; // or -CUDAScopedContextProduce ctx{iEvent.streamID()}; +cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; -// From CUDAProduct -CUDAProduct const& cclus = iEvent.get(srcToken_); -CUDAScopedContextAcquire ctx{cclus, ...}; +// From cms::cuda::Product +cms::cuda::Product const& cclus = iEvent.get(srcToken_); +cms::cuda::ScopedContextAcquire ctx{cclus, ...}; // or -CUDAScopedContextProduce ctx{cclus}; +cms::cuda::ScopedContextProduce ctx{cclus}; ``` -A CUDA analyzer should construct `CUDAScopedContextAnalyze` with a -`CUDAProduct` read as an input. +A CUDA analyzer should construct `cms::cuda::ScopedContextAnalyze` with a +`cms::cuda::Product` read as an input. 
```cpp -CUDAProduct const& cclus = iEvent.get(srcToken_); -CUDAScopedContextAnalyze ctx{cclus}; +cms::cuda::Product const& cclus = iEvent.get(srcToken_); +cms::cuda::ScopedContextAnalyze ctx{cclus}; ``` -`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze` work in the RAII way and does the following +`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze` work in the RAII way and does the following * Sets the current device for the current scope - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream - - If constructed from the `CUDAProduct`, uses the same device and possibly the same CUDA stream as was used to produce the `CUDAProduct` + - If constructed from the `cms::cuda::Product`, uses the same device and possibly the same CUDA stream as was used to produce the `cms::cuda::Product` * The CUDA stream is reused if this producer is the first consumer - of the `CUDAProduct`, otherwise a new CUDA stream is created. + of the `cms::cuda::Product`, otherwise a new CUDA stream is created. This approach is simple compromise to automatically express the work of parallel producers in different CUDA streams, and at the same time allow a chain of producers to queue their work to the same CUDA stream. * Gives access to the CUDA stream the algorithm should use to queue asynchronous work -* `CUDAScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor) +* `cms::cuda::ScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor) * [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) -* Needed to get `CUDAProduct` from the event - * `CUDAScopedContextProduce` is needed to put `CUDAProduct` to the event +* Needed to get `cms::cuda::Product` from the event + * `cms::cuda::ScopedContextProduce` is needed to put `cms::cuda::Product` to the event In case of multiple input products, from possibly different CUDA streams and/or CUDA devices, this approach gives the developer full @@ -673,13 +673,13 @@ control in which of them the kernels of the algorithm should be run. #### Getting input -The real product (`T`) can be obtained from `CUDAProduct` only with +The real product (`T`) can be obtained from `cms::cuda::Product` only with the help of -`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. +`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`. ```cpp -// From CUDAProduct -CUDAProduct cclus = iEvent.get(srcToken_); +// From cms::cuda::Product +cms::cuda::Product cclus = iEvent.get(srcToken_); GPUClusters const& clus = ctx.get(cclus); // Directly from Event @@ -697,7 +697,7 @@ It is usually best to wrap the CUDA kernel calls to a separate class, and then call methods of that class from the EDProducer. The only requirement is that the CUDA stream where to queue the operations should be the one from the -`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. +`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`. ```cpp gpuAlgo.makeClustersAsync(..., ctx.stream()); @@ -710,25 +710,25 @@ CUDA events and `cudaStreamWaitEvent()`. 
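To make the above concrete, a wrapper class of this kind could look roughly like the following sketch (the class name, method name, and kernel are purely illustrative, not an actual algorithm from this package). The one requirement it demonstrates is that all work is queued to the CUDA stream handed in by the caller, and nothing inside the class synchronizes on it.

```cpp
__global__ void scaleKernel(float const* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = 2.f * in[i];
  }
}

class ProducerGPUAlgo {
public:
  // Queues the kernel asynchronously to the CUDA stream obtained from
  // the ScopedContext*; no cudaStreamSynchronize() is called here.
  void makeClustersAsync(float const* d_in, float* d_out, int n, cudaStream_t stream) const {
    constexpr int threadsPerBlock = 256;
    int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    scaleKernel<<<blocks, threadsPerBlock, 0, stream>>>(d_in, d_out, n);
  }
};
```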
#### Putting output -The GPU data needs to be wrapped to `CUDAProduct` template with -`CUDAScopedContextProduce::wrap()` or `CUDAScopedContextProduce::emplace()` +The GPU data needs to be wrapped to `cms::cuda::Product` template with +`cms::cuda::ScopedContextProduce::wrap()` or `cms::cuda::ScopedContextProduce::emplace()` ```cpp GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream()); -std::unique_ptr> ret = ctx.wrap(clusters); +std::unique_ptr> ret = ctx.wrap(clusters); iEvent.put(std::move(ret)); // or with one line iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); // or avoid one unique_ptr with emplace -edm::PutTokenT> putToken_ = produces>(); // in constructor +edm::PutTokenT> putToken_ = produces>(); // in constructor ... ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream())); ``` This step is needed to -* store the current device and CUDA stream into `CUDAProduct` +* store the current device and CUDA stream into `cms::cuda::Product` * record the CUDA event needed for CUDA stream synchronization #### `ExternalWork` extension @@ -745,43 +745,43 @@ needed for downstream DQM, or queueing more asynchronous work. With `ExternalWork` an `acquire()` method needs to be implemented that gets an `edm::WaitingTaskWithArenaHolder` parameter. The `edm::WaitingTaskWithArenaHolder` should then be passed to the -constructor of `CUDAScopedContextAcquire` along +constructor of `cms::cuda::ScopedContextAcquire` along ```cpp void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAProduct const& cclus = iEvent.get(token_); - CUDAScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well + cms::cuda::Product const& cclus = iEvent.get(token_); + cms::cuda::ScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well ... ``` -When constructed this way, `CUDAScopedContextAcquire` registers a +When constructed this way, `cms::cuda::ScopedContextAcquire` registers a callback function to the CUDA stream in its destructor to call `waitingTaskHolder.doneWaiting()`. -A GPU->GPU producer needs a `CUDAScopedContext` also in its +A GPU->GPU producer needs a `cms::cuda::ScopedContext` also in its `produce()`. The device and CUDA stream are transferred via -`CUDAContextState` member variable: +`cms::cuda::ContextState` member variable: ```cpp class FooProducerCUDA ... { ... - CUDAContextState ctxState_; + cms::cuda::ContextState ctxState_; }; -void acquire(...) { +void FooProducerCUDA::acquire(...) { ... - FooProducerCUDA::CUDAScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_}; + cms::cuda::ScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_}; ... } -void produce(...( { +void FooProducerCUDA::produce(...( { ... - FooProducerCUDA::CUDAScopedContextProduce ctx{ctxState_}; + cms::cuda::ScopedContextProduce ctx{ctxState_}; } ``` -The `CUDAScopedContextAcquire` saves its state to the `ctxState_` in -the destructor, and `CUDAScopedContextProduce` then restores the +The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in +the destructor, and `cms::cuda::ScopedContextProduce` then restores the context. #### Module-internal chain of CPU and GPU tasks @@ -796,7 +796,7 @@ can also be used to re-run (part of) the GPU work. 
The "next tasks" to run are essentially structured as a stack, such that -- `CUDAScopedContextAcquire`/`CUDAScopedContextTask::pushNextTask()` +- `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask::pushNextTask()` pushes a new functor on top of the stack - Completion of both the asynchronous work and the queueing function pops the top task of the stack and enqueues it (so that TBB @@ -815,7 +815,7 @@ to run next (following the example of the previous section) ```cpp void FooProducerCUDA::acquire(...) { ... - ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) { ... }); ... @@ -823,7 +823,7 @@ void FooProducerCUDA::acquire(...) { ``` In this case the `ctx`argument to the function is a -`CUDAScopedContexTask` object constructed by the TBB task calling the +`cms::cuda::ScopedContexTask` object constructed by the TBB task calling the user-given function. It follows that the current device and CUDA stream have been set up already. The `pushNextTask()` can be called many times. On each invocation the `pushNextTask()` pushes a new task @@ -831,13 +831,13 @@ on top of the stack (i.e. in front of the chain). It follows that in ```cpp void FooProducerCUDA::acquire(...) { ... - ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) { ... // function 1 }); - ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) { ... // function 2 }); - ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) { ... // function 3 }); ... @@ -850,7 +850,7 @@ service) in these intermediate tasks. In the near future memory allocations etc. will be made possible by taking them out from the `CUDAService`. -The `CUDAScopedContextAcquire`/`CUDAScopedContextTask` have also a +The `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask` have also a more generic member function, `replaceWaitingTaskHolder()`, that can be used to just replace the currently-hold `edm::WaitingTaskWithArenaHolder` (that will get notified by the @@ -869,7 +869,7 @@ the `ExternalWork` needs to be used along * In `produce()` * If needed, read additional CPU products (e.g. from `edm::Ref`s) * Reformat data back to legacy data formats - * Note: `CUDAScopedContextProduce` is **not** needed in `produce()` + * Note: `cms::cuda::ScopedContextProduce` is **not** needed in `produce()` #### Synchronizing between CUDA streams @@ -877,15 +877,15 @@ In case the producer needs input data that were produced in two (or more) CUDA streams, these streams have to be synchronized. Here this synchronization is achieved with CUDA events. -Each `CUDAProduct` constains also a CUDA event object. The call to -`CUDAScopedContextProduce::wrap()` will *record* the event in the CUDA +Each `cms::cuda::Product` constains also a CUDA event object. The call to +`cms::cuda::ScopedContextProduce::wrap()` will *record* the event in the CUDA stream. This means that when all work queued to the CUDA stream up to that point has been finished, the CUDA event becomes *occurred*. 
Then, in -`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, -if the `CUDAProduct` to get from has a different CUDA stream than +`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`, +if the `cms::cuda::Product` to get from has a different CUDA stream than the -`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`, +`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`, `cudaStreamWaitEvent(stream, event)` is called. This means that all subsequent work queued to the CUDA stream will wait for the CUDA event to become occurred. Therefore this subsequent work can assume that the @@ -901,12 +901,12 @@ pattern. 2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory 3. The wrapper should have a function returning the payload on the device memory. The function should transfer the data to the device - asynchronously with the help of `CUDAESProduct`. + asynchronously with the help of `cms::cuda::ESProduct`. #### Example ```cpp -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" // Declare the struct for the payload to be transferred. Here the // example is an array with (potentially) dynamic size. Note that all of @@ -948,7 +948,7 @@ private: // Helper that takes care of complexity of transferring the data to // multiple devices - CUDAESProduct gpuData_; + cms::cuda::ESProduct gpuData_; }; ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) { @@ -957,7 +957,7 @@ ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& } ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const { - // CUDAESProduct essentially holds an array of GPUData objects, + // cms::cuda::ESProduct essentially holds an array of GPUData objects, // one per device. If the data have already been transferred to the // current device (or the transfer has been queued), the helper just // returns a reference to that GPUData object. Otherwise, i.e. data are diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextState.h b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h deleted file mode 100644 index b3c20dcb73159..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/CUDAContextState.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h -#define HeterogeneousCore_CUDACore_CUDAContextState_h - -#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" - -#include - -/** - * The purpose of this class is to deliver the device and CUDA stream - * information from ExternalWork's acquire() to producer() via a - * member/StreamCache variable. 
- */ -class CUDAContextState { -public: - CUDAContextState() = default; - ~CUDAContextState() = default; - - CUDAContextState(const CUDAContextState&) = delete; - CUDAContextState& operator=(const CUDAContextState&) = delete; - CUDAContextState(CUDAContextState&&) = delete; - CUDAContextState& operator=(CUDAContextState&& other) = delete; - -private: - friend class CUDAScopedContextAcquire; - friend class CUDAScopedContextProduce; - friend class CUDAScopedContextTask; - - void set(int device, cudautils::SharedStreamPtr stream) { - throwIfStream(); - device_ = device; - stream_ = std::move(stream); - } - - int device() const { return device_; } - - const cudautils::SharedStreamPtr& streamPtr() const { - throwIfNoStream(); - return stream_; - } - - cudautils::SharedStreamPtr releaseStreamPtr() { - throwIfNoStream(); - // This function needs to effectively reset stream_ (i.e. stream_ - // must be empty after this function). This behavior ensures that - // the SharedStreamPtr is not hold for inadvertedly long (i.e. to - // the next event), and is checked at run time. - return std::move(stream_); - } - - void throwIfStream() const; - void throwIfNoStream() const; - - cudautils::SharedStreamPtr stream_; - int device_; -}; - -#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h deleted file mode 100644 index 5ef2399f96ea0..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h +++ /dev/null @@ -1,99 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_CUDAESProduct_h -#define HeterogeneousCore_CUDACore_CUDAESProduct_h - -#include -#include -#include -#include - -#include "FWCore/Utilities/interface/thread_safety_macros.h" -#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" -#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" -#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" - -template -class CUDAESProduct { -public: - CUDAESProduct() : gpuDataPerDevice_(cudautils::deviceCount()) { - for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { - gpuDataPerDevice_[i].m_event = cudautils::getEventCache().get(); - } - } - ~CUDAESProduct() = default; - - // transferAsync should be a function of (T&, cudaStream_t) - // which enqueues asynchronous transfers (possibly kernels as well) - // to the CUDA stream - template - const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const { - auto device = cudautils::currentDevice(); - - auto& data = gpuDataPerDevice_[device]; - - // If GPU data has already been filled, we can return it - // immediately - if (not data.m_filled.load()) { - // It wasn't, so need to fill it - std::scoped_lock lk{data.m_mutex}; - - if (data.m_filled.load()) { - // Other thread marked it filled while we were locking the mutex, so we're free to return it - return data.m_data; - } - - if (data.m_fillingStream != nullptr) { - // Someone else is filling - - // Check first if the recorded event has occurred - if (cudautils::eventWorkHasCompleted(data.m_event.get())) { - // It was, so data is accessible from all CUDA streams on - // the device. 
Set the 'filled' for all subsequent calls and - // return the value - auto should_be_false = data.m_filled.exchange(true); - assert(not should_be_false); - data.m_fillingStream = nullptr; - } else if (data.m_fillingStream != cudaStream) { - // Filling is still going on. For other CUDA stream, add - // wait on the CUDA stream and return the value. Subsequent - // work queued on the stream will wait for the event to - // occur (i.e. transfer to finish). - cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0), - "Failed to make a stream to wait for an event"); - } - // else: filling is still going on. But for the same CUDA - // stream (which would be a bit strange but fine), we can just - // return as all subsequent work should be enqueued to the - // same CUDA stream (or stream to be explicitly synchronized - // by the caller) - } else { - // Now we can be sure that the data is not yet on the GPU, and - // this thread is the first to try that. - transferAsync(data.m_data, cudaStream); - assert(data.m_fillingStream == nullptr); - data.m_fillingStream = cudaStream; - // Now the filling has been enqueued to the cudaStream, so we - // can return the GPU data immediately, since all subsequent - // work must be either enqueued to the cudaStream, or the cudaStream - // must be synchronized by the caller - } - } - - return data.m_data; - } - -private: - struct Item { - mutable std::mutex m_mutex; - CMS_THREAD_GUARD(m_mutex) mutable cudautils::SharedEventPtr m_event; - // non-null if some thread is already filling (cudaStream_t is just a pointer) - CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr; - mutable std::atomic m_filled = false; // easy check if data has been filled already or not - CMS_THREAD_GUARD(m_mutex) mutable T m_data; - }; - - std::vector gpuDataPerDevice_; -}; - -#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h deleted file mode 100644 index 01533c1a5d222..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ /dev/null @@ -1,243 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_CUDAScopedContext_h -#define HeterogeneousCore_CUDACore_CUDAScopedContext_h - -#include - -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Utilities/interface/EDGetToken.h" -#include "FWCore/Utilities/interface/EDPutToken.h" -#include "FWCore/Utilities/interface/StreamID.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" -#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" -#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" - -namespace cudatest { - class TestCUDAScopedContext; -} - -namespace impl { - // This class is intended to be derived by other CUDAScopedContext*, not for general use - class CUDAScopedContextBase { - public: - int device() const { return currentDevice_; } - - // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the CUDAScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. 
- cudaStream_t stream() const { return stream_.get(); } - const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } - - protected: - // The constructors set the current device, but the device - // is not set back to the previous value at the destructor. This - // should be sufficient (and tiny bit faster) as all CUDA API - // functions relying on the current device should be called from - // the scope where this context is. The current device doesn't - // really matter between modules (or across TBB tasks). - explicit CUDAScopedContextBase(edm::StreamID streamID); - - explicit CUDAScopedContextBase(const CUDAProductBase& data); - - explicit CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream); - - private: - int currentDevice_; - cudautils::SharedStreamPtr stream_; - }; - - class CUDAScopedContextGetterBase : public CUDAScopedContextBase { - public: - template - const T& get(const CUDAProduct& data) { - synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); - return data.data_; - } - - template - const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { - return get(iEvent.get(token)); - } - - protected: - template - CUDAScopedContextGetterBase(Args&&... args) : CUDAScopedContextBase(std::forward(args)...) {} - - void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); - }; - - class CUDAScopedContextHolderHelper { - public: - CUDAScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : waitingTaskHolder_{std::move(waitingTaskHolder)} {} - - template - void pushNextTask(F&& f, CUDAContextState const* state); - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - waitingTaskHolder_ = std::move(waitingTaskHolder); - } - - void enqueueCallback(int device, cudaStream_t stream); - - private: - edm::WaitingTaskWithArenaHolder waitingTaskHolder_; - }; -} // namespace impl - -/** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. 
- */ -class CUDAScopedContextAcquire : public impl::CUDAScopedContextGetterBase { -public: - /// Constructor to create a new CUDA stream (no need for context beyond acquire()) - explicit CUDAScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to create a new CUDA stream, and the context is needed after acquire() - explicit CUDAScopedContextAcquire(edm::StreamID streamID, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - CUDAContextState& state) - : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) - explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() - explicit CUDAScopedContextAcquire(const CUDAProductBase& data, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - CUDAContextState& state) - : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - ~CUDAScopedContextAcquire(); - - template - void pushNextTask(F&& f) { - if (contextState_ == nullptr) - throwNoState(); - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - -private: - void throwNoState(); - - impl::CUDAScopedContextHolderHelper holderHelper_; - CUDAContextState* contextState_ = nullptr; -}; - -/** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ -class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { -public: - /// Constructor to create a new CUDA stream (non-ExternalWork module) - explicit CUDAScopedContextProduce(edm::StreamID streamID) : CUDAScopedContextGetterBase(streamID) {} - - /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) - explicit CUDAScopedContextProduce(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} - - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit CUDAScopedContextProduce(CUDAContextState& state) - : CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} - - /// Record the CUDA event, all asynchronous work must have been queued before the destructor - ~CUDAScopedContextProduce(); - - template - std::unique_ptr> wrap(T data) { - // make_unique doesn't work because of private constructor - return std::unique_ptr>(new CUDAProduct(device(), streamPtr(), event_, std::move(data))); - } - - template - auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... 
args) { - return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); - } - -private: - friend class cudatest::TestCUDAScopedContext; - - // This construcor is only meant for testing - explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) - : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} - - // create the CUDA Event upfront to catch possible errors from its creation - cudautils::SharedEventPtr event_ = cudautils::getEventCache().get(); -}; - -/** - * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * and enforce that those get done in a proper way in RAII fashion. - */ -class CUDAScopedContextTask : public impl::CUDAScopedContextBase { -public: - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit CUDAScopedContextTask(CUDAContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : CUDAScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards - holderHelper_{std::move(waitingTaskHolder)}, - contextState_{state} {} - - ~CUDAScopedContextTask(); - - template - void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - -private: - impl::CUDAScopedContextHolderHelper holderHelper_; - CUDAContextState const* contextState_; -}; - -/** - * The aim of this class is to do necessary per-event "initialization" in analyze() - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ -/** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. 
- */ -class CUDAScopedContextAnalyze : public impl::CUDAScopedContextGetterBase { -public: - /// Constructor to (possibly) re-use a CUDA stream - explicit CUDAScopedContextAnalyze(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} -}; - -namespace impl { - template - void CUDAScopedContextHolderHelper::pushNextTask(F&& f, CUDAContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(tbb::task::allocate_root(), - std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(CUDAScopedContextTask{state, std::move(h)}); - })}); - } -} // namespace impl - -#endif diff --git a/HeterogeneousCore/CUDACore/interface/ContextState.h b/HeterogeneousCore/CUDACore/interface/ContextState.h new file mode 100644 index 0000000000000..a2ab42363a7b5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/ContextState.h @@ -0,0 +1,61 @@ +#ifndef HeterogeneousCore_CUDACore_ContextState_h +#define HeterogeneousCore_CUDACore_ContextState_h + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +#include + +namespace cms { + namespace cuda { + /** + * The purpose of this class is to deliver the device and CUDA stream + * information from ExternalWork's acquire() to producer() via a + * member/StreamCache variable. + */ + class ContextState { + public: + ContextState() = default; + ~ContextState() = default; + + ContextState(const ContextState&) = delete; + ContextState& operator=(const ContextState&) = delete; + ContextState(ContextState&&) = delete; + ContextState& operator=(ContextState&& other) = delete; + + private: + friend class ScopedContextAcquire; + friend class ScopedContextProduce; + friend class ScopedContextTask; + + void set(int device, cudautils::SharedStreamPtr stream) { + throwIfStream(); + device_ = device; + stream_ = std::move(stream); + } + + int device() const { return device_; } + + const cudautils::SharedStreamPtr& streamPtr() const { + throwIfNoStream(); + return stream_; + } + + cudautils::SharedStreamPtr releaseStreamPtr() { + throwIfNoStream(); + // This function needs to effectively reset stream_ (i.e. stream_ + // must be empty after this function). This behavior ensures that + // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the next event), and is checked at run time. 
+ return std::move(stream_); + } + + void throwIfStream() const; + void throwIfNoStream() const; + + cudautils::SharedStreamPtr stream_; + int device_; + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h new file mode 100644 index 0000000000000..40c7ebca15a9b --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/ESProduct.h @@ -0,0 +1,103 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAESProduct_h +#define HeterogeneousCore_CUDACore_CUDAESProduct_h + +#include +#include +#include +#include + +#include "FWCore/Utilities/interface/thread_safety_macros.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" + +namespace cms { + namespace cuda { + template + class ESProduct { + public: + ESProduct() : gpuDataPerDevice_(cudautils::deviceCount()) { + for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { + gpuDataPerDevice_[i].m_event = cudautils::getEventCache().get(); + } + } + ~ESProduct() = default; + + // transferAsync should be a function of (T&, cudaStream_t) + // which enqueues asynchronous transfers (possibly kernels as well) + // to the CUDA stream + template + const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const { + auto device = cudautils::currentDevice(); + + auto& data = gpuDataPerDevice_[device]; + + // If GPU data has already been filled, we can return it + // immediately + if (not data.m_filled.load()) { + // It wasn't, so need to fill it + std::scoped_lock lk{data.m_mutex}; + + if (data.m_filled.load()) { + // Other thread marked it filled while we were locking the mutex, so we're free to return it + return data.m_data; + } + + if (data.m_fillingStream != nullptr) { + // Someone else is filling + + // Check first if the recorded event has occurred + if (cudautils::eventWorkHasCompleted(data.m_event.get())) { + // It was, so data is accessible from all CUDA streams on + // the device. Set the 'filled' for all subsequent calls and + // return the value + auto should_be_false = data.m_filled.exchange(true); + assert(not should_be_false); + data.m_fillingStream = nullptr; + } else if (data.m_fillingStream != cudaStream) { + // Filling is still going on. For other CUDA stream, add + // wait on the CUDA stream and return the value. Subsequent + // work queued on the stream will wait for the event to + // occur (i.e. transfer to finish). + cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0), + "Failed to make a stream to wait for an event"); + } + // else: filling is still going on. But for the same CUDA + // stream (which would be a bit strange but fine), we can just + // return as all subsequent work should be enqueued to the + // same CUDA stream (or stream to be explicitly synchronized + // by the caller) + } else { + // Now we can be sure that the data is not yet on the GPU, and + // this thread is the first to try that. 
+ transferAsync(data.m_data, cudaStream); + assert(data.m_fillingStream == nullptr); + data.m_fillingStream = cudaStream; + // Now the filling has been enqueued to the cudaStream, so we + // can return the GPU data immediately, since all subsequent + // work must be either enqueued to the cudaStream, or the cudaStream + // must be synchronized by the caller + } + } + + return data.m_data; + } + + private: + struct Item { + mutable std::mutex m_mutex; + CMS_THREAD_GUARD(m_mutex) mutable cudautils::SharedEventPtr m_event; + // non-null if some thread is already filling (cudaStream_t is just a pointer) + CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr; + mutable std::atomic m_filled = false; // easy check if data has been filled already or not + CMS_THREAD_GUARD(m_mutex) mutable T m_data; + }; + + std::vector gpuDataPerDevice_; + }; + } // namespace cuda +} // namespace cms + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContext.h b/HeterogeneousCore/CUDACore/interface/ScopedContext.h new file mode 100644 index 0000000000000..c13c0cc700628 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/ScopedContext.h @@ -0,0 +1,242 @@ +#ifndef HeterogeneousCore_CUDACore_ScopedContext_h +#define HeterogeneousCore_CUDACore_ScopedContext_h + +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/EDPutToken.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/ContextState.h" +#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +namespace cms { + namespace cudatest { + class TestScopedContext; + } + + namespace cuda { + + namespace impl { + // This class is intended to be derived by other ScopedContext*, not for general use + class ScopedContextBase { + public: + int device() const { return currentDevice_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + protected: + // The constructors set the current device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). 
+ explicit ScopedContextBase(edm::StreamID streamID); + + explicit ScopedContextBase(const ProductBase& data); + + explicit ScopedContextBase(int device, cudautils::SharedStreamPtr stream); + + private: + int currentDevice_; + cudautils::SharedStreamPtr stream_; + }; + + class ScopedContextGetterBase : public ScopedContextBase { + public: + template + const T& get(const Product& data) { + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} + + void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); + }; + + class ScopedContextHolderHelper { + public: + ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + + template + void pushNextTask(F&& f, ContextState const* state); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(int device, cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + }; + } // namespace impl + + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextAcquire : public impl::ScopedContextGetterBase { + public: + /// Constructor to create a new CUDA stream (no need for context beyond acquire()) + explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to create a new CUDA stream, and the context is needed after acquire() + explicit ScopedContextAcquire(edm::StreamID streamID, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + ContextState& state) + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) + explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() + explicit ScopedContextAcquire(const ProductBase& data, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + ContextState& state) + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + ~ScopedContextAcquire(); + + template + void pushNextTask(F&& f) { + if (contextState_ == nullptr) + throwNoState(); + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + private: + void throwNoState(); + + impl::ScopedContextHolderHelper holderHelper_; + ContextState* contextState_ = nullptr; + }; + + /** + * The aim of 
this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextProduce : public impl::ScopedContextGetterBase { + public: + /// Constructor to create a new CUDA stream (non-ExternalWork module) + explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {} + + /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) + explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} + + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit ScopedContextProduce(ContextState& state) + : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + + /// Record the CUDA event, all asynchronous work must have been queued before the destructor + ~ScopedContextProduce(); + + template + std::unique_ptr> wrap(T data) { + // make_unique doesn't work because of private constructor + return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data))); + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); + } + + private: + friend class cudatest::TestScopedContext; + + // This construcor is only meant for testing + explicit ScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) + : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} + + // create the CUDA Event upfront to catch possible errors from its creation + cudautils::SharedEventPtr event_ = cudautils::getEventCache().get(); + }; + + /** + * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextTask : public impl::ScopedContextBase { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards + holderHelper_{std::move(waitingTaskHolder)}, + contextState_{state} {} + + ~ScopedContextTask(); + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + private: + impl::ScopedContextHolderHelper holderHelper_; + ContextState const* contextState_; + }; + + /** + * The aim of this class is to do necessary per-event "initialization" in analyze() + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
+ */ + class ScopedContextAnalyze : public impl::ScopedContextGetterBase { + public: + /// Constructor to (possibly) re-use a CUDA stream + explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} + }; + + namespace impl { + template + void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(ScopedContextTask{state, std::move(h)}); + })}); + } + } // namespace impl + } // namespace cuda +} // namespace cms + +#endif diff --git a/HeterogeneousCore/CUDACore/src/CUDAContextState.cc b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc deleted file mode 100644 index bcdbae89d9094..0000000000000 --- a/HeterogeneousCore/CUDACore/src/CUDAContextState.cc +++ /dev/null @@ -1,14 +0,0 @@ -#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" -#include "FWCore/Utilities/interface/Exception.h" - -void CUDAContextState::throwIfStream() const { - if (stream_) { - throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state"; - } -} - -void CUDAContextState::throwIfNoStream() const { - if (not stream_) { - throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state"; - } -} diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc deleted file mode 100644 index 54dcdfe7548b6..0000000000000 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ /dev/null @@ -1,117 +0,0 @@ -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" - -#include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "FWCore/Utilities/interface/Exception.h" -#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -#include "chooseCUDADevice.h" - -namespace { - struct CallbackData { - edm::WaitingTaskWithArenaHolder holder; - int device; - }; - - void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { - std::unique_ptr guard{reinterpret_cast(data)}; - edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; - int device = guard->device; - if (status == cudaSuccess) { - LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " - << streamId; - waitingTaskHolder.doneWaiting(nullptr); - } else { - // wrap the exception in a try-catch block to let GDB "catch throw" break on it - try { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device - << " error " << error << ": " << message; - } catch (cms::Exception&) { - waitingTaskHolder.doneWaiting(std::current_exception()); - } - } - } -} // namespace - -namespace impl { - CUDAScopedContextBase::CUDAScopedContextBase(edm::StreamID streamID) - : currentDevice_(cudacore::chooseCUDADevice(streamID)) { - cudaCheck(cudaSetDevice(currentDevice_)); - stream_ = cudautils::getStreamCache().get(); - } - - CUDAScopedContextBase::CUDAScopedContextBase(const CUDAProductBase& data) : currentDevice_(data.device()) { - cudaCheck(cudaSetDevice(currentDevice_)); - if 
(data.mayReuseStream()) { - stream_ = data.streamPtr(); - } else { - stream_ = cudautils::getStreamCache().get(); - } - } - - CUDAScopedContextBase::CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream) - : currentDevice_(device), stream_(std::move(stream)) { - cudaCheck(cudaSetDevice(currentDevice_)); - } - - //////////////////// - - void CUDAScopedContextGetterBase::synchronizeStreams(int dataDevice, - cudaStream_t dataStream, - bool available, - cudaEvent_t dataEvent) { - if (dataDevice != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... - throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; - } - - if (dataStream != stream()) { - // Different streams, need to synchronize - if (not available) { - // Event not yet occurred, so need to add synchronization - // here. Sychronization is done by making the CUDA stream to - // wait for an event, so all subsequent work in the stream - // will run only after the event has "occurred" (i.e. data - // product became available). - cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); - } - } - } - - void CUDAScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { - cudaCheck( - cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); - } -} // namespace impl - -//////////////////// - -CUDAScopedContextAcquire::~CUDAScopedContextAcquire() { - holderHelper_.enqueueCallback(device(), stream()); - if (contextState_) { - contextState_->set(device(), std::move(streamPtr())); - } -} - -void CUDAScopedContextAcquire::throwNoState() { - throw cms::Exception("LogicError") - << "Calling CUDAScopedContextAcquire::insertNextTask() requires CUDAScopedContextAcquire to be constructed with " - "CUDAContextState, but that was not the case"; -} - -//////////////////// - -CUDAScopedContextProduce::~CUDAScopedContextProduce() { - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. 
- cudaEventRecord(event_.get(), stream()); -} - -//////////////////// - -CUDAScopedContextTask::~CUDAScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } diff --git a/HeterogeneousCore/CUDACore/src/ContextState.cc b/HeterogeneousCore/CUDACore/src/ContextState.cc new file mode 100644 index 0000000000000..0670f01d472f3 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/ContextState.cc @@ -0,0 +1,16 @@ +#include "HeterogeneousCore/CUDACore/interface/ContextState.h" +#include "FWCore/Utilities/interface/Exception.h" + +namespace cms::cuda { + void ContextState::throwIfStream() const { + if (stream_) { + throw cms::Exception("LogicError") << "Trying to set ContextState, but it already had a valid state"; + } + } + + void ContextState::throwIfNoStream() const { + if (not stream_) { + throw cms::Exception("LogicError") << "Trying to get ContextState, but it did not have a valid state"; + } + } +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc new file mode 100644 index 0000000000000..adf242a6c43b2 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc @@ -0,0 +1,118 @@ +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "chooseDevice.h" + +namespace { + struct CallbackData { + edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + LogTrace("ScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " + << streamId; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device + << " error " << error << ": " << message; + } catch (cms::Exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace cms::cuda { + namespace impl { + ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { + cudaCheck(cudaSetDevice(currentDevice_)); + stream_ = cudautils::getStreamCache().get(); + } + + ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { + cudaCheck(cudaSetDevice(currentDevice_)); + if (data.mayReuseStream()) { + stream_ = data.streamPtr(); + } else { + stream_ = cudautils::getStreamCache().get(); + } + } + + ScopedContextBase::ScopedContextBase(int device, cudautils::SharedStreamPtr stream) + : currentDevice_(device), stream_(std::move(stream)) { + cudaCheck(cudaSetDevice(currentDevice_)); + } + + //////////////////// + + void ScopedContextGetterBase::synchronizeStreams(int dataDevice, + cudaStream_t dataStream, + bool available, + cudaEvent_t dataEvent) { + if (dataDevice != device()) { + // Eventually replace with prefetch to current 
device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; + } + + if (dataStream != stream()) { + // Different streams, need to synchronize + if (not available) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). + cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); + } + } + } + + void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { + cudaCheck( + cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); + } + } // namespace impl + + //////////////////// + + ScopedContextAcquire::~ScopedContextAcquire() { + holderHelper_.enqueueCallback(device(), stream()); + if (contextState_) { + contextState_->set(device(), std::move(streamPtr())); + } + } + + void ScopedContextAcquire::throwNoState() { + throw cms::Exception("LogicError") + << "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " + "ContextState, but that was not the case"; + } + + //////////////////// + + ScopedContextProduce::~ScopedContextProduce() { + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + cudaEventRecord(event_.get(), stream()); + } + + //////////////////// + + ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h deleted file mode 100644 index bb09c302af7f5..0000000000000 --- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h -#define HeterogeneousCore_CUDACore_chooseCUDADevice_h - -#include "FWCore/Utilities/interface/StreamID.h" - -namespace cudacore { - int chooseCUDADevice(edm::StreamID id); -} - -#endif diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseDevice.cc similarity index 81% rename from HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc rename to HeterogeneousCore/CUDACore/src/chooseDevice.cc index 7e9ac2faed380..7312760be7d84 100644 --- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc +++ b/HeterogeneousCore/CUDACore/src/chooseDevice.cc @@ -1,10 +1,10 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "chooseCUDADevice.h" +#include "chooseDevice.h" -namespace cudacore { - int chooseCUDADevice(edm::StreamID id) { +namespace cms::cuda { + int chooseDevice(edm::StreamID id) { edm::Service cudaService; // For startes we "statically" assign the device based on @@ -15,4 +15,4 @@ namespace cudacore { // TODO: improve the "assignment" logic return id % cudaService->numberOfDevices(); } -} // namespace cudacore +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.h b/HeterogeneousCore/CUDACore/src/chooseDevice.h new file mode 100644 index 0000000000000..ab642325aaecf --- /dev/null +++ 
b/HeterogeneousCore/CUDACore/src/chooseDevice.h @@ -0,0 +1,10 @@ +#ifndef HeterogeneousCore_CUDACore_chooseDevice_h +#define HeterogeneousCore_CUDACore_chooseDevice_h + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace cms::cuda { + int chooseDevice(edm::StreamID id); +} + +#endif diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu index deeb444dc255b..b0a66e2b777e4 100644 --- a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -1,6 +1,6 @@ /** * The purpose of this test program is to ensure that the logic for - * CUDA event use in CUDAProduct and CUDAScopedContext + * CUDA event use in cms::cuda::Product and cms::cuda::ScopedContext */ #include diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu deleted file mode 100644 index 330e83dfd4960..0000000000000 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu +++ /dev/null @@ -1,13 +0,0 @@ -#include "test_CUDAScopedContextKernels.h" - -namespace { - __global__ void single_mul(int *d) { d[0] = d[0] * 2; } - - __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; } -} // namespace - -void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); } - -void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) { - join_add<<<1, 1, 0, stream>>>(d1, d2, d3); -} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h deleted file mode 100644 index 527a4ce71e1cb..0000000000000 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h -#define HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h - -#include - -void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream); -void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream); - -#endif diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc similarity index 69% rename from HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc rename to HeterogeneousCore/CUDACore/test/test_ScopedContext.cc index c0bb7656ba258..c97d08e29a52c 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc @@ -1,11 +1,11 @@ #include "catch.hpp" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "FWCore/Concurrency/interface/WaitingTask.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" @@ -14,74 +14,74 @@ #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" -#include "test_CUDAScopedContextKernels.h" +#include "test_ScopedContextKernels.h" -namespace cudatest { - class TestCUDAScopedContext { +namespace cms::cudatest { + class TestScopedContext { public: - static CUDAScopedContextProduce make(int dev, bool createEvent) { + static cuda::ScopedContextProduce make(int dev, bool createEvent) { cudautils::SharedEventPtr event; if (createEvent) { event = cudautils::getEventCache().get(); } - return CUDAScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); + return cuda::ScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); } }; -} // namespace cudatest +} // namespace cms::cudatest namespace { - std::unique_ptr> produce(int device, int* d, int* h) { - auto ctx = cudatest::TestCUDAScopedContext::make(device, true); + std::unique_ptr> produce(int device, int* d, int* h) { + auto ctx = cms::cudatest::TestScopedContext::make(device, true); cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream())); - testCUDAScopedContextKernels_single(d, ctx.stream()); + cms::cudatest::testScopedContextKernels_single(d, ctx.stream()); return ctx.wrap(d); } } // namespace -TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { +TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") { if (not cms::cudatest::testDevices()) { return; } constexpr int defaultDevice = 0; { - auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); + auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true); SECTION("Construct from device ID") { REQUIRE(cudautils::currentDevice() == defaultDevice); } - SECTION("Wrap T to CUDAProduct") { - std::unique_ptr> dataPtr = ctx.wrap(10); + SECTION("Wrap T to cms::cuda::Product") { + std::unique_ptr> dataPtr = ctx.wrap(10); REQUIRE(dataPtr.get() != nullptr); REQUIRE(dataPtr->device() == ctx.device()); REQUIRE(dataPtr->stream() == ctx.stream()); } - SECTION("Construct from from CUDAProduct") { - std::unique_ptr> dataPtr = ctx.wrap(10); + SECTION("Construct from from cms::cuda::Product") { + std::unique_ptr> dataPtr = ctx.wrap(10); const auto& data = *dataPtr; - CUDAScopedContextProduce ctx2{data}; + cms::cuda::ScopedContextProduce ctx2{data}; REQUIRE(cudautils::currentDevice() == data.device()); REQUIRE(ctx2.stream() == data.stream()); // Second use of a product should lead to new stream - CUDAScopedContextProduce ctx3{data}; + cms::cuda::ScopedContextProduce ctx3{data}; REQUIRE(cudautils::currentDevice() == data.device()); REQUIRE(ctx3.stream() != data.stream()); } - SECTION("Storing state in CUDAContextState") { - CUDAContextState ctxstate; + SECTION("Storing state in cms::cuda::ContextState") { + cms::cuda::ContextState ctxstate; { // acquire - std::unique_ptr> dataPtr = ctx.wrap(10); + std::unique_ptr> dataPtr = ctx.wrap(10); const auto& data = *dataPtr; edm::WaitingTaskWithArenaHolder dummy{ edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr) {})}; - CUDAScopedContextAcquire ctx2{data, std::move(dummy), ctxstate}; + cms::cuda::ScopedContextAcquire ctx2{data, std::move(dummy), ctxstate}; } { // produce - CUDAScopedContextProduce ctx2{ctxstate}; + cms::cuda::ScopedContextProduce ctx2{ctxstate}; REQUIRE(cudautils::currentDevice() == ctx.device()); REQUIRE(ctx2.stream() == ctx.stream()); } @@ -103,13 +103,13 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { REQUIRE(wprod1->stream() != wprod2->stream()); // Mimick a third producer "joining" the 
two streams - CUDAScopedContextProduce ctx2{*wprod1}; + cms::cuda::ScopedContextProduce ctx2{*wprod1}; auto prod1 = ctx2.get(*wprod1); auto prod2 = ctx2.get(*wprod2); auto d_a3 = cudautils::make_device_unique(nullptr); - testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); + cms::cudatest::testScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); cudaCheck(cudaStreamSynchronize(ctx2.stream())); REQUIRE(wprod2->isAvailable()); REQUIRE(cudautils::eventWorkHasCompleted(wprod2->event())); diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu new file mode 100644 index 0000000000000..b87f1e20a5f24 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu @@ -0,0 +1,17 @@ +#include "test_ScopedContextKernels.h" + +namespace { + __global__ void single_mul(int *d) { d[0] = d[0] * 2; } + + __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; } +} // namespace + +namespace cms { + namespace cudatest { + void testScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); } + + void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) { + join_add<<<1, 1, 0, stream>>>(d1, d2, d3); + } + } // namespace cudatest +} // namespace cms diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h new file mode 100644 index 0000000000000..dfc55682afc76 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h @@ -0,0 +1,13 @@ +#ifndef HeterogeneousCore_CUDACore_test_ScopedContextKernels_h +#define HeterogeneousCore_CUDACore_test_ScopedContextKernels_h + +#include + +namespace cms { + namespace cudatest { + void testScopedContextKernels_single(int *d, cudaStream_t stream); + void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream); + } // namespace cudatest +} // namespace cms + +#endif diff --git a/HeterogeneousCore/CUDATest/interface/CUDAThing.h b/HeterogeneousCore/CUDATest/interface/CUDAThing.h deleted file mode 100644 index f8559a4f86b41..0000000000000 --- a/HeterogeneousCore/CUDATest/interface/CUDAThing.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef HeterogeneousCore_CUDATest_CUDAThing_H -#define HeterogeneousCore_CUDATest_CUDAThing_H - -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" - -class CUDAThing { -public: - CUDAThing() = default; - explicit CUDAThing(cudautils::device::unique_ptr ptr) : ptr_(std::move(ptr)) {} - - const float *get() const { return ptr_.get(); } - -private: - cudautils::device::unique_ptr ptr_; - ; -}; - -#endif diff --git a/HeterogeneousCore/CUDATest/interface/Thing.h b/HeterogeneousCore/CUDATest/interface/Thing.h new file mode 100644 index 0000000000000..e492625002a64 --- /dev/null +++ b/HeterogeneousCore/CUDATest/interface/Thing.h @@ -0,0 +1,21 @@ +#ifndef HeterogeneousCore_CUDATest_Thing_H +#define HeterogeneousCore_CUDATest_Thing_H + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +namespace cms { + namespace cudatest { + class Thing { + public: + Thing() = default; + explicit Thing(cudautils::device::unique_ptr ptr) : ptr_(std::move(ptr)) {} + + const float *get() const { return ptr_.get(); } + + private: + cudautils::device::unique_ptr ptr_; + }; + } // namespace cudatest +} // namespace cms + +#endif diff --git 
a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc index e8d4ade41be01..d3cee9471f78c 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc @@ -6,10 +6,10 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" #include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" #include "TestCUDAAnalyzerGPUKernel.h" @@ -26,7 +26,7 @@ class TestCUDAAnalyzerGPU : public edm::global::EDAnalyzer<> { private: std::string const label_; - edm::EDGetTokenT> const srcToken_; + edm::EDGetTokenT> const srcToken_; double const minValue_; double const maxValue_; // the public interface is thread safe @@ -35,7 +35,7 @@ class TestCUDAAnalyzerGPU : public edm::global::EDAnalyzer<> { TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig) : label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))), + srcToken_(consumes>(iConfig.getParameter("src"))), minValue_(iConfig.getParameter("minValue")), maxValue_(iConfig.getParameter("maxValue")) { edm::Service cs; @@ -47,7 +47,7 @@ TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig) void TestCUDAAnalyzerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source of CUDAProduct."); + desc.add("src", edm::InputTag())->setComment("Source of cms::cuda::Product."); desc.add("minValue", -1e308); desc.add("maxValue", 1e308); descriptions.addWithDefaultLabel(desc); @@ -59,8 +59,8 @@ void TestCUDAAnalyzerGPU::analyze(edm::StreamID, edm::Event const& iEvent, edm:: << iEvent.id().event() << " stream " << iEvent.streamID(); auto const& in = iEvent.get(srcToken_); - CUDAScopedContextAnalyze ctx{in}; - CUDAThing const& input = ctx.get(in); + cms::cuda::ScopedContextAnalyze ctx{in}; + cms::cudatest::Thing const& input = ctx.get(in); gpuAlgo_->analyzeAsync(input.get(), ctx.stream()); edm::LogVerbatim("TestCUDAAnalyzerGPU") diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index 8f8979a25a273..6f92ac91dd922 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -5,9 +5,9 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" #include "TestCUDAProducerGPUKernel.h" @@ -22,23 +22,23 @@ class TestCUDAProducerGPU : public edm::global::EDProducer<> { private: std::string const 
label_; - edm::EDGetTokenT> const srcToken_; - edm::EDPutTokenT> const dstToken_; + edm::EDGetTokenT> const srcToken_; + edm::EDPutTokenT> const dstToken_; TestCUDAProducerGPUKernel const gpuAlgo_; }; TestCUDAProducerGPU::TestCUDAProducerGPU(edm::ParameterSet const& iConfig) : label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))), - dstToken_(produces>()) {} + srcToken_(consumes>(iConfig.getParameter("src"))), + dstToken_(produces>()) {} void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source of CUDAProduct."); + desc.add("src", edm::InputTag())->setComment("Source of cms::cuda::Product."); descriptions.addWithDefaultLabel(desc); descriptions.setComment( "This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first " - "algorithm in the chain of the GPU EDProducers. Produces CUDAProduct."); + "algorithm in the chain of the GPU EDProducers. Produces cms::cuda::Product."); } void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const { @@ -46,10 +46,10 @@ void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, ed << iEvent.id().event() << " stream " << iEvent.streamID(); auto const& in = iEvent.get(srcToken_); - CUDAScopedContextProduce ctx{in}; - CUDAThing const& input = ctx.get(in); + cms::cuda::ScopedContextProduce ctx{in}; + cms::cudatest::Thing const& input = ctx.get(in); - ctx.emplace(iEvent, dstToken_, CUDAThing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())}); + ctx.emplace(iEvent, dstToken_, cms::cudatest::Thing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())}); edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 383e15d0a96f3..710df90c5ff5d 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -6,12 +6,12 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ContextState.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" #include "TestCUDAProducerGPUKernel.h" @@ -30,18 +30,18 @@ class TestCUDAProducerGPUEW : public edm::stream::EDProducer private: std::string const label_; - edm::EDGetTokenT> const srcToken_; - edm::EDPutTokenT> const dstToken_; + edm::EDGetTokenT> const srcToken_; + edm::EDPutTokenT> const dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; - CUDAContextState ctxState_; + cms::cuda::ContextState ctxState_; cudautils::device::unique_ptr devicePtr_; 
cudautils::host::noncached::unique_ptr hostData_; }; TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(edm::ParameterSet const& iConfig) : label_{iConfig.getParameter("@module_label")}, - srcToken_{consumes>(iConfig.getParameter("src"))}, - dstToken_{produces>()} { + srcToken_{consumes>(iConfig.getParameter("src"))}, + dstToken_{produces>()} { edm::Service cs; if (cs->enabled()) { hostData_ = cudautils::make_host_noncached_unique(); @@ -56,7 +56,7 @@ void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& des "This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first " "algorithm in the chain of the GPU EDProducers, and that transfers some data from GPU to CPU and thus needs to " "synchronize GPU and CPU. The synchronization is implemented with the ExternalWork extension. Produces " - "CUDAProduct."); + "cms::cuda::Product."); } void TestCUDAProducerGPUEW::acquire(edm::Event const& iEvent, @@ -66,8 +66,8 @@ void TestCUDAProducerGPUEW::acquire(edm::Event const& iEvent, << iEvent.id().event() << " stream " << iEvent.streamID(); auto const& in = iEvent.get(srcToken_); - CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_}; - CUDAThing const& input = ctx.get(in); + cms::cuda::ScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_}; + cms::cudatest::Thing const& input = ctx.get(in); devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); // Mimick the need to transfer some of the GPU data back to CPU to @@ -84,7 +84,7 @@ void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, edm::EventSetup const& i << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << *hostData_; - CUDAScopedContextProduce ctx{ctxState_}; + cms::cuda::ScopedContextProduce ctx{ctxState_}; ctx.emplace(iEvent, dstToken_, std::move(devicePtr_)); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc index f3010c94b3d9c..12a8a82ca2865 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc @@ -9,12 +9,12 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ContextState.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" #include "TestCUDAProducerGPUKernel.h" @@ -34,21 +34,21 @@ class TestCUDAProducerGPUEWTask : public edm::stream::EDProducer> const srcToken_; - edm::EDPutTokenT> const dstToken_; + edm::EDGetTokenT> const srcToken_; + edm::EDPutTokenT> const dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; - CUDAContextState ctxState_; + cms::cuda::ContextState ctxState_; cudautils::device::unique_ptr devicePtr_; cudautils::host::noncached::unique_ptr hostData_; }; 
TestCUDAProducerGPUEWTask::TestCUDAProducerGPUEWTask(edm::ParameterSet const& iConfig) : label_{iConfig.getParameter("@module_label")}, - srcToken_{consumes>(iConfig.getParameter("src"))}, - dstToken_{produces>()} { + srcToken_{consumes>(iConfig.getParameter("src"))}, + dstToken_{produces>()} { edm::Service cs; if (cs->enabled()) { hostData_ = cudautils::make_host_noncached_unique(); @@ -65,7 +65,7 @@ void TestCUDAProducerGPUEWTask::fillDescriptions(edm::ConfigurationDescriptions& "alternating the transfers and kernel executions (e.g. to decide which kernel to run next based on a value from " "GPU). A synchronization between GPU and CPU is needed after each transfer. The synchronizations are implemented " "with the ExternalWork extension and explicit TBB tasks within the module. Produces " - "CUDAProduct."); + "cms::cuda::Product."); } void TestCUDAProducerGPUEWTask::acquire(edm::Event const& iEvent, @@ -75,9 +75,9 @@ void TestCUDAProducerGPUEWTask::acquire(edm::Event const& iEvent, << iEvent.id().event() << " stream " << iEvent.streamID(); auto const& in = iEvent.get(srcToken_); - CUDAScopedContextAcquire ctx{in, waitingTaskHolder, ctxState_}; + cms::cuda::ScopedContextAcquire ctx{in, waitingTaskHolder, ctxState_}; - CUDAThing const& input = ctx.get(in); + cms::cudatest::Thing const& input = ctx.get(in); devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); // Mimick the need to transfer some of the GPU data back to CPU to @@ -87,7 +87,7 @@ void TestCUDAProducerGPUEWTask::acquire(edm::Event const& iEvent, cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); // Push a task to run addSimpleWork() after the asynchronous work // (and acquire()) has finished instead of produce() - ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) { + ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](cms::cuda::ScopedContextTask ctx) { addSimpleWork(iev, istr, ctx); }); @@ -97,7 +97,7 @@ void TestCUDAProducerGPUEWTask::acquire(edm::Event const& iEvent, void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID, edm::StreamID streamID, - CUDAScopedContextTask& ctx) { + cms::cuda::ScopedContextTask& ctx) { if (*hostData_ < 13) { edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID @@ -105,7 +105,8 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID, cudaCheck( cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream())); - ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); }); + ctx.pushNextTask( + [eventID, streamID, this](cms::cuda::ScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); }); gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream()); edm::LogVerbatim("TestCUDAProducerGPUEWTask") << label_ << " TestCUDAProducerGPUEWTask::addSimpleWork end event " << eventID << " stream " << streamID; @@ -124,7 +125,7 @@ void TestCUDAProducerGPUEWTask::produce(edm::Event& iEvent, edm::EventSetup cons throw cms::Exception("Assert") << "Expecting 10th element to be 13, got " << *hostData_; } - CUDAScopedContextProduce ctx{ctxState_}; + cms::cuda::ScopedContextProduce ctx{ctxState_}; ctx.emplace(iEvent, dstToken_, std::move(devicePtr_)); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc 
b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index 25fad0abe9438..6549dabf1d9ff 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -5,9 +5,9 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" #include "TestCUDAProducerGPUKernel.h" @@ -22,19 +22,20 @@ class TestCUDAProducerGPUFirst : public edm::global::EDProducer<> { private: std::string const label_; - edm::EDPutTokenT> const dstToken_; + edm::EDPutTokenT> const dstToken_; TestCUDAProducerGPUKernel const gpuAlgo_; }; TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(edm::ParameterSet const& iConfig) - : label_(iConfig.getParameter("@module_label")), dstToken_{produces>()} {} + : label_(iConfig.getParameter("@module_label")), + dstToken_{produces>()} {} void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; descriptions.addWithDefaultLabel(desc); descriptions.setComment( "This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in " - "the chain of the GPU EDProducers. Produces CUDA."); + "the chain of the GPU EDProducers. Produces cms::cuda::Productcms::cudatest::Thing>."); } void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, @@ -43,7 +44,7 @@ void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - CUDAScopedContextProduce ctx{streamID}; + cms::cuda::ScopedContextProduce ctx{streamID}; cudautils::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); ctx.emplace(iEvent, dstToken_, std::move(output)); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 1d5456f329e0f..d1b3288b95199 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -5,10 +5,10 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "TestCUDAProducerGPUKernel.h" @@ -28,19 +28,19 @@ class TestCUDAProducerGPUtoCPU : public edm::stream::EDProducer> const srcToken_; + edm::EDGetTokenT> const srcToken_; edm::EDPutTokenT const dstToken_; cudautils::host::unique_ptr buffer_; }; TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(edm::ParameterSet const& 
iConfig) : label_{iConfig.getParameter("@module_label")}, - srcToken_{consumes>(iConfig.getParameter("src"))}, + srcToken_{consumes>(iConfig.getParameter("src"))}, dstToken_{produces()} {} void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source for CUDAProduct."); + desc.add("src", edm::InputTag())->setComment("Source for cms::cuda::Product."); descriptions.addWithDefaultLabel(desc); descriptions.setComment( "This EDProducer is part of the TestCUDAProducer* family. It models the GPU->CPU data transfer and formatting of " @@ -54,8 +54,8 @@ void TestCUDAProducerGPUtoCPU::acquire(edm::Event const& iEvent, << iEvent.id().event() << " stream " << iEvent.streamID(); auto const& in = iEvent.get(srcToken_); - CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder)}; - CUDAThing const& device = ctx.get(in); + cms::cuda::ScopedContextAcquire ctx{in, std::move(waitingTaskHolder)}; + cms::cudatest::Thing const& device = ctx.get(in); buffer_ = cudautils::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue in produce once finished diff --git a/HeterogeneousCore/CUDATest/src/classes.h b/HeterogeneousCore/CUDATest/src/classes.h index 33d9bba2bb9b2..9cd0cf05bdf04 100644 --- a/HeterogeneousCore/CUDATest/src/classes.h +++ b/HeterogeneousCore/CUDATest/src/classes.h @@ -1,3 +1,3 @@ #include "DataFormats/Common/interface/Wrapper.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" diff --git a/HeterogeneousCore/CUDATest/src/classes_def.xml b/HeterogeneousCore/CUDATest/src/classes_def.xml index bece1ece62a7b..74760047caf99 100644 --- a/HeterogeneousCore/CUDATest/src/classes_def.xml +++ b/HeterogeneousCore/CUDATest/src/classes_def.xml @@ -1,4 +1,4 @@ - - + + diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index a8d2e6ba21564..a7e4f16043a1f 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -2,9 +2,9 @@ #include "FWCore/TestProcessor/interface/TestProcessor.h" #include "FWCore/Utilities/interface/Exception.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/Thing.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -72,10 +72,10 @@ process.moduleToTest(process.toTest) SECTION("Produce") { edm::test::TestProcessor tester{config}; auto event = tester.test(); - auto prod = event.get >(); + auto prod = event.get >(); REQUIRE(prod->device() == defaultDevice); - auto ctx = CUDAScopedContextProduce(*prod); - const CUDAThing& thing = ctx.get(*prod); + auto ctx = cms::cuda::ScopedContextProduce(*prod); + const cms::cudatest::Thing& thing = ctx.get(*prod); const float* data = thing.get(); REQUIRE(data != nullptr); From cc3a9ee79816c9cb1a9a9092e54dda4de9a6feb5 Mon Sep 17 00:00:00 
2001 From: Matti Kortelainen Date: Wed, 8 Jan 2020 19:26:15 +0100 Subject: [PATCH 27/29] Propagate CUDAX->cms::cuda::X rename --- CUDADataFormats/BeamSpot/src/classes.h | 2 +- CUDADataFormats/BeamSpot/src/classes_def.xml | 4 +-- CUDADataFormats/SiPixelCluster/src/classes.h | 2 +- .../SiPixelCluster/src/classes_def.xml | 4 +-- CUDADataFormats/SiPixelDigi/src/classes.h | 2 +- .../SiPixelDigi/src/classes_def.xml | 8 ++--- CUDADataFormats/Track/src/classes.h | 6 ++-- CUDADataFormats/Track/src/classes_def.xml | 4 +-- CUDADataFormats/TrackingRecHit/src/classes.h | 2 +- .../TrackingRecHit/src/classes_def.xml | 4 +-- .../Vertex/interface/ZVertexHeterogeneous.h | 4 +-- CUDADataFormats/Vertex/src/classes.h | 2 +- CUDADataFormats/Vertex/src/classes_def.xml | 2 +- .../SiPixelGainCalibrationForHLTGPU.h | 4 +-- .../plugins/SiPixelDigiErrorsSoAFromCUDA.cc | 11 +++---- .../plugins/SiPixelDigisSoAFromCUDA.cc | 10 +++---- .../CUDACore/interface/ESProduct.h | 4 +-- .../interface/EcalGainRatiosGPU.h | 4 +-- .../EcalRecAlgos/interface/EcalPedestalsGPU.h | 4 +-- .../interface/EcalPulseCovariancesGPU.h | 4 +-- .../interface/EcalPulseShapesGPU.h | 4 +-- .../interface/EcalSamplesCorrelationGPU.h | 4 +-- .../interface/EcalTimeBiasCorrectionsGPU.h | 4 +-- .../interface/EcalTimeCalibConstantsGPU.h | 4 +-- .../plugins/EcalUncalibRecHitProducerGPU.cc | 8 ++--- .../SiPixelFedCablingMapGPUWrapper.h | 6 ++-- .../plugins/SiPixelRawToClusterCUDA.cc | 22 +++++++------- .../SiPixelRecHits/interface/PixelCPEFast.h | 4 +-- .../plugins/SiPixelRecHitCUDA.cc | 29 +++++++++---------- .../plugins/SiPixelRecHitFromSOA.cc | 15 +++++----- .../plugins/SiPixelRecHitSoAFromLegacy.cc | 1 - .../plugins/PixelTrackDumpCUDA.cc | 14 ++++----- .../plugins/PixelTrackSoAFromCUDA.cc | 12 ++++---- .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 17 ++++++----- .../src/PixelVertexProducerCUDA.cc | 13 +++++---- .../src/PixelVertexSoAFromCUDA.cc | 10 +++---- .../plugins/BeamSpotToCUDA.cc | 10 +++---- .../ClusterTPAssociationProducerCUDA.cc | 26 ++++++++--------- .../TrackerHitAssociation/src/classes.h | 2 +- .../TrackerHitAssociation/src/classes_def.xml | 4 +-- .../test/ClusterTPCUDAdump.cc | 10 +++---- 41 files changed, 154 insertions(+), 152 deletions(-) diff --git a/CUDADataFormats/BeamSpot/src/classes.h b/CUDADataFormats/BeamSpot/src/classes.h index 62f990c0ba3b3..f79c8c9452c63 100644 --- a/CUDADataFormats/BeamSpot/src/classes.h +++ b/CUDADataFormats/BeamSpot/src/classes.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_BeamSpot_classes_h #define CUDADataFormats_BeamSpot_classes_h -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/BeamSpot/src/classes_def.xml b/CUDADataFormats/BeamSpot/src/classes_def.xml index 29a0eafa04005..198edeebe7c73 100644 --- a/CUDADataFormats/BeamSpot/src/classes_def.xml +++ b/CUDADataFormats/BeamSpot/src/classes_def.xml @@ -1,4 +1,4 @@ - - + + diff --git a/CUDADataFormats/SiPixelCluster/src/classes.h b/CUDADataFormats/SiPixelCluster/src/classes.h index 08d46244adc7d..0698cb103dab9 100644 --- a/CUDADataFormats/SiPixelCluster/src/classes.h +++ b/CUDADataFormats/SiPixelCluster/src/classes.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_SiPixelCluster_classes_h #define CUDADataFormats_SiPixelCluster_classes_h -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include 
"CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/SiPixelCluster/src/classes_def.xml b/CUDADataFormats/SiPixelCluster/src/classes_def.xml index ba0706ac4b8aa..70decb9f27df7 100644 --- a/CUDADataFormats/SiPixelCluster/src/classes_def.xml +++ b/CUDADataFormats/SiPixelCluster/src/classes_def.xml @@ -1,4 +1,4 @@ - - + + diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h index 41b135640b883..fca0811e4650f 100644 --- a/CUDADataFormats/SiPixelDigi/src/classes.h +++ b/CUDADataFormats/SiPixelDigi/src/classes.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_SiPixelDigi_classes_h #define CUDADataFormats_SiPixelDigi_classes_h -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml index 9d6816ed3b14c..ff775afdc2046 100644 --- a/CUDADataFormats/SiPixelDigi/src/classes_def.xml +++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml @@ -1,7 +1,7 @@ - - + + - - + + diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 699e45ede05d4..8a38f939bc68b 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -1,7 +1,7 @@ -#ifndef CUDADataFormats__src_classes_h -#define CUDADataFormats__src_classes_h +#ifndef CUDADataFormats_Track_src_classes_h +#define CUDADataFormats_Track__src_classes_h -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/Common/interface/ArrayShadow.h" diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index a4c2e766582dd..7c73c676ad13d 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,6 @@ - - + + diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index 90cfd0945d76e..d80226ec7a14b 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_SiPixelCluster_src_classes_h #define CUDADataFormats_SiPixelCluster_src_classes_h -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index 4e8325ddce87e..02b0eb37d157b 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,10 +1,10 @@ - + - + diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h index d12ed5f3d98de..aacfddc6fe7e2 100644 --- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h +++ 
b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h @@ -7,8 +7,8 @@ using ZVertexHeterogeneous = HeterogeneousSoA; #ifndef __CUDACC__ -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -using ZVertexCUDAProduct = CUDAProduct; +#include "CUDADataFormats/Common/interface/Product.h" +using ZVertexCUDAProduct = cms::cuda::Product; #endif #endif diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index f1144d1e3014e..e7fea871f7d39 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -2,7 +2,7 @@ #define CUDADataFormats__src_classes_h #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Wrapper.h" #endif diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml index c43814eb03def..ea633080af9af 100644 --- a/CUDADataFormats/Vertex/src/classes_def.xml +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -1,5 +1,5 @@ - + diff --git a/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h b/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h index 8bfefee5c3387..6fb487a244e71 100644 --- a/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h +++ b/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h @@ -2,7 +2,7 @@ #define CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h #include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" class SiPixelGainCalibrationForHLT; class SiPixelGainForHLTonGPU; @@ -26,7 +26,7 @@ class SiPixelGainCalibrationForHLTGPU { SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr; SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr; }; - CUDAESProduct gpuData_; + cms::cuda::ESProduct gpuData_; }; #endif // CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc index ad6c46082be8b..8dfe536bb1555 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -8,7 +8,7 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer { @@ -24,7 +24,7 @@ class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer> digiErrorGetToken_; + edm::EDGetTokenT> digiErrorGetToken_; edm::EDPutTokenT digiErrorPutToken_; 
cudautils::host::unique_ptr data_; @@ -33,7 +33,8 @@ class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer>(iConfig.getParameter("src"))), + : digiErrorGetToken_( + consumes>(iConfig.getParameter("src"))), digiErrorPutToken_(produces()) {} void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -46,7 +47,7 @@ void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Do the transfer in a CUDA stream parallel to the computation CUDA stream - CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 7794032154e98..a41fdf91fe978 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -8,7 +8,7 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer { @@ -24,7 +24,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; + edm::EDGetTokenT> digiGetToken_; edm::EDPutTokenT digiPutToken_; cudautils::host::unique_ptr pdigi_; @@ -36,7 +36,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer>(iConfig.getParameter("src"))), + : digiGetToken_(consumes>(iConfig.getParameter("src"))), digiPutToken_(produces()) {} void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -49,7 +49,7 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Do the transfer in a CUDA stream parallel to the computation CUDA stream - CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h index 40c7ebca15a9b..5f8cf17e137d0 100644 --- a/HeterogeneousCore/CUDACore/interface/ESProduct.h +++ b/HeterogeneousCore/CUDACore/interface/ESProduct.h @@ -1,5 +1,5 @@ -#ifndef HeterogeneousCore_CUDACore_CUDAESProduct_h -#define HeterogeneousCore_CUDACore_CUDAESProduct_h +#ifndef HeterogeneousCore_CUDACore_ESProduct_h +#define HeterogeneousCore_CUDACore_ESProduct_h #include #include diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h index 
41f02518c9fc8..e268e5d3d5c13 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h @@ -5,7 +5,7 @@ #ifndef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif #include @@ -37,7 +37,7 @@ class EcalGainRatiosGPU { std::vector> gain12Over6_; std::vector> gain6Over1_; - CUDAESProduct product_; + cms::cuda::ESProduct product_; #endif }; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h index 973501ed25a8d..420697dea6bda 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h @@ -5,7 +5,7 @@ #ifndef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif #include @@ -42,7 +42,7 @@ class EcalPedestalsGPU { std::vector> mean_x1_; std::vector> rms_x1_; - CUDAESProduct product_; + cms::cuda::ESProduct product_; #endif }; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h index 4bc347233e75d..b5b9271f6e65e 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h @@ -5,7 +5,7 @@ #ifndef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif #include @@ -35,7 +35,7 @@ class EcalPulseCovariancesGPU { std::vector const& valuesEB_; std::vector const& valuesEE_; - CUDAESProduct product_; + cms::cuda::ESProduct product_; #endif }; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h index 2d9d118b2c024..88893b626ce05 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h @@ -5,7 +5,7 @@ #ifndef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif #include @@ -35,7 +35,7 @@ class EcalPulseShapesGPU { std::vector const& valuesEB_; std::vector const& valuesEE_; - CUDAESProduct product_; + cms::cuda::ESProduct product_; #endif }; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h index 3c9ee6718a5f9..dac1ee041bfc5 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h @@ -5,7 +5,7 @@ #ifndef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif #include @@ -39,7 +39,7 @@ class EcalSamplesCorrelationGPU { std::vector const& EEG6SamplesCorrelation_; std::vector const& EEG1SamplesCorrelation_; - CUDAESProduct product_; + cms::cuda::ESProduct product_; #endif }; diff --git 
a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h index 93bc395e449a3..70af33b52f216 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h @@ -5,7 +5,7 @@ #ifndef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif #include @@ -44,7 +44,7 @@ class EcalTimeBiasCorrectionsGPU { std::vector const& EETimeCorrShiftBins_; #ifndef __CUDACC__ - CUDAESProduct product_; + cms::cuda::ESProduct product_; #endif }; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h index 2c6c3075dde8d..fd640e7c989b3 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h @@ -5,7 +5,7 @@ #ifndef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif #include @@ -38,7 +38,7 @@ class EcalTimeCalibConstantsGPU { std::vector const& valuesEB_; std::vector const& valuesEE_; - CUDAESProduct product_; + cms::cuda::ESProduct product_; #endif }; diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc index 786ba409525e6..a90cc1536c482 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc @@ -4,7 +4,7 @@ //#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -79,7 +79,7 @@ class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer> ebRecHits_{nullptr}, eeRecHits_{nullptr}; @@ -294,7 +294,7 @@ void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event, //DurationMeasurer timer{std::string{"acquire duration"}}; // raii - CUDAScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_}; + cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_}; // conditions setup.get().get(pedestalsHandle_); @@ -364,7 +364,7 @@ void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event, void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { //DurationMeasurer timer{std::string{"produce duration"}}; - CUDAScopedContextProduce ctx{cudaState_}; + cms::cuda::ScopedContextProduce ctx{cudaState_}; if (shouldTransferToHost_) { // rec hits objects were not originally member variables diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index 14a5d25504479..681354767a7a3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ 
b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -1,7 +1,7 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" @@ -44,13 +44,13 @@ class SiPixelFedCablingMapGPUWrapper { ~GPUData(); SiPixelFedCablingMapGPU *cablingMapDevice = nullptr; // pointer to struct in GPU }; - CUDAESProduct gpuData_; + cms::cuda::ESProduct gpuData_; struct ModulesToUnpack { ~ModulesToUnpack(); unsigned char *modToUnpDefault = nullptr; // pointer to GPU }; - CUDAESProduct modToUnp_; + cms::cuda::ESProduct modToUnp_; }; #endif diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index 62004d385577d..feb07fe0e686e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" @@ -25,7 +25,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" #include "RecoTracker/Record/interface/CkfComponentsRecord.h" @@ -51,11 +51,11 @@ class SiPixelRawToClusterCUDA : public edm::stream::EDProducer rawGetToken_; - edm::EDPutTokenT> digiPutToken_; - edm::EDPutTokenT> digiErrorPutToken_; - edm::EDPutTokenT> clusterPutToken_; + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> digiErrorPutToken_; + edm::EDPutTokenT> clusterPutToken_; - CUDAContextState ctxState_; + cms::cuda::ContextState ctxState_; edm::ESWatcher recordWatcher_; edm::ESGetToken gpuMapToken_; @@ -78,8 +78,8 @@ class SiPixelRawToClusterCUDA : public edm::stream::EDProducer(iConfig.getParameter("InputLabel"))), - digiPutToken_(produces>()), - clusterPutToken_(produces>()), + digiPutToken_(produces>()), + clusterPutToken_(produces>()), gpuMapToken_(esConsumes()), gainsToken_(esConsumes()), cablingMapToken_(esConsumes( @@ -89,7 +89,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi usePilotBlade_(iConfig.getParameter("UsePilotBlade")) // Control the usage of pilot-blade data, FED=40 { if (includeErrors_) { - digiErrorPutToken_ = produces>(); + digiErrorPutToken_ = produces>(); } // regions @@ -128,7 +128,7 @@ void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, 
edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; + cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; auto hgpuMap = iSetup.getHandle(gpuMapToken_); if (hgpuMap->hasQuality() != useQuality_) { @@ -247,7 +247,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, } void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - CUDAScopedContextProduce ctx{ctxState_}; + cms::cuda::ScopedContextProduce ctx{ctxState_}; auto tmp = gpuAlgo_.getResults(); ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h index e43c45f90523f..70e30563c66c3 100644 --- a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h +++ b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h @@ -6,7 +6,7 @@ #include "CalibTracker/SiPixelESProducers/interface/SiPixelCPEGenericDBErrorParametrization.h" #include "CondFormats/SiPixelTransient/interface/SiPixelGenError.h" #include "CondFormats/SiPixelTransient/interface/SiPixelTemplate.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -95,7 +95,7 @@ class PixelCPEFast final : public PixelCPEBase { pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU; pixelCPEforGPU::ParamsOnGPU *d_paramsOnGPU = nullptr; // copy of the above on the Device }; - CUDAESProduct gpuData_; + cms::cuda::ESProduct gpuData_; void fillParamsForGpu(); }; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index 1641719d0537d..4d85c41339020 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -1,8 +1,7 @@ #include #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" @@ -18,7 +17,7 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" @@ -36,11 +35,11 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> { void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; // The mess with inputs will be cleaned up when migrating to the new framework - edm::EDGetTokenT> tBeamSpot; - edm::EDGetTokenT> token_; - edm::EDGetTokenT> tokenDigi_; + 
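For orientation, the producers touched above all follow the same ExternalWork pattern with the renamed context classes: `acquire()` opens a `cms::cuda::ScopedContextAcquire` that records the chosen device and CUDA stream in a `cms::cuda::ContextState` member, and `produce()` later re-joins them through `cms::cuda::ScopedContextProduce`, after the work queued in `acquire()` has completed. A minimal sketch of that pattern follows; the module name `FooProducerCUDA` and the payload type `FooDeviceData` are illustrative only and not part of this patch.

```cpp
#include "CUDADataFormats/Common/interface/Product.h"
#include "FWCore/Framework/interface/Event.h"
#include "FWCore/Framework/interface/stream/EDProducer.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"

class FooProducerCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
  explicit FooProducerCUDA(edm::ParameterSet const& iConfig)
      : putToken_{produces<cms::cuda::Product<FooDeviceData>>()} {}

  void acquire(edm::Event const& iEvent,
               edm::EventSetup const& iSetup,
               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override {
    // Pick a device and CUDA stream for this event; remember both in ctxState_.
    cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
    // ... enqueue asynchronous transfers and kernels on ctx.stream() ...
  }

  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override {
    // Re-join the device and stream chosen in acquire().
    cms::cuda::ScopedContextProduce ctx{ctxState_};
    // Wrap the device-side result so that consumers can reuse the same stream.
    ctx.emplace(iEvent, putToken_, FooDeviceData{/* ... */});
  }

private:
  edm::EDPutTokenT<cms::cuda::Product<FooDeviceData>> putToken_;
  cms::cuda::ContextState ctxState_;
};
```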
edm::EDGetTokenT> tBeamSpot; + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; - edm::EDPutTokenT> tokenHit_; + edm::EDPutTokenT> tokenHit_; std::string cpeName_; @@ -48,10 +47,10 @@ class SiPixelRecHitCUDA : public edm::global::EDProducer<> { }; SiPixelRecHitCUDA::SiPixelRecHitCUDA(const edm::ParameterSet& iConfig) - : tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), - token_(consumes>(iConfig.getParameter("src"))), - tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>()), + : tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), + token_(consumes>(iConfig.getParameter("src"))), + tokenDigi_(consumes>(iConfig.getParameter("src"))), + tokenHit_(produces>()), cpeName_(iConfig.getParameter("CPE")) {} void SiPixelRecHitCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -82,17 +81,17 @@ void SiPixelRecHitCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, cons throw cms::Exception("Configuration") << "too bad, not a fast cpe gpu processing not possible...."; } - edm::Handle> hclusters; + edm::Handle> hclusters; iEvent.getByToken(token_, hclusters); - CUDAScopedContextProduce ctx{*hclusters}; + cms::cuda::ScopedContextProduce ctx{*hclusters}; auto const& clusters = ctx.get(*hclusters); - edm::Handle> hdigis; + edm::Handle> hdigis; iEvent.getByToken(tokenDigi_, hdigis); auto const& digis = ctx.get(*hdigis); - edm::Handle> hbs; + edm::Handle> hbs; iEvent.getByToken(tBeamSpot, hbs); auto const& bs = ctx.get(*hbs); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc index a4f19ac276a7a..5dbf0da75dc42 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -19,7 +19,7 @@ #include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" class SiPixelRecHitFromSOA : public edm::stream::EDProducer { @@ -37,8 +37,8 @@ class SiPixelRecHitFromSOA : public edm::stream::EDProducer { edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenHit_; // CUDA hits - edm::EDGetTokenT clusterToken_; // Legacy Clusters + edm::EDGetTokenT> tokenHit_; // CUDA hits + edm::EDGetTokenT clusterToken_; // Legacy Clusters uint32_t m_nHits; cudautils::host::unique_ptr m_store16; @@ -47,7 +47,8 @@ class SiPixelRecHitFromSOA : public edm::stream::EDProducer { }; SiPixelRecHitFromSOA::SiPixelRecHitFromSOA(const edm::ParameterSet& iConfig) - : tokenHit_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), + : tokenHit_( + consumes>(iConfig.getParameter("pixelRecHitSrc"))), clusterToken_(consumes(iConfig.getParameter("src"))) { produces(); produces(); @@ -63,8 +64,8 @@ void 
SiPixelRecHitFromSOA::fillDescriptions(edm::ConfigurationDescriptions& desc void SiPixelRecHitFromSOA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAProduct const& inputDataWrapped = iEvent.get(tokenHit_); - CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenHit_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); m_nHits = inputData.nHits(); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 7900cf8b2289a..1b4b483ad8ffc 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -22,7 +22,6 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index cd143fb3aab2c..04faf570c3fcc 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" @@ -18,7 +18,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/Utilities/interface/RunningAverage.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { @@ -31,8 +31,8 @@ class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; edm::EDGetTokenT tokenSoATrack_; edm::EDGetTokenT tokenSoAVertex_; }; @@ -41,9 +41,9 @@ PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = - consumes>(iConfig.getParameter("pixelTrackSrc")); + consumes>(iConfig.getParameter("pixelTrackSrc")); tokenGPUVertex_ = - consumes>(iConfig.getParameter("pixelVertexSrc")); + consumes>(iConfig.getParameter("pixelVertexSrc")); } else { tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); @@ -64,7 +64,7 @@ void 
PixelTrackDumpCUDA::analyze(edm::StreamID streamID, const edm::EventSetup& iSetup) const { if (m_onGPU) { auto const& hTracks = iEvent.get(tokenGPUTrack_); - CUDAScopedContextProduce ctx{hTracks}; + cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); auto const* tsoa = tracks.get(); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 3e73cfd7a4e96..4ed23b7dc5394 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" @@ -15,7 +15,7 @@ #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { public: @@ -30,14 +30,14 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; + edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; cudautils::host::unique_ptr m_soa; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), tokenSOA_(produces()) {} void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -50,8 +50,8 @@ void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& des void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - CUDAProduct const& inputDataWrapped = iEvent.get(tokenCUDA_); - CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); m_soa = inputData.toHostAsync(ctx.stream()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index 11b644d466768..31e5070e55e05 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -15,7 +15,7 @@ #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include 
"HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" @@ -34,8 +34,8 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { bool m_OnGPU; - edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDGetTokenT> tokenHitGPU_; + edm::EDPutTokenT> tokenTrackGPU_; edm::EDGetTokenT tokenHitCPU_; edm::EDPutTokenT tokenTrackCPU_; @@ -45,8 +45,9 @@ class CAHitNtupletCUDA : public edm::global::EDProducer<> { CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) : m_OnGPU(iConfig.getParameter("onGPU")), gpuAlgo_(iConfig, consumesCollector()) { if (m_OnGPU) { - tokenHitGPU_ = consumes>(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenHitGPU_ = + consumes>(iConfig.getParameter("pixelRecHitSrc")); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); tokenTrackCPU_ = produces(); @@ -68,10 +69,10 @@ void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const auto bf = 1. / PixelRecoUtilities::fieldInInvGev(es); if (m_OnGPU) { - edm::Handle> hHits; + edm::Handle> hHits; iEvent.getByToken(tokenHitGPU_, hHits); - CUDAScopedContextProduce ctx{*hHits}; + cms::cuda::ScopedContextProduce ctx{*hHits}; auto const& hits = ctx.get(*hHits); ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc index e959abcfef949..e9054dbf17c53 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexProducerCUDA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -15,7 +15,7 @@ #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/RunningAverage.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "gpuVertexFinder.h" @@ -31,7 +31,7 @@ class PixelVertexProducerCUDA : public edm::global::EDProducer<> { bool m_OnGPU; - edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUTrack_; edm::EDPutTokenT tokenGPUVertex_; edm::EDGetTokenT tokenCPUTrack_; edm::EDPutTokenT tokenCPUVertex_; @@ -55,7 +55,8 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) m_ptMin(conf.getParameter("PtMin")) // 0.5 GeV { if (m_OnGPU) { - tokenGPUTrack_ = consumes>(conf.getParameter("pixelTrackSrc")); + tokenGPUTrack_ = + consumes>(conf.getParameter("pixelTrackSrc")); tokenGPUVertex_ = produces(); } else { tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); @@ -88,10 +89,10 @@ void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { if (m_OnGPU) { - edm::Handle> hTracks; + edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); - CUDAScopedContextProduce ctx{*hTracks}; + cms::cuda::ScopedContextProduce ctx{*hTracks}; auto const* 
tracks = ctx.get(*hTracks).get(); assert(tracks); diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc index 7a1bd064e5a2b..aee9be2326572 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" @@ -15,7 +15,7 @@ #include "FWCore/PluginManager/interface/ModuleDef.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" class PixelVertexSoAFromCUDA : public edm::stream::EDProducer { public: @@ -30,14 +30,14 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; + edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; cudautils::host::unique_ptr m_soa; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), tokenSOA_(produces()) {} void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -51,7 +51,7 @@ void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { auto const& inputDataWrapped = iEvent.get(tokenCUDA_); - CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); m_soa = inputData.toHostAsync(ctx.stream()); diff --git a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc index ad840b0ad97b2..2eefc648a4c6e 100644 --- a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc +++ b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "FWCore/Framework/interface/Event.h" @@ -8,7 +8,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" @@ -44,12 +44,12 @@ class BeamSpotToCUDA : public edm::global::EDProducer> private: edm::EDGetTokenT bsGetToken_; - edm::EDPutTokenT> bsPutToken_; + edm::EDPutTokenT> bsPutToken_; }; BeamSpotToCUDA::BeamSpotToCUDA(const edm::ParameterSet& iConfig) : 
bsGetToken_{consumes(iConfig.getParameter("src"))}, - bsPutToken_{produces>()} {} + bsPutToken_{produces>()} {} void BeamSpotToCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -58,7 +58,7 @@ void BeamSpotToCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptio } void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - CUDAScopedContextProduce ctx{streamID}; + cms::cuda::ScopedContextProduce ctx{streamID}; const reco::BeamSpot& bs = iEvent.get(bsGetToken_); diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc index b8a4e97e84d10..cedb9f8fedf29 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationProducerCUDA.cc @@ -4,7 +4,7 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" #include "DataFormats/Common/interface/DetSetVector.h" @@ -29,7 +29,7 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "SimDataFormats/Track/interface/SimTrackContainer.h" #include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h" @@ -71,10 +71,10 @@ class ClusterTPAssociationProducerCUDA : public edm::global::EDProducer<> { edm::EDGetTokenT> phase2OTClustersToken_; edm::EDGetTokenT trackingParticleToken_; - edm::EDGetTokenT> tGpuDigis; - edm::EDGetTokenT> tGpuHits; + edm::EDGetTokenT> tGpuDigis; + edm::EDGetTokenT> tGpuHits; - edm::EDPutTokenT> tokenGPUProd_; + edm::EDPutTokenT> tokenGPUProd_; clusterSLOnGPU::Kernel m_gpuAlgo; }; @@ -94,12 +94,12 @@ ClusterTPAssociationProducerCUDA::ClusterTPAssociationProducerCUDA(const edm::Pa cfg.getParameter("phase2OTClusterSrc"))), trackingParticleToken_( consumes(cfg.getParameter("trackingParticleSrc"))), - tGpuDigis( - consumes>(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), - tGpuHits( - consumes>(cfg.getParameter("heterogeneousPixelRecHitSrc"))), + tGpuDigis(consumes>( + cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), + tGpuHits(consumes>( + cfg.getParameter("heterogeneousPixelRecHitSrc"))), m_gpuAlgo(cfg.getParameter("dumpCSV")) { - tokenGPUProd_ = produces>(); + tokenGPUProd_ = produces>(); } void ClusterTPAssociationProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { @@ -159,12 +159,12 @@ void ClusterTPAssociationProducerCUDA::produce(edm::StreamID streamID, auto mapping = makeMap(iEvent); - edm::Handle> gd; + edm::Handle> gd; iEvent.getByToken(tGpuDigis, gd); - edm::Handle> gh; + edm::Handle> gh; iEvent.getByToken(tGpuHits, gh); - CUDAScopedContextProduce ctx{*gd}; + cms::cuda::ScopedContextProduce ctx{*gd}; auto const &gDigis = ctx.get(*gd); auto const &gHits = ctx.get(*gh); auto ndigis = gDigis.nDigis(); diff --git a/SimTracker/TrackerHitAssociation/src/classes.h b/SimTracker/TrackerHitAssociation/src/classes.h index 
3c143de04fe0a..c8f98cd38ca81 100644 --- a/SimTracker/TrackerHitAssociation/src/classes.h +++ b/SimTracker/TrackerHitAssociation/src/classes.h @@ -5,7 +5,7 @@ #include "DataFormats/Common/interface/AssociationMap.h" #include "DataFormats/TrackerRecHit2D/interface/OmniClusterRef.h" #include "SimTracker/TrackerHitAssociation/interface/ClusterTPAssociation.h" -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h" #include "DataFormats/Common/interface/AssociationMap.h" diff --git a/SimTracker/TrackerHitAssociation/src/classes_def.xml b/SimTracker/TrackerHitAssociation/src/classes_def.xml index fc157eacc310f..e9701e768fe75 100644 --- a/SimTracker/TrackerHitAssociation/src/classes_def.xml +++ b/SimTracker/TrackerHitAssociation/src/classes_def.xml @@ -22,7 +22,7 @@ - - + + diff --git a/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc index 83fbde7a7fa2d..9c7a2e3e4828b 100644 --- a/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc +++ b/SimTracker/TrackerHitAssociation/test/ClusterTPCUDAdump.cc @@ -1,6 +1,6 @@ #include -#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/ESHandle.h" @@ -16,7 +16,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "FWCore/Utilities/interface/RunningAverage.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "SimTracker/TrackerHitAssociation/interface/trackerHitAssociationHeterogeneous.h" @@ -34,12 +34,12 @@ class ClusterTPCUDAdump : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPU_; + edm::EDGetTokenT> tokenGPU_; }; ClusterTPCUDAdump::ClusterTPCUDAdump(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { - tokenGPU_ = consumes>(iConfig.getParameter("clusterTP")); + tokenGPU_ = consumes>(iConfig.getParameter("clusterTP")); } else { } } @@ -47,7 +47,7 @@ ClusterTPCUDAdump::ClusterTPCUDAdump(const edm::ParameterSet& iConfig) : m_onGPU void ClusterTPCUDAdump::analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const { if (m_onGPU) { auto const& hctp = iEvent.get(tokenGPU_); - CUDAScopedContextProduce ctx{hctp}; + cms::cuda::ScopedContextProduce ctx{hctp}; auto const& ctp = ctx.get(hctp); auto const& soa = ctp.view(); From 083ac40e79ea7de6bdfe32f40985a11b3327ba54 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 14 Jan 2020 20:29:48 +0100 Subject: [PATCH 28/29] Rename cudautils namespace to cms::cuda --- CUDADataFormats/Common/interface/Product.h | 4 +- .../Common/interface/ProductBase.h | 12 +- CUDADataFormats/Common/src/ProductBase.cc | 2 +- CUDADataFormats/Common/test/test_Product.cc | 6 +- HeterogeneousCore/CUDACore/README.md | 6 +- .../CUDACore/interface/ContextState.h | 8 +- .../CUDACore/interface/ESProduct.h | 10 +- .../CUDACore/interface/ScopedContext.h | 10 +- 
.../CUDACore/src/ScopedContext.cc | 6 +- .../CUDACore/test/test_ScopedContext.cc | 24 +-- .../CUDAServices/src/CUDAService.cc | 30 +-- HeterogeneousCore/CUDATest/interface/Thing.h | 4 +- .../CUDATest/plugins/TestCUDAAnalyzerGPU.cc | 4 +- .../plugins/TestCUDAAnalyzerGPUKernel.cu | 10 +- .../plugins/TestCUDAAnalyzerGPUKernel.h | 2 +- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 6 +- .../plugins/TestCUDAProducerGPUEWTask.cc | 6 +- .../plugins/TestCUDAProducerGPUFirst.cc | 2 +- .../plugins/TestCUDAProducerGPUKernel.cu | 24 +-- .../plugins/TestCUDAProducerGPUKernel.h | 6 +- .../plugins/TestCUDAProducerGPUtoCPU.cc | 4 +- .../CUDAUtilities/interface/EventCache.h | 70 +++---- .../CUDAUtilities/interface/MessageLogger.h | 182 +++++++++--------- .../CUDAUtilities/interface/ScopedSetDevice.h | 36 ++-- .../CUDAUtilities/interface/SharedEventPtr.h | 14 +- .../CUDAUtilities/interface/SharedStreamPtr.h | 14 +- .../CUDAUtilities/interface/StreamCache.h | 56 +++--- .../CUDAUtilities/interface/allocate_device.h | 14 +- .../CUDAUtilities/interface/allocate_host.h | 14 +- .../CUDAUtilities/interface/copyAsync.h | 68 +++---- .../CUDAUtilities/interface/cudaCheck.h | 92 ++++----- .../CUDAUtilities/interface/currentDevice.h | 16 +- .../CUDAUtilities/interface/deviceCount.h | 16 +- .../interface/device_unique_ptr.h | 156 +++++++-------- .../interface/eventWorkHasCompleted.h | 28 +-- .../interface/host_noncached_unique_ptr.h | 109 +++++------ .../CUDAUtilities/interface/host_unique_ptr.h | 122 ++++++------ .../CUDAUtilities/interface/launch.h | 150 ++++++++------- .../CUDAUtilities/interface/memsetAsync.h | 31 +-- .../CUDAUtilities/src/EventCache.cc | 10 +- .../CUDAUtilities/src/MessageLogger.cc | 4 +- .../CUDAUtilities/src/StreamCache.cc | 10 +- .../CUDAUtilities/src/allocate_device.cc | 14 +- .../CUDAUtilities/src/allocate_host.cc | 14 +- .../src/getCachingDeviceAllocator.h | 122 ++++++------ .../src/getCachingHostAllocator.h | 64 +++--- .../CUDAUtilities/test/copyAsync_t.cpp | 36 ++-- .../test/device_unique_ptr_t.cpp | 10 +- .../test/host_noncached_unique_ptr_t.cpp | 8 +- .../CUDAUtilities/test/host_unique_ptr_t.cpp | 10 +- .../CUDAUtilities/test/memsetAsync_t.cpp | 24 +-- 51 files changed, 865 insertions(+), 835 deletions(-) diff --git a/CUDADataFormats/Common/interface/Product.h b/CUDADataFormats/Common/interface/Product.h index 70eae630b3ce3..41bb8356e67cf 100644 --- a/CUDADataFormats/Common/interface/Product.h +++ b/CUDADataFormats/Common/interface/Product.h @@ -45,11 +45,11 @@ namespace cms { friend class ScopedContextProduce; friend class edm::Wrapper>; - explicit Product(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, T data) + explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} template - explicit Product(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event, Args&&... args) + explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args) : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! 
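As a reading aid for the renamed wrapper, a condensed consumer-side sketch: a module reading a `cms::cuda::Product<T>` constructs its context from the product itself, so the first consumer keeps working on the producer's CUDA stream. The analyzer name, token and payload type `FooDeviceData` are illustrative.

```cpp
void FooAnalyzerCUDA::analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const&) const {
  // The wrapper records the device, CUDA stream and event of the producing module.
  cms::cuda::Product<FooDeviceData> const& wrapped = iEvent.get(token_);

  // Constructing the context from the product sets the correct device and
  // (for the first consumer) reuses the producer's CUDA stream.
  cms::cuda::ScopedContextProduce ctx{wrapped};

  // Unwrap the actual device-side payload for use on ctx.stream().
  FooDeviceData const& payload = ctx.get(wrapped);
  // ... launch kernels on ctx.stream() using payload ...
}
```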
diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h index 69e0770195608..efe2242903bd0 100644 --- a/CUDADataFormats/Common/interface/ProductBase.h +++ b/CUDADataFormats/Common/interface/ProductBase.h @@ -55,15 +55,15 @@ namespace cms { cudaEvent_t event() const { return event_.get(); } protected: - explicit ProductBase(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) + explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: friend class impl::ScopedContextBase; friend class ScopedContextProduce; - // The following function is intended to be used only from cms::cuda::ScopedContext - const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + // The following function is intended to be used only from ScopedContext + const SharedStreamPtr& streamPtr() const { return stream_; } bool mayReuseStream() const { bool expected = true; @@ -75,9 +75,9 @@ namespace cms { // The cudaStream_t is really shared among edm::Event products, so // using shared_ptr also here - cudautils::SharedStreamPtr stream_; //! - // shared_ptr because of caching in cudautils::EventCache - cudautils::SharedEventPtr event_; //! + SharedStreamPtr stream_; //! + // shared_ptr because of caching in cms::cuda::EventCache + SharedEventPtr event_; //! // This flag tells whether the CUDA stream may be reused by a // consumer or not. The goal is to have a "chain" of modules to diff --git a/CUDADataFormats/Common/src/ProductBase.cc b/CUDADataFormats/Common/src/ProductBase.cc index 653d6a21b4add..8e1cf64b17122 100644 --- a/CUDADataFormats/Common/src/ProductBase.cc +++ b/CUDADataFormats/Common/src/ProductBase.cc @@ -7,7 +7,7 @@ namespace cms::cuda { if (not event_) { return false; } - return cudautils::eventWorkHasCompleted(event_.get()); + return eventWorkHasCompleted(event_.get()); } ProductBase::~ProductBase() { diff --git a/CUDADataFormats/Common/test/test_Product.cc b/CUDADataFormats/Common/test/test_Product.cc index 163373f82871e..d70ff3e63e903 100644 --- a/CUDADataFormats/Common/test/test_Product.cc +++ b/CUDADataFormats/Common/test/test_Product.cc @@ -13,11 +13,11 @@ namespace cms::cudatest { class TestScopedContext { public: static cuda::ScopedContextProduce make(int dev, bool createEvent) { - cudautils::SharedEventPtr event; + cms::cuda::SharedEventPtr event; if (createEvent) { - event = cudautils::getEventCache().get(); + event = cms::cuda::getEventCache().get(); } - return cuda::ScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); + return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event)); } }; } // namespace cms::cudatest diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index 1e733b2afec71..75d9cf4c227a7 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -584,8 +584,8 @@ The memory allocations should be done dynamically with the following functions #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -cudautils::device::unique_ptr device_buffer = cudautils::make_device_unique(50, cudaStream); -cudautils::host::unique_ptr host_buffer = cudautils::make_host_unique(50, cudaStream); +cms::cuda::device::unique_ptr device_buffer = cms::cuda::make_device_unique(50, cudaStream); 
+cms::cuda::host::unique_ptr host_buffer = cms::cuda::make_host_unique(50, cudaStream); ``` in the `acquire()` and `produce()` functions. The same @@ -608,7 +608,7 @@ own `unique_ptr` for that. ```cpp #include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" -cudautils::host::noncached_unique_ptr host_buffer = cudautils::make_host_noncached_unique(50, flags); +cms::cuda::host::noncached_unique_ptr host_buffer = cms::cuda::make_host_noncached_unique(50, flags); ``` The `flags` is passed directly to `cudaHostAlloc()`. diff --git a/HeterogeneousCore/CUDACore/interface/ContextState.h b/HeterogeneousCore/CUDACore/interface/ContextState.h index a2ab42363a7b5..4de13c643a79f 100644 --- a/HeterogeneousCore/CUDACore/interface/ContextState.h +++ b/HeterogeneousCore/CUDACore/interface/ContextState.h @@ -27,7 +27,7 @@ namespace cms { friend class ScopedContextProduce; friend class ScopedContextTask; - void set(int device, cudautils::SharedStreamPtr stream) { + void set(int device, SharedStreamPtr stream) { throwIfStream(); device_ = device; stream_ = std::move(stream); @@ -35,12 +35,12 @@ namespace cms { int device() const { return device_; } - const cudautils::SharedStreamPtr& streamPtr() const { + const SharedStreamPtr& streamPtr() const { throwIfNoStream(); return stream_; } - cudautils::SharedStreamPtr releaseStreamPtr() { + SharedStreamPtr releaseStreamPtr() { throwIfNoStream(); // This function needs to effectively reset stream_ (i.e. stream_ // must be empty after this function). This behavior ensures that @@ -52,7 +52,7 @@ namespace cms { void throwIfStream() const; void throwIfNoStream() const; - cudautils::SharedStreamPtr stream_; + SharedStreamPtr stream_; int device_; }; } // namespace cuda diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h index 5f8cf17e137d0..b17dc1ff4a388 100644 --- a/HeterogeneousCore/CUDACore/interface/ESProduct.h +++ b/HeterogeneousCore/CUDACore/interface/ESProduct.h @@ -18,9 +18,9 @@ namespace cms { template class ESProduct { public: - ESProduct() : gpuDataPerDevice_(cudautils::deviceCount()) { + ESProduct() : gpuDataPerDevice_(deviceCount()) { for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { - gpuDataPerDevice_[i].m_event = cudautils::getEventCache().get(); + gpuDataPerDevice_[i].m_event = getEventCache().get(); } } ~ESProduct() = default; @@ -30,7 +30,7 @@ namespace cms { // to the CUDA stream template const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const { - auto device = cudautils::currentDevice(); + auto device = currentDevice(); auto& data = gpuDataPerDevice_[device]; @@ -49,7 +49,7 @@ namespace cms { // Someone else is filling // Check first if the recorded event has occurred - if (cudautils::eventWorkHasCompleted(data.m_event.get())) { + if (eventWorkHasCompleted(data.m_event.get())) { // It was, so data is accessible from all CUDA streams on // the device. 
Set the 'filled' for all subsequent calls and // return the value @@ -88,7 +88,7 @@ namespace cms { private: struct Item { mutable std::mutex m_mutex; - CMS_THREAD_GUARD(m_mutex) mutable cudautils::SharedEventPtr m_event; + CMS_THREAD_GUARD(m_mutex) mutable SharedEventPtr m_event; // non-null if some thread is already filling (cudaStream_t is just a pointer) CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr; mutable std::atomic m_filled = false; // easy check if data has been filled already or not diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContext.h b/HeterogeneousCore/CUDACore/interface/ScopedContext.h index c13c0cc700628..cdc3e2dd2c620 100644 --- a/HeterogeneousCore/CUDACore/interface/ScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/ScopedContext.h @@ -32,7 +32,7 @@ namespace cms { // would be const. Therefore it is ok to return a non-const // pointer from a const method here. cudaStream_t stream() const { return stream_.get(); } - const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + const SharedStreamPtr& streamPtr() const { return stream_; } protected: // The constructors set the current device, but the device @@ -45,11 +45,11 @@ namespace cms { explicit ScopedContextBase(const ProductBase& data); - explicit ScopedContextBase(int device, cudautils::SharedStreamPtr stream); + explicit ScopedContextBase(int device, SharedStreamPtr stream); private: int currentDevice_; - cudautils::SharedStreamPtr stream_; + SharedStreamPtr stream_; }; class ScopedContextGetterBase : public ScopedContextBase { @@ -176,11 +176,11 @@ namespace cms { friend class cudatest::TestScopedContext; // This construcor is only meant for testing - explicit ScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event) + explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event) : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} // create the CUDA Event upfront to catch possible errors from its creation - cudautils::SharedEventPtr event_ = cudautils::getEventCache().get(); + SharedEventPtr event_ = getEventCache().get(); }; /** diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc index adf242a6c43b2..7461ebbee9f0d 100644 --- a/HeterogeneousCore/CUDACore/src/ScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc @@ -40,7 +40,7 @@ namespace cms::cuda { namespace impl { ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { cudaCheck(cudaSetDevice(currentDevice_)); - stream_ = cudautils::getStreamCache().get(); + stream_ = getStreamCache().get(); } ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { @@ -48,11 +48,11 @@ namespace cms::cuda { if (data.mayReuseStream()) { stream_ = data.streamPtr(); } else { - stream_ = cudautils::getStreamCache().get(); + stream_ = getStreamCache().get(); } } - ScopedContextBase::ScopedContextBase(int device, cudautils::SharedStreamPtr stream) + ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream) : currentDevice_(device), stream_(std::move(stream)) { cudaCheck(cudaSetDevice(currentDevice_)); } diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc index c97d08e29a52c..7a6543a667c3d 100644 --- a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc +++ 
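To illustrate how the renamed `cms::cuda::ESProduct<T>` members of the conditions wrappers above are used: `dataForCurrentDeviceAsync()` hands out the per-device copy of the payload and runs the supplied lambda only for the first caller on each device, on that caller's CUDA stream. A hedged sketch, with the wrapper class, `DeviceData` struct and copy details as placeholders (the real wrappers also free the device memory in a destructor):

```cpp
#include <vector>

#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

class FooConditionsGPU {
public:
  struct DeviceData {
    float* values = nullptr;  // device-side copy of the payload
  };

  // Returns the payload valid for the current device, transferring it
  // on 'cudaStream' the first time it is requested on that device.
  DeviceData const& getProduct(cudaStream_t cudaStream) const {
    return product_.dataForCurrentDeviceAsync(cudaStream, [this](DeviceData& data, cudaStream_t stream) {
      // Executed only for the first caller on each device: allocate the
      // device buffer and queue the host-to-device copy on 'stream'.
      cudaCheck(cudaMalloc(&data.values, valuesHost_.size() * sizeof(float)));
      cudaCheck(cudaMemcpyAsync(
          data.values, valuesHost_.data(), valuesHost_.size() * sizeof(float), cudaMemcpyHostToDevice, stream));
    });
  }

private:
  std::vector<float> valuesHost_;           // filled from the CPU-side condition object
  cms::cuda::ESProduct<DeviceData> product_;
};
```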
b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc @@ -20,11 +20,11 @@ namespace cms::cudatest { class TestScopedContext { public: static cuda::ScopedContextProduce make(int dev, bool createEvent) { - cudautils::SharedEventPtr event; + cms::cuda::SharedEventPtr event; if (createEvent) { - event = cudautils::getEventCache().get(); + event = cms::cuda::getEventCache().get(); } - return cuda::ScopedContextProduce(dev, cudautils::getStreamCache().get(), std::move(event)); + return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event)); } }; } // namespace cms::cudatest @@ -47,7 +47,7 @@ TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") { { auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true); - SECTION("Construct from device ID") { REQUIRE(cudautils::currentDevice() == defaultDevice); } + SECTION("Construct from device ID") { REQUIRE(cms::cuda::currentDevice() == defaultDevice); } SECTION("Wrap T to cms::cuda::Product") { std::unique_ptr> dataPtr = ctx.wrap(10); @@ -61,12 +61,12 @@ TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") { const auto& data = *dataPtr; cms::cuda::ScopedContextProduce ctx2{data}; - REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(cms::cuda::currentDevice() == data.device()); REQUIRE(ctx2.stream() == data.stream()); // Second use of a product should lead to new stream cms::cuda::ScopedContextProduce ctx3{data}; - REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(cms::cuda::currentDevice() == data.device()); REQUIRE(ctx3.stream() != data.stream()); } @@ -82,22 +82,22 @@ TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") { { // produce cms::cuda::ScopedContextProduce ctx2{ctxstate}; - REQUIRE(cudautils::currentDevice() == ctx.device()); + REQUIRE(cms::cuda::currentDevice() == ctx.device()); REQUIRE(ctx2.stream() == ctx.stream()); } } SECTION("Joining multiple CUDA streams") { - cudautils::ScopedSetDevice setDeviceForThisScope(defaultDevice); + cms::cuda::ScopedSetDevice setDeviceForThisScope(defaultDevice); // Mimick a producer on the first CUDA stream int h_a1 = 1; - auto d_a1 = cudautils::make_device_unique(nullptr); + auto d_a1 = cms::cuda::make_device_unique(nullptr); auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1); // Mimick a producer on the second CUDA stream int h_a2 = 2; - auto d_a2 = cudautils::make_device_unique(nullptr); + auto d_a2 = cms::cuda::make_device_unique(nullptr); auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2); REQUIRE(wprod1->stream() != wprod2->stream()); @@ -108,11 +108,11 @@ TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") { auto prod1 = ctx2.get(*wprod1); auto prod2 = ctx2.get(*wprod2); - auto d_a3 = cudautils::make_device_unique(nullptr); + auto d_a3 = cms::cuda::make_device_unique(nullptr); cms::cudatest::testScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); cudaCheck(cudaStreamSynchronize(ctx2.stream())); REQUIRE(wprod2->isAvailable()); - REQUIRE(cudautils::eventWorkHasCompleted(wprod2->event())); + REQUIRE(cms::cuda::eventWorkHasCompleted(wprod2->event())); h_a1 = 0; h_a2 = 0; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 74d6f6b79ec8b..5d1bd30264186 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -90,7 +90,7 @@ namespace { if (bufferSizes.empty()) return; - auto streamPtr = cudautils::getStreamCache().get(); + auto streamPtr = 
cms::cuda::getStreamCache().get(); std::vector > buffers; buffers.reserve(bufferSizes.size()); @@ -104,16 +104,16 @@ namespace { cudaCheck(cudaGetDevice(&device)); for (int i = 0; i < numberOfDevices; ++i) { cudaCheck(cudaSetDevice(i)); - preallocate( - [&](size_t size, cudaStream_t stream) { return cudautils::make_device_unique(size, stream); }, + preallocate( + [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique(size, stream); }, bufferSizes); } cudaCheck(cudaSetDevice(device)); } void hostPreallocate(const std::vector& bufferSizes) { - preallocate( - [&](size_t size, cudaStream_t stream) { return cudautils::make_host_unique(size, stream); }, + preallocate( + [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique(size, stream); }, bufferSizes); } } // namespace @@ -299,12 +299,12 @@ CUDAService::CUDAService(edm::ParameterSet const& config) { log << "\n"; // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction - if constexpr (cudautils::allocator::useCaching) { - cudautils::allocator::getCachingDeviceAllocator(); - cudautils::allocator::getCachingHostAllocator(); + if constexpr (cms::cuda::allocator::useCaching) { + cms::cuda::allocator::getCachingDeviceAllocator(); + cms::cuda::allocator::getCachingHostAllocator(); } - cudautils::getEventCache().clear(); - cudautils::getStreamCache().clear(); + cms::cuda::getEventCache().clear(); + cms::cuda::getStreamCache().clear(); log << "CUDAService fully initialized"; enabled_ = true; @@ -318,12 +318,12 @@ CUDAService::CUDAService(edm::ParameterSet const& config) { CUDAService::~CUDAService() { if (enabled_) { // Explicitly destruct the allocator before the device resets below - if constexpr (cudautils::allocator::useCaching) { - cudautils::allocator::getCachingDeviceAllocator().FreeAllCached(); - cudautils::allocator::getCachingHostAllocator().FreeAllCached(); + if constexpr (cms::cuda::allocator::useCaching) { + cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached(); + cms::cuda::allocator::getCachingHostAllocator().FreeAllCached(); } - cudautils::getEventCache().clear(); - cudautils::getStreamCache().clear(); + cms::cuda::getEventCache().clear(); + cms::cuda::getStreamCache().clear(); for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); diff --git a/HeterogeneousCore/CUDATest/interface/Thing.h b/HeterogeneousCore/CUDATest/interface/Thing.h index e492625002a64..27dc58e1443f4 100644 --- a/HeterogeneousCore/CUDATest/interface/Thing.h +++ b/HeterogeneousCore/CUDATest/interface/Thing.h @@ -8,12 +8,12 @@ namespace cms { class Thing { public: Thing() = default; - explicit Thing(cudautils::device::unique_ptr ptr) : ptr_(std::move(ptr)) {} + explicit Thing(cms::cuda::device::unique_ptr ptr) : ptr_(std::move(ptr)) {} const float *get() const { return ptr_.get(); } private: - cudautils::device::unique_ptr ptr_; + cms::cuda::device::unique_ptr ptr_; }; } // namespace cudatest } // namespace cms diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc index d3cee9471f78c..2778ed02f3ac6 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc @@ -40,7 +40,7 @@ TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig) maxValue_(iConfig.getParameter("maxValue")) { edm::Service cs; if (cs->enabled()) { - auto streamPtr = cudautils::getStreamCache().get(); + 
auto streamPtr = cms::cuda::getStreamCache().get(); gpuAlgo_ = std::make_unique(streamPtr.get()); } } @@ -70,7 +70,7 @@ void TestCUDAAnalyzerGPU::analyze(edm::StreamID, edm::Event const& iEvent, edm:: void TestCUDAAnalyzerGPU::endJob() { edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob begin"; - auto streamPtr = cudautils::getStreamCache().get(); + auto streamPtr = cms::cuda::getStreamCache().get(); auto value = gpuAlgo_->value(streamPtr.get()); edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " accumulated value " << value; assert(minValue_ <= value && value <= maxValue_); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu index 01ded40c6d7ff..2b3951a2b5cfe 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu @@ -24,8 +24,8 @@ namespace { } // namespace TestCUDAAnalyzerGPUKernel::TestCUDAAnalyzerGPUKernel(cudaStream_t stream) { - sum_ = cudautils::make_device_unique(NUM_VALUES, stream); - cudautils::memsetAsync(sum_, 0, NUM_VALUES, stream); + sum_ = cms::cuda::make_device_unique(NUM_VALUES, stream); + cms::cuda::memsetAsync(sum_, 0, NUM_VALUES, stream); // better to synchronize since there is no guarantee that the stream // of analyzeAsync() would be otherwise synchronized with this one cudaCheck(cudaStreamSynchronize(stream)); @@ -36,10 +36,10 @@ void TestCUDAAnalyzerGPUKernel::analyzeAsync(const float *d_input, cudaStream_t } float TestCUDAAnalyzerGPUKernel::value(cudaStream_t stream) const { - auto accumulator = cudautils::make_device_unique(stream); - auto h_accumulator = cudautils::make_host_unique(stream); + auto accumulator = cms::cuda::make_device_unique(stream); + auto h_accumulator = cms::cuda::make_host_unique(stream); sum<<<1, 1, 0, stream>>>(sum_.get(), accumulator.get(), NUM_VALUES); - cudautils::copyAsync(h_accumulator, accumulator, stream); + cms::cuda::copyAsync(h_accumulator, accumulator, stream); // need to synchronize cudaCheck(cudaStreamSynchronize(stream)); return *h_accumulator; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h index 612e617c67c8c..a9a6b962c2cc4 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h @@ -17,7 +17,7 @@ class TestCUDAAnalyzerGPUKernel { float value(cudaStream_t stream) const; private: - cudautils::device::unique_ptr sum_; // all writes are atomic in CUDA + cms::cuda::device::unique_ptr sum_; // all writes are atomic in CUDA }; #endif diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 710df90c5ff5d..9b6fe85636026 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -34,8 +34,8 @@ class TestCUDAProducerGPUEW : public edm::stream::EDProducer edm::EDPutTokenT> const dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; cms::cuda::ContextState ctxState_; - cudautils::device::unique_ptr devicePtr_; - cudautils::host::noncached::unique_ptr hostData_; + cms::cuda::device::unique_ptr devicePtr_; + cms::cuda::host::noncached::unique_ptr hostData_; }; TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(edm::ParameterSet const& iConfig) @@ -44,7 +44,7 @@ 
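The `hostData_` member renamed in this hunk is the usual trick for small, frequently copied results: allocate a pinned host buffer once, outside the caching allocator, and reuse it in every event. A sketch of the idea; the element type, the `cudaHostAllocDefault` flag and the scalar overload of `make_host_noncached_unique` are assumptions made for illustration.

```cpp
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"

class FooGPUEW {
public:
  // Allocate the pinned buffer once; it is reused for every event.
  FooGPUEW() : hostData_{cms::cuda::make_host_noncached_unique<float>(cudaHostAllocDefault)} {}

  // Called from acquire(): queue the small device-to-host copy on the module's stream.
  void copyResultAsync(float const* devicePtr, cudaStream_t stream) {
    cudaCheck(cudaMemcpyAsync(hostData_.get(), devicePtr, sizeof(float), cudaMemcpyDeviceToHost, stream));
  }

  // Valid only after the copy above has completed (e.g. in produce()).
  float result() const { return *hostData_; }

private:
  cms::cuda::host::noncached::unique_ptr<float> hostData_;
};
```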
TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(edm::ParameterSet const& iConfig) dstToken_{produces>()} { edm::Service cs; if (cs->enabled()) { - hostData_ = cudautils::make_host_noncached_unique(); + hostData_ = cms::cuda::make_host_noncached_unique(); } } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc index 12a8a82ca2865..d1e4f94a30d96 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc @@ -41,8 +41,8 @@ class TestCUDAProducerGPUEWTask : public edm::stream::EDProducer> const dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; cms::cuda::ContextState ctxState_; - cudautils::device::unique_ptr devicePtr_; - cudautils::host::noncached::unique_ptr hostData_; + cms::cuda::device::unique_ptr devicePtr_; + cms::cuda::host::noncached::unique_ptr hostData_; }; TestCUDAProducerGPUEWTask::TestCUDAProducerGPUEWTask(edm::ParameterSet const& iConfig) @@ -51,7 +51,7 @@ TestCUDAProducerGPUEWTask::TestCUDAProducerGPUEWTask(edm::ParameterSet const& iC dstToken_{produces>()} { edm::Service cs; if (cs->enabled()) { - hostData_ = cudautils::make_host_noncached_unique(); + hostData_ = cms::cuda::make_host_noncached_unique(); } } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index 6549dabf1d9ff..b9752f6f41630 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -46,7 +46,7 @@ void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{streamID}; - cudautils::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); + cms::cuda::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); ctx.emplace(iEvent, dstToken_, std::move(output)); edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index 585139606df98..69264a40aca62 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -62,7 +62,7 @@ namespace { } } // namespace -cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const std::string &label, +cms::cuda::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const std::string &label, const float *d_input, cudaStream_t stream) const { // First make the sanity check @@ -78,16 +78,16 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const } } - auto h_a = cudautils::make_host_unique(NUM_VALUES, stream); - auto h_b = cudautils::make_host_unique(NUM_VALUES, stream); + auto h_a = cms::cuda::make_host_unique(NUM_VALUES, stream); + auto h_b = cms::cuda::make_host_unique(NUM_VALUES, stream); for (auto i = 0; i < NUM_VALUES; i++) { h_a[i] = i; h_b[i] = i * i; } - auto d_a = cudautils::make_device_unique(NUM_VALUES, stream); - auto d_b = cudautils::make_device_unique(NUM_VALUES, stream); + auto d_a = cms::cuda::make_device_unique(NUM_VALUES, stream); + auto d_b = cms::cuda::make_device_unique(NUM_VALUES, stream); cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream)); cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES 
* sizeof(float), cudaMemcpyHostToDevice, stream)); @@ -95,15 +95,15 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const int threadsPerBlock{32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; - auto d_c = cudautils::make_device_unique(NUM_VALUES, stream); - auto current_device = cudautils::currentDevice(); - cudautils::LogVerbatim("TestHeterogeneousEDProducerGPU") + auto d_c = cms::cuda::make_device_unique(NUM_VALUES, stream); + auto current_device = cms::cuda::currentDevice(); + cms::cuda::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU launching kernels device " << current_device << " CUDA stream " << stream; vectorAdd<<>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); - auto d_ma = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, stream); - auto d_mb = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, stream); - auto d_mc = cudautils::make_device_unique(NUM_VALUES * NUM_VALUES, stream); + auto d_ma = cms::cuda::make_device_unique(NUM_VALUES * NUM_VALUES, stream); + auto d_mb = cms::cuda::make_device_unique(NUM_VALUES * NUM_VALUES, stream); + auto d_mc = cms::cuda::make_device_unique(NUM_VALUES * NUM_VALUES, stream); dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES}; dim3 blocksPerGrid3{1, 1}; if (NUM_VALUES * NUM_VALUES > 32) { @@ -118,7 +118,7 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const matrixMulVector<<>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES); - cudautils::LogVerbatim("TestHeterogeneousEDProducerGPU") + cms::cuda::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU kernels launched, returning return pointer device " << current_device << " CUDA stream " << stream; return d_a; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h index a332755b390a3..5eeba0009656e 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h @@ -10,7 +10,7 @@ /** * This class models the actual CUDA implementation of an algorithm. * - * Memory is allocated dynamically with the allocator in cudautils. + * Memory is allocated dynamically with the allocator in cms::cuda. 
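A minimal usage sketch of the renamed allocator helpers exercised in the hunk above (illustrative only, not part of the patch; it assumes a valid cudaStream_t and the headers introduced in this package):

    #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

    void fillAndUpload(size_t n, cudaStream_t stream) {
      // pinned host buffer from the caching host allocator, tied to the stream
      auto h_buf = cms::cuda::make_host_unique<float[]>(n, stream);
      for (size_t i = 0; i < n; ++i) {
        h_buf[i] = static_cast<float>(i);
      }
      // device buffer from the caching device allocator, same stream
      auto d_buf = cms::cuda::make_device_unique<float[]>(n, stream);
      // asynchronous host-to-device copy; every CUDA call goes through cudaCheck()
      cudaCheck(cudaMemcpyAsync(d_buf.get(), h_buf.get(), n * sizeof(float),
                                cudaMemcpyHostToDevice, stream));
    }

The cms::cuda::copyAsync() overloads further down in this series wrap the same cudaMemcpyAsync() call for matching unique_ptr pairs.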
* * The algorithm is intended to waste time with large matrix * operations so that the asynchronous nature of the CUDA integration @@ -24,10 +24,10 @@ class TestCUDAProducerGPUKernel { ~TestCUDAProducerGPUKernel() = default; // returns (owning) pointer to device memory - cudautils::device::unique_ptr runAlgo(const std::string& label, cudaStream_t stream) const { + cms::cuda::device::unique_ptr runAlgo(const std::string& label, cudaStream_t stream) const { return runAlgo(label, nullptr, stream); } - cudautils::device::unique_ptr runAlgo(const std::string& label, + cms::cuda::device::unique_ptr runAlgo(const std::string& label, const float* d_input, cudaStream_t stream) const; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index d1b3288b95199..dc07fc0add7f7 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -30,7 +30,7 @@ class TestCUDAProducerGPUtoCPU : public edm::stream::EDProducer> const srcToken_; edm::EDPutTokenT const dstToken_; - cudautils::host::unique_ptr buffer_; + cms::cuda::host::unique_ptr buffer_; }; TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(edm::ParameterSet const& iConfig) @@ -57,7 +57,7 @@ void TestCUDAProducerGPUtoCPU::acquire(edm::Event const& iEvent, cms::cuda::ScopedContextAcquire ctx{in, std::move(waitingTaskHolder)}; cms::cudatest::Thing const& device = ctx.get(in); - buffer_ = cudautils::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); + buffer_ = cms::cuda::make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue in produce once finished cudaCheck(cudaMemcpyAsync(buffer_.get(), device.get(), diff --git a/HeterogeneousCore/CUDAUtilities/interface/EventCache.h b/HeterogeneousCore/CUDAUtilities/interface/EventCache.h index 2828a7ab50417..8ee56884a40dc 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/EventCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/EventCache.h @@ -10,46 +10,48 @@ class CUDAService; -namespace cudautils { - class EventCache { - public: - using BareEvent = SharedEventPtr::element_type; - - EventCache(); - - // Gets a (cached) CUDA event for the current device. The event - // will be returned to the cache by the shared_ptr destructor. The - // returned event is guaranteed to be in the state where all - // captured work has completed, i.e. cudaEventQuery() == cudaSuccess. - // - // This function is thread safe - SharedEventPtr get(); +namespace cms { + namespace cuda { + class EventCache { + public: + using BareEvent = SharedEventPtr::element_type; - private: - friend class ::CUDAService; + EventCache(); - // thread safe - SharedEventPtr makeOrGet(int dev); + // Gets a (cached) CUDA event for the current device. The event + // will be returned to the cache by the shared_ptr destructor. The + // returned event is guaranteed to be in the state where all + // captured work has completed, i.e. cudaEventQuery() == cudaSuccess. 
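A hedged sketch of how this cache is typically used (illustrative only; it assumes a valid cudaStream_t, the getEventCache() accessor declared at the end of this header, and the eventWorkHasCompleted() helper that appears later in this series):

    #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

    bool recordAndPoll(cudaStream_t stream) {
      // the event is returned to the cache when the shared_ptr is destroyed
      cms::cuda::SharedEventPtr event = cms::cuda::getEventCache().get();
      // capture all work queued on the stream so far
      cudaCheck(cudaEventRecord(event.get(), stream));
      // non-blocking query of whether the captured work has finished
      return cms::cuda::eventWorkHasCompleted(event.get());
    }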
+ // + // This function is thread safe + SharedEventPtr get(); - // not thread safe, intended to be called only from CUDAService destructor - void clear(); + private: + friend class ::CUDAService; - class Deleter { - public: - Deleter() = default; - Deleter(int d) : device_{d} {} - void operator()(cudaEvent_t event) const; + // thread safe + SharedEventPtr makeOrGet(int dev); - private: - int device_ = -1; - }; + // not thread safe, intended to be called only from CUDAService destructor + void clear(); + + class Deleter { + public: + Deleter() = default; + Deleter(int d) : device_{d} {} + void operator()(cudaEvent_t event) const; - std::vector> cache_; - }; + private: + int device_ = -1; + }; - // Gets the global instance of a EventCache - // This function is thread safe - EventCache& getEventCache(); -} // namespace cudautils + std::vector> cache_; + }; + + // Gets the global instance of a EventCache + // This function is thread safe + EventCache& getEventCache(); + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h b/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h index ba098f687f846..95900ec845470 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h +++ b/HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h @@ -4,101 +4,103 @@ #include #include -namespace cudautils { +namespace cms { + namespace cuda { - /** + /** * This class is a temporary measure to hide C++17 constructs in * MessaLogger from .cu files (those are mainly files that launch * kernels). It will be removed once we will be able to compile .cu * files with C++17 capable compiler. */ - class MessageLogger { - public: - MessageLogger(std::string const& category) : category_(category) {} - - MessageLogger(std::string&& category) : category_(std::move(category)) {} - - ~MessageLogger() = default; - - MessageLogger(MessageLogger const&) = delete; - MessageLogger(MessageLogger&&) = delete; - MessageLogger& operator=(MessageLogger const&) = delete; - MessageLogger& operator=(MessageLogger&&) = delete; - - template - MessageLogger& operator<<(T const& element) { - message_ << element; - return *this; - } - - protected: - std::string category_; - std::stringstream message_; - }; - - class LogSystem : public MessageLogger { - public: - LogSystem(std::string const& category) : MessageLogger(category) {} - LogSystem(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogSystem(); - }; - - class LogAbsolute : public MessageLogger { - public: - LogAbsolute(std::string const& category) : MessageLogger(category) {} - LogAbsolute(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogAbsolute(); - }; - - class LogError : public MessageLogger { - public: - LogError(std::string const& category) : MessageLogger(category) {} - LogError(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogError(); - }; - - class LogProblem : public MessageLogger { - public: - LogProblem(std::string const& category) : MessageLogger(category) {} - LogProblem(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogProblem(); - }; - - class LogImportant : public MessageLogger { - public: - LogImportant(std::string const& category) : MessageLogger(category) {} - LogImportant(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogImportant(); - }; - - class LogWarning : public MessageLogger { - public: - LogWarning(std::string const& category) : MessageLogger(category) {} - 
LogWarning(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogWarning(); - }; - - class LogPrint : public MessageLogger { - public: - LogPrint(std::string const& category) : MessageLogger(category) {} - LogPrint(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogPrint(); - }; - - class LogInfo : public MessageLogger { - public: - LogInfo(std::string const& category) : MessageLogger(category) {} - LogInfo(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogInfo(); - }; - - class LogVerbatim : public MessageLogger { - public: - LogVerbatim(std::string const& category) : MessageLogger(category) {} - LogVerbatim(std::string&& category) : MessageLogger(std::move(category)) {} - ~LogVerbatim(); - }; - -} // namespace cudautils + class MessageLogger { + public: + MessageLogger(std::string const& category) : category_(category) {} + + MessageLogger(std::string&& category) : category_(std::move(category)) {} + + ~MessageLogger() = default; + + MessageLogger(MessageLogger const&) = delete; + MessageLogger(MessageLogger&&) = delete; + MessageLogger& operator=(MessageLogger const&) = delete; + MessageLogger& operator=(MessageLogger&&) = delete; + + template + MessageLogger& operator<<(T const& element) { + message_ << element; + return *this; + } + + protected: + std::string category_; + std::stringstream message_; + }; + + class LogSystem : public MessageLogger { + public: + LogSystem(std::string const& category) : MessageLogger(category) {} + LogSystem(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogSystem(); + }; + + class LogAbsolute : public MessageLogger { + public: + LogAbsolute(std::string const& category) : MessageLogger(category) {} + LogAbsolute(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogAbsolute(); + }; + + class LogError : public MessageLogger { + public: + LogError(std::string const& category) : MessageLogger(category) {} + LogError(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogError(); + }; + + class LogProblem : public MessageLogger { + public: + LogProblem(std::string const& category) : MessageLogger(category) {} + LogProblem(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogProblem(); + }; + + class LogImportant : public MessageLogger { + public: + LogImportant(std::string const& category) : MessageLogger(category) {} + LogImportant(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogImportant(); + }; + + class LogWarning : public MessageLogger { + public: + LogWarning(std::string const& category) : MessageLogger(category) {} + LogWarning(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogWarning(); + }; + + class LogPrint : public MessageLogger { + public: + LogPrint(std::string const& category) : MessageLogger(category) {} + LogPrint(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogPrint(); + }; + + class LogInfo : public MessageLogger { + public: + LogInfo(std::string const& category) : MessageLogger(category) {} + LogInfo(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogInfo(); + }; + + class LogVerbatim : public MessageLogger { + public: + LogVerbatim(std::string const& category) : MessageLogger(category) {} + LogVerbatim(std::string&& category) : MessageLogger(std::move(category)) {} + ~LogVerbatim(); + }; + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_MessageLogger_h diff --git 
a/HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h b/HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h index 385c8910acad8..9b296dd390ea3 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h +++ b/HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h @@ -5,24 +5,26 @@ #include -namespace cudautils { - class ScopedSetDevice { - public: - explicit ScopedSetDevice(int newDevice) { - cudaCheck(cudaGetDevice(&prevDevice_)); - cudaCheck(cudaSetDevice(newDevice)); - } +namespace cms { + namespace cuda { + class ScopedSetDevice { + public: + explicit ScopedSetDevice(int newDevice) { + cudaCheck(cudaGetDevice(&prevDevice_)); + cudaCheck(cudaSetDevice(newDevice)); + } - ~ScopedSetDevice() { - // Intentionally don't check the return value to avoid - // exceptions to be thrown. If this call fails, the process is - // doomed anyway. - cudaSetDevice(prevDevice_); - } + ~ScopedSetDevice() { + // Intentionally don't check the return value to avoid + // exceptions to be thrown. If this call fails, the process is + // doomed anyway. + cudaSetDevice(prevDevice_); + } - private: - int prevDevice_; - }; -} // namespace cudautils + private: + int prevDevice_; + }; + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h b/HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h index e9955782fe8c8..7aa10327a4e63 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h @@ -6,11 +6,13 @@ #include -namespace cudautils { - // cudaEvent_t itself is a typedef for a pointer, for the use with - // edm::ReusableObjectHolder the pointed-to type is more interesting - // to avoid extra layer of indirection - using SharedEventPtr = std::shared_ptr>; -} // namespace cudautils +namespace cms { + namespace cuda { + // cudaEvent_t itself is a typedef for a pointer, for the use with + // edm::ReusableObjectHolder the pointed-to type is more interesting + // to avoid extra layer of indirection + using SharedEventPtr = std::shared_ptr>; + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h b/HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h index 2b5be232f03d9..14f54e35f6aa6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h @@ -6,11 +6,13 @@ #include -namespace cudautils { - // cudaStream_t itself is a typedef for a pointer, for the use with - // edm::ReusableObjectHolder the pointed-to type is more interesting - // to avoid extra layer of indirection - using SharedStreamPtr = std::shared_ptr>; -} // namespace cudautils +namespace cms { + namespace cuda { + // cudaStream_t itself is a typedef for a pointer, for the use with + // edm::ReusableObjectHolder the pointed-to type is more interesting + // to avoid extra layer of indirection + using SharedStreamPtr = std::shared_ptr>; + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/StreamCache.h b/HeterogeneousCore/CUDAUtilities/interface/StreamCache.h index 92e4be75275c6..a9c93546fac09 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/StreamCache.h +++ b/HeterogeneousCore/CUDAUtilities/interface/StreamCache.h @@ -10,39 +10,41 @@ class CUDAService; -namespace cudautils { - class StreamCache { - public: - using BareStream = SharedStreamPtr::element_type; +namespace cms { + 
namespace cuda { + class StreamCache { + public: + using BareStream = SharedStreamPtr::element_type; - StreamCache(); + StreamCache(); - // Gets a (cached) CUDA stream for the current device. The stream - // will be returned to the cache by the shared_ptr destructor. - // This function is thread safe - SharedStreamPtr get(); + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + // This function is thread safe + SharedStreamPtr get(); - private: - friend class ::CUDAService; - // not thread safe, intended to be called only from CUDAService destructor - void clear(); + private: + friend class ::CUDAService; + // not thread safe, intended to be called only from CUDAService destructor + void clear(); - class Deleter { - public: - Deleter() = default; - Deleter(int d) : device_{d} {} - void operator()(cudaStream_t stream) const; + class Deleter { + public: + Deleter() = default; + Deleter(int d) : device_{d} {} + void operator()(cudaStream_t stream) const; - private: - int device_ = -1; - }; + private: + int device_ = -1; + }; - std::vector> cache_; - }; + std::vector> cache_; + }; - // Gets the global instance of a StreamCache - // This function is thread safe - StreamCache& getStreamCache(); -} // namespace cudautils + // Gets the global instance of a StreamCache + // This function is thread safe + StreamCache& getStreamCache(); + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h b/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h index 1c689f03ef831..9c271fc2fbff1 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h +++ b/HeterogeneousCore/CUDAUtilities/interface/allocate_device.h @@ -3,12 +3,14 @@ #include -namespace cudautils { - // Allocate device memory - void *allocate_device(int dev, size_t nbytes, cudaStream_t stream); +namespace cms { + namespace cuda { + // Allocate device memory + void *allocate_device(int dev, size_t nbytes, cudaStream_t stream); - // Free device memory (to be called from unique_ptr) - void free_device(int device, void *ptr); -} // namespace cudautils + // Free device memory (to be called from unique_ptr) + void free_device(int device, void *ptr); + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h b/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h index 8913e8ce14c7f..1bba4580028d3 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h +++ b/HeterogeneousCore/CUDAUtilities/interface/allocate_host.h @@ -3,12 +3,14 @@ #include -namespace cudautils { - // Allocate pinned host memory (to be called from unique_ptr) - void *allocate_host(size_t nbytes, cudaStream_t stream); +namespace cms { + namespace cuda { + // Allocate pinned host memory (to be called from unique_ptr) + void *allocate_host(size_t nbytes, cudaStream_t stream); - // Free pinned host memory (to be called from unique_ptr) - void free_host(void *ptr); -} // namespace cudautils + // Free pinned host memory (to be called from unique_ptr) + void free_host(void *ptr); + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index 97dd81b9ac14a..102a1d7bf2d86 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -7,42 +7,42 @@ #include -namespace 
cudautils { - // Single element - template - inline void copyAsync(cudautils::device::unique_ptr& dst, - const cudautils::host::unique_ptr& src, - cudaStream_t stream) { - // Shouldn't compile for array types because of sizeof(T), but - // let's add an assert with a more helpful message - static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); - } +namespace cms { + namespace cuda { + // Single element + template + inline void copyAsync(device::unique_ptr& dst, const host::unique_ptr& src, cudaStream_t stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream)); + } - template - inline void copyAsync(cudautils::host::unique_ptr& dst, - const cudautils::device::unique_ptr& src, - cudaStream_t stream) { - static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream)); - } + template + inline void copyAsync(host::unique_ptr& dst, const device::unique_ptr& src, cudaStream_t stream) { + static_assert(std::is_array::value == false, + "For array types, use the other overload with the size parameter"); + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyDeviceToHost, stream)); + } - // Multiple elements - template - inline void copyAsync(cudautils::device::unique_ptr& dst, - const cudautils::host::unique_ptr& src, - size_t nelements, - cudaStream_t stream) { - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); - } + // Multiple elements + template + inline void copyAsync(device::unique_ptr& dst, + const host::unique_ptr& src, + size_t nelements, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream)); + } - template - inline void copyAsync(cudautils::host::unique_ptr& dst, - const cudautils::device::unique_ptr& src, - size_t nelements, - cudaStream_t stream) { - cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream)); - } -} // namespace cudautils + template + inline void copyAsync(host::unique_ptr& dst, + const device::unique_ptr& src, + size_t nelements, + cudaStream_t stream) { + cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyDeviceToHost, stream)); + } + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h index 37aa62c17fffb..71959994e0044 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h @@ -13,50 +13,52 @@ // CMSSW headers #include "FWCore/Utilities/interface/Likely.h" -namespace cudautils { - - [[noreturn]] inline void abortOnCudaError(const char* file, - int line, - const char* cmd, - const char* error, - const char* message, - const char* description = nullptr) { - std::ostringstream out; - out << "\n"; - out << file << ", line " << line << ":\n"; - out << "cudaCheck(" << cmd << ");\n"; - out << error << ": 
" << message << "\n"; - if (description) - out << description << "\n"; - throw std::runtime_error(out.str()); - } - - inline bool cudaCheck_( - const char* file, int line, const char* cmd, CUresult result, const char* description = nullptr) { - if (LIKELY(result == CUDA_SUCCESS)) - return true; - - const char* error; - const char* message; - cuGetErrorName(result, &error); - cuGetErrorString(result, &message); - abortOnCudaError(file, line, cmd, error, message, description); - return false; - } - - inline bool cudaCheck_( - const char* file, int line, const char* cmd, cudaError_t result, const char* description = nullptr) { - if (LIKELY(result == cudaSuccess)) - return true; - - const char* error = cudaGetErrorName(result); - const char* message = cudaGetErrorString(result); - abortOnCudaError(file, line, cmd, error, message, description); - return false; - } - -} // namespace cudautils - -#define cudaCheck(ARG, ...) (cudautils::cudaCheck_(__FILE__, __LINE__, #ARG, (ARG), ##__VA_ARGS__)) +namespace cms { + namespace cuda { + + [[noreturn]] inline void abortOnCudaError(const char* file, + int line, + const char* cmd, + const char* error, + const char* message, + const char* description = nullptr) { + std::ostringstream out; + out << "\n"; + out << file << ", line " << line << ":\n"; + out << "cudaCheck(" << cmd << ");\n"; + out << error << ": " << message << "\n"; + if (description) + out << description << "\n"; + throw std::runtime_error(out.str()); + } + + inline bool cudaCheck_( + const char* file, int line, const char* cmd, CUresult result, const char* description = nullptr) { + if (LIKELY(result == CUDA_SUCCESS)) + return true; + + const char* error; + const char* message; + cuGetErrorName(result, &error); + cuGetErrorString(result, &message); + abortOnCudaError(file, line, cmd, error, message, description); + return false; + } + + inline bool cudaCheck_( + const char* file, int line, const char* cmd, cudaError_t result, const char* description = nullptr) { + if (LIKELY(result == cudaSuccess)) + return true; + + const char* error = cudaGetErrorName(result); + const char* message = cudaGetErrorString(result); + abortOnCudaError(file, line, cmd, error, message, description); + return false; + } + + } // namespace cuda +} // namespace cms + +#define cudaCheck(ARG, ...) 
(cms::cuda::cudaCheck_(__FILE__, __LINE__, #ARG, (ARG), ##__VA_ARGS__)) #endif // HeterogeneousCore_CUDAUtilities_cudaCheck_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/currentDevice.h b/HeterogeneousCore/CUDAUtilities/interface/currentDevice.h index 2659bcbf1d95c..06ddbe00489ec 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/currentDevice.h +++ b/HeterogeneousCore/CUDAUtilities/interface/currentDevice.h @@ -5,12 +5,14 @@ #include -namespace cudautils { - inline int currentDevice() { - int dev; - cudaCheck(cudaGetDevice(&dev)); - return dev; - } -} // namespace cudautils +namespace cms { + namespace cuda { + inline int currentDevice() { + int dev; + cudaCheck(cudaGetDevice(&dev)); + return dev; + } + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/deviceCount.h b/HeterogeneousCore/CUDAUtilities/interface/deviceCount.h index fb6b741225f8f..de9a809f993b0 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/deviceCount.h +++ b/HeterogeneousCore/CUDAUtilities/interface/deviceCount.h @@ -5,12 +5,14 @@ #include -namespace cudautils { - inline int deviceCount() { - int ndevices; - cudaCheck(cudaGetDeviceCount(&ndevices)); - return ndevices; - } -} // namespace cudautils +namespace cms { + namespace cuda { + inline int deviceCount() { + int ndevices; + cudaCheck(cudaGetDeviceCount(&ndevices)); + return ndevices; + } + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h index 76c876acbf5b3..eb86c05be465c 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h @@ -8,93 +8,95 @@ #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" -namespace cudautils { - namespace device { - namespace impl { - // Additional layer of types to distinguish from host::unique_ptr - class DeviceDeleter { - public: - DeviceDeleter() = default; // for edm::Wrapper - DeviceDeleter(int device) : device_{device} {} +namespace cms { + namespace cuda { + namespace device { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class DeviceDeleter { + public: + DeviceDeleter() = default; // for edm::Wrapper + DeviceDeleter(int device) : device_{device} {} - void operator()(void *ptr) { - if (LIKELY(device_ >= 0)) { - free_device(device_, ptr); + void operator()(void *ptr) { + if (LIKELY(device_ >= 0)) { + free_device(device_, ptr); + } } - } - private: - int device_ = -1; - }; - } // namespace impl + private: + int device_ = -1; + }; + } // namespace impl - template - using unique_ptr = std::unique_ptr; - - namespace impl { template - struct make_device_unique_selector { - using non_array = cudautils::device::unique_ptr; - }; - template - struct make_device_unique_selector { - using unbounded_array = cudautils::device::unique_ptr; - }; - template - struct make_device_unique_selector { - struct bounded_array {}; - }; - } // namespace impl - } // namespace device + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_device_unique_selector { + using non_array = cms::cuda::device::unique_ptr; + }; + template + struct make_device_unique_selector { + using unbounded_array = cms::cuda::device::unique_ptr; + }; + template + struct make_device_unique_selector { + struct bounded_array {}; + }; + } // 
namespace impl + } // namespace device - template - typename device::impl::make_device_unique_selector::non_array make_device_unique(cudaStream_t stream) { - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the device memory is not supported"); - int dev = cudautils::currentDevice(); - void *mem = cudautils::allocate_device(dev, sizeof(T), stream); - return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), - device::impl::DeviceDeleter{dev}}; - } + template + typename device::impl::make_device_unique_selector::non_array make_device_unique(cudaStream_t stream) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the device memory is not supported"); + int dev = currentDevice(); + void *mem = allocate_device(dev, sizeof(T), stream); + return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), + device::impl::DeviceDeleter{dev}}; + } - template - typename device::impl::make_device_unique_selector::unbounded_array make_device_unique(size_t n, - cudaStream_t stream) { - using element_type = typename std::remove_extent::type; - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the device memory is not supported"); - int dev = cudautils::currentDevice(); - void *mem = cudautils::allocate_device(dev, n * sizeof(element_type), stream); - return typename device::impl::make_device_unique_selector::unbounded_array{reinterpret_cast(mem), - device::impl::DeviceDeleter{dev}}; - } + template + typename device::impl::make_device_unique_selector::unbounded_array make_device_unique(size_t n, + cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the device memory is not supported"); + int dev = currentDevice(); + void *mem = allocate_device(dev, n * sizeof(element_type), stream); + return typename device::impl::make_device_unique_selector::unbounded_array{ + reinterpret_cast(mem), device::impl::DeviceDeleter{dev}}; + } - template - typename device::impl::make_device_unique_selector::bounded_array make_device_unique(Args &&...) = delete; + template + typename device::impl::make_device_unique_selector::bounded_array make_device_unique(Args &&...) 
= delete; - // No check for the trivial constructor, make it clear in the interface - template - typename device::impl::make_device_unique_selector::non_array make_device_unique_uninitialized( - cudaStream_t stream) { - int dev = cudautils::currentDevice(); - void *mem = cudautils::allocate_device(dev, sizeof(T), stream); - return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), - device::impl::DeviceDeleter{dev}}; - } + // No check for the trivial constructor, make it clear in the interface + template + typename device::impl::make_device_unique_selector::non_array make_device_unique_uninitialized( + cudaStream_t stream) { + int dev = currentDevice(); + void *mem = allocate_device(dev, sizeof(T), stream); + return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), + device::impl::DeviceDeleter{dev}}; + } - template - typename device::impl::make_device_unique_selector::unbounded_array make_device_unique_uninitialized( - size_t n, cudaStream_t stream) { - using element_type = typename std::remove_extent::type; - int dev = cudautils::currentDevice(); - void *mem = cudautils::allocate_device(dev, n * sizeof(element_type), stream); - return typename device::impl::make_device_unique_selector::unbounded_array{reinterpret_cast(mem), - device::impl::DeviceDeleter{dev}}; - } + template + typename device::impl::make_device_unique_selector::unbounded_array make_device_unique_uninitialized( + size_t n, cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + int dev = currentDevice(); + void *mem = allocate_device(dev, n * sizeof(element_type), stream); + return typename device::impl::make_device_unique_selector::unbounded_array{ + reinterpret_cast(mem), device::impl::DeviceDeleter{dev}}; + } - template - typename device::impl::make_device_unique_selector::bounded_array make_device_unique_uninitialized(Args &&...) = - delete; -} // namespace cudautils + template + typename device::impl::make_device_unique_selector::bounded_array make_device_unique_uninitialized(Args &&...) = + delete; + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h b/HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h index ef05d9cfbd951..6dfb04ac9e10d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h +++ b/HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h @@ -5,8 +5,9 @@ #include -namespace cudautils { - /** +namespace cms { + namespace cuda { + /** * Returns true if the work captured by the event (=queued to the * CUDA stream at the point of cudaEventRecord()) has completed. * @@ -14,17 +15,18 @@ namespace cudautils { * * In case of errors, throws an exception. 
*/ - inline bool eventWorkHasCompleted(cudaEvent_t event) { - const auto ret = cudaEventQuery(event); - if (ret == cudaSuccess) { - return true; - } else if (ret == cudaErrorNotReady) { - return false; + inline bool eventWorkHasCompleted(cudaEvent_t event) { + const auto ret = cudaEventQuery(event); + if (ret == cudaSuccess) { + return true; + } else if (ret == cudaErrorNotReady) { + return false; + } + // leave error case handling to cudaCheck + cudaCheck(ret); + return false; // to keep compiler happy } - // leave error case handling to cudaCheck - cudaCheck(ret); - return false; // to keep compiler happy - } -} // namespace cudautils + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h index 4e62ab9e8d4c8..009f4b859fb7d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h @@ -7,67 +7,68 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -namespace cudautils { - namespace host { - namespace noncached { - namespace impl { - // Additional layer of types to distinguish from host::unique_ptr - class HostDeleter { - public: - void operator()(void *ptr) { cudaCheck(cudaFreeHost(ptr)); } - }; - } // namespace impl +namespace cms { + namespace cuda { + namespace host { + namespace noncached { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class HostDeleter { + public: + void operator()(void *ptr) { cudaCheck(cudaFreeHost(ptr)); } + }; + } // namespace impl - template - using unique_ptr = std::unique_ptr; - - namespace impl { - template - struct make_host_unique_selector { - using non_array = cudautils::host::noncached::unique_ptr; - }; template - struct make_host_unique_selector { - using unbounded_array = cudautils::host::noncached::unique_ptr; - }; - template - struct make_host_unique_selector { - struct bounded_array {}; - }; - } // namespace impl - } // namespace noncached - } // namespace host + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_host_unique_selector { + using non_array = cms::cuda::host::noncached::unique_ptr; + }; + template + struct make_host_unique_selector { + using unbounded_array = cms::cuda::host::noncached::unique_ptr; + }; + template + struct make_host_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace noncached + } // namespace host - /** + /** * The difference wrt. make_host_unique is that these * do not cache, so they should not be called per-event. 
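In line with the comment above, a hedged sketch of the intended call pattern (illustrative only; the element type, buffer size and cudaHostAllocWriteCombined flag are arbitrary choices here, not taken from the patch):

    #include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"

    // allocate once, e.g. in a module constructor, and reuse for every event;
    // these allocations bypass the caching allocator, so per-event calls are costly
    cms::cuda::host::noncached::unique_ptr<float[]> hostData =
        cms::cuda::make_host_noncached_unique<float[]>(64, cudaHostAllocWriteCombined);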
*/ - template - typename host::noncached::impl::make_host_unique_selector::non_array make_host_noncached_unique( - unsigned int flags = cudaHostAllocDefault) { - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); - void *mem; - cudaCheck(cudaHostAlloc(&mem, sizeof(T), flags)); - return - typename cudautils::host::noncached::impl::make_host_unique_selector::non_array(reinterpret_cast(mem)); - } + template + typename host::noncached::impl::make_host_unique_selector::non_array make_host_noncached_unique( + unsigned int flags = cudaHostAllocDefault) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem; + cudaCheck(cudaHostAlloc(&mem, sizeof(T), flags)); + return typename host::noncached::impl::make_host_unique_selector::non_array(reinterpret_cast(mem)); + } - template - typename host::noncached::impl::make_host_unique_selector::unbounded_array make_host_noncached_unique( - size_t n, unsigned int flags = cudaHostAllocDefault) { - using element_type = typename std::remove_extent::type; - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); - void *mem; - cudaCheck(cudaHostAlloc(&mem, n * sizeof(element_type), flags)); - return typename cudautils::host::noncached::impl::make_host_unique_selector::unbounded_array( - reinterpret_cast(mem)); - } + template + typename host::noncached::impl::make_host_unique_selector::unbounded_array make_host_noncached_unique( + size_t n, unsigned int flags = cudaHostAllocDefault) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem; + cudaCheck(cudaHostAlloc(&mem, n * sizeof(element_type), flags)); + return typename host::noncached::impl::make_host_unique_selector::unbounded_array( + reinterpret_cast(mem)); + } - template - typename cudautils::host::noncached::impl::make_host_unique_selector::bounded_array make_host_noncached_unique( - Args &&...) = delete; -} // namespace cudautils + template + typename host::noncached::impl::make_host_unique_selector::bounded_array make_host_noncached_unique(Args &&...) 
= + delete; + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h index a4405ebf7fd75..a64b7c9869d6c 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h +++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h @@ -6,73 +6,75 @@ #include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h" -namespace cudautils { - namespace host { - namespace impl { - // Additional layer of types to distinguish from host::unique_ptr - class HostDeleter { - public: - void operator()(void *ptr) { cudautils::free_host(ptr); } - }; - } // namespace impl +namespace cms { + namespace cuda { + namespace host { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class HostDeleter { + public: + void operator()(void *ptr) { cms::cuda::free_host(ptr); } + }; + } // namespace impl - template - using unique_ptr = std::unique_ptr; - - namespace impl { template - struct make_host_unique_selector { - using non_array = cudautils::host::unique_ptr; - }; - template - struct make_host_unique_selector { - using unbounded_array = cudautils::host::unique_ptr; - }; - template - struct make_host_unique_selector { - struct bounded_array {}; - }; - } // namespace impl - } // namespace host + using unique_ptr = std::unique_ptr; + + namespace impl { + template + struct make_host_unique_selector { + using non_array = cms::cuda::host::unique_ptr; + }; + template + struct make_host_unique_selector { + using unbounded_array = cms::cuda::host::unique_ptr; + }; + template + struct make_host_unique_selector { + struct bounded_array {}; + }; + } // namespace impl + } // namespace host - // Allocate pinned host memory - template - typename host::impl::make_host_unique_selector::non_array make_host_unique(cudaStream_t stream) { - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); - void *mem = allocate_host(sizeof(T), stream); - return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; - } + // Allocate pinned host memory + template + typename host::impl::make_host_unique_selector::non_array make_host_unique(cudaStream_t stream) { + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem = allocate_host(sizeof(T), stream); + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + } - template - typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n, cudaStream_t stream) { - using element_type = typename std::remove_extent::type; - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); - void *mem = allocate_host(n * sizeof(element_type), stream); - return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; - } + template + typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n, cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + static_assert(std::is_trivially_constructible::value, + "Allocating with non-trivial constructor on the pinned host memory is not supported"); + void *mem = allocate_host(n * sizeof(element_type), stream); + return typename 
host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + } - template - typename host::impl::make_host_unique_selector::bounded_array make_host_unique(Args &&...) = delete; + template + typename host::impl::make_host_unique_selector::bounded_array make_host_unique(Args &&...) = delete; - // No check for the trivial constructor, make it clear in the interface - template - typename host::impl::make_host_unique_selector::non_array make_host_unique_uninitialized(cudaStream_t stream) { - void *mem = allocate_host(sizeof(T), stream); - return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; - } + // No check for the trivial constructor, make it clear in the interface + template + typename host::impl::make_host_unique_selector::non_array make_host_unique_uninitialized(cudaStream_t stream) { + void *mem = allocate_host(sizeof(T), stream); + return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; + } - template - typename host::impl::make_host_unique_selector::unbounded_array make_host_unique_uninitialized( - size_t n, cudaStream_t stream) { - using element_type = typename std::remove_extent::type; - void *mem = allocate_host(n * sizeof(element_type), stream); - return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; - } + template + typename host::impl::make_host_unique_selector::unbounded_array make_host_unique_uninitialized( + size_t n, cudaStream_t stream) { + using element_type = typename std::remove_extent::type; + void *mem = allocate_host(n * sizeof(element_type), stream); + return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; + } - template - typename host::impl::make_host_unique_selector::bounded_array make_host_unique_uninitialized(Args &&...) = delete; -} // namespace cudautils + template + typename host::impl::make_host_unique_selector::bounded_array make_host_unique_uninitialized(Args &&...) = delete; + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/launch.h b/HeterogeneousCore/CUDAUtilities/interface/launch.h index f3c8240224c0f..60d6e5d844fea 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/launch.h +++ b/HeterogeneousCore/CUDAUtilities/interface/launch.h @@ -8,7 +8,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" /* - * `cudautils::launch` and `cudautils::launch_cooperative` are wrappers around + * `cms::cuda::launch` and `cms::cuda::launch_cooperative` are wrappers around * the CUDA Runtime API calls to setup and call a CUDA kernel from the host. * * `kernel` should be a pointer to a __global__ void(...) function. @@ -23,8 +23,8 @@ * the exact type. * * Unlike the `kernel<<<...>>>(...)` syntax and the `cuda::launch(...)` - * implementation from the CUDA API Wrappers, `cudautils::launch(...)` and - * `cudautils::launch_cooperative` can be called from standard C++ host code. + * implementation from the CUDA API Wrappers, `cms::cuda::launch(...)` and + * `cms::cuda::launch_cooperative` can be called from standard C++ host code. * * Possible optimisations * @@ -42,104 +42,106 @@ * kernel signature. 
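A hedged sketch of calling these wrappers from host code (illustrative only; vectorAdd stands in for any __global__ kernel and its signature is assumed here, not taken from the patch):

    #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

    __global__ void vectorAdd(const float* a, const float* b, float* c, int numElements);

    void runVectorAdd(const float* d_a, const float* d_b, float* d_c, int n, cudaStream_t stream) {
      // grid size, block size, optional dynamic shared memory and CUDA stream
      cms::cuda::LaunchParameters config{(n + 255) / 256, 256, 0, stream};
      // host-side wrapper around cudaLaunchKernel(); the arguments must match the kernel signature
      cms::cuda::launch(vectorAdd, config, d_a, d_b, d_c, n);
    }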
*/ -namespace cudautils { +namespace cms { + namespace cuda { - struct LaunchParameters { - dim3 gridDim; - dim3 blockDim; - size_t sharedMem; - cudaStream_t stream; + struct LaunchParameters { + dim3 gridDim; + dim3 blockDim; + size_t sharedMem; + cudaStream_t stream; - LaunchParameters(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr) - : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {} + LaunchParameters(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr) + : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {} - LaunchParameters(int gridDim, int blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr) - : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {} - }; + LaunchParameters(int gridDim, int blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr) + : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {} + }; - namespace detail { + namespace detail { - template - struct kernel_traits; + template + struct kernel_traits; - template - struct kernel_traits { - static constexpr size_t arguments_size = sizeof...(Args); + template + struct kernel_traits { + static constexpr size_t arguments_size = sizeof...(Args); - using argument_type_tuple = std::tuple; + using argument_type_tuple = std::tuple; - template - using argument_type = typename std::tuple_element::type; - }; + template + using argument_type = typename std::tuple_element::type; + }; - // fill an array with the pointers to the elements of a tuple - template - struct pointer_setter { - template - void operator()(void const* ptrs[], Tuple const& t) { - pointer_setter()(ptrs, t); - ptrs[I - 1] = &std::get(t); - } - }; + // fill an array with the pointers to the elements of a tuple + template + struct pointer_setter { + template + void operator()(void const* ptrs[], Tuple const& t) { + pointer_setter()(ptrs, t); + ptrs[I - 1] = &std::get(t); + } + }; - template <> - struct pointer_setter<0> { - template - void operator()(void const* ptrs[], Tuple const& t) {} - }; + template <> + struct pointer_setter<0> { + template + void operator()(void const* ptrs[], Tuple const& t) {} + }; - } // namespace detail + } // namespace detail - // wrappers for cudaLaunchKernel + // wrappers for cudaLaunchKernel - inline void launch(void (*kernel)(), LaunchParameters config) { - cudaCheck(cudaLaunchKernel( - (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); - } + inline void launch(void (*kernel)(), LaunchParameters config) { + cudaCheck(cudaLaunchKernel( + (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); + } - template + template #if __cplusplus >= 201703L - std::enable_if_t::value> + std::enable_if_t::value> #else - std::enable_if_t >::value> + std::enable_if_t >::value> #endif - launch(F* kernel, LaunchParameters config, Args&&... args) { - using function_type = detail::kernel_traits; - typename function_type::argument_type_tuple args_copy(args...); + launch(F* kernel, LaunchParameters config, Args&&... 
args) { + using function_type = detail::kernel_traits; + typename function_type::argument_type_tuple args_copy(args...); - constexpr auto size = function_type::arguments_size; - void const* pointers[size]; + constexpr auto size = function_type::arguments_size; + void const* pointers[size]; - detail::pointer_setter()(pointers, args_copy); - cudaCheck(cudaLaunchKernel( - (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); - } + detail::pointer_setter()(pointers, args_copy); + cudaCheck(cudaLaunchKernel( + (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); + } - // wrappers for cudaLaunchCooperativeKernel + // wrappers for cudaLaunchCooperativeKernel - inline void launch_cooperative(void (*kernel)(), LaunchParameters config) { - cudaCheck(cudaLaunchCooperativeKernel( - (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); - } + inline void launch_cooperative(void (*kernel)(), LaunchParameters config) { + cudaCheck(cudaLaunchCooperativeKernel( + (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream)); + } - template + template #if __cplusplus >= 201703L - std::enable_if_t::value> + std::enable_if_t::value> #else - std::enable_if_t >::value> + std::enable_if_t >::value> #endif - launch_cooperative(F* kernel, LaunchParameters config, Args&&... args) { - using function_type = detail::kernel_traits; - typename function_type::argument_type_tuple args_copy(args...); + launch_cooperative(F* kernel, LaunchParameters config, Args&&... args) { + using function_type = detail::kernel_traits; + typename function_type::argument_type_tuple args_copy(args...); - constexpr auto size = function_type::arguments_size; - void const* pointers[size]; + constexpr auto size = function_type::arguments_size; + void const* pointers[size]; - detail::pointer_setter()(pointers, args_copy); - cudaCheck(cudaLaunchCooperativeKernel( - (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); - } + detail::pointer_setter()(pointers, args_copy); + cudaCheck(cudaLaunchCooperativeKernel( + (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream)); + } -} // namespace cudautils + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_launch_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index 203aac78a165c..3a4dbe3d1f6de 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -6,25 +6,28 @@ #include -namespace cudautils { - template - inline void memsetAsync(cudautils::device::unique_ptr& ptr, T value, cudaStream_t stream) { - // Shouldn't compile for array types because of sizeof(T), but - // let's add an assert with a more helpful message - static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); - cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); - } +namespace cms { + namespace cuda { + template + inline void memsetAsync(device::unique_ptr& ptr, T value, cudaStream_t stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, + "For array types, use the other overload with 
the size parameter"); + cudaCheck(cudaMemsetAsync(ptr.get(), value, sizeof(T), stream)); + } - /** + /** * The type of `value` is `int` because of `cudaMemsetAsync()` takes * it as an `int`. Note that `cudaMemsetAsync()` sets the value of * each **byte** to `value`. This may lead to unexpected results if * `sizeof(T) > 1` and `value != 0`. */ - template - inline void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { - cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); - } -} // namespace cudautils + template + inline void memsetAsync(device::unique_ptr& ptr, int value, size_t nelements, cudaStream_t stream) { + cudaCheck(cudaMemsetAsync(ptr.get(), value, nelements * sizeof(T), stream)); + } + } // namespace cuda +} // namespace cms #endif diff --git a/HeterogeneousCore/CUDAUtilities/src/EventCache.cc b/HeterogeneousCore/CUDAUtilities/src/EventCache.cc index 0a5474d7f4aa0..a80cfdd412ec5 100644 --- a/HeterogeneousCore/CUDAUtilities/src/EventCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/EventCache.cc @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" -namespace cudautils { +namespace cms::cuda { void EventCache::Deleter::operator()(cudaEvent_t event) const { if (device_ != -1) { ScopedSetDevice deviceGuard{device_}; @@ -16,10 +16,10 @@ namespace cudautils { // EventCache should be constructed by the first call to // getEventCache() only if we have CUDA devices present - EventCache::EventCache() : cache_(cudautils::deviceCount()) {} + EventCache::EventCache() : cache_(deviceCount()) {} SharedEventPtr EventCache::get() { - const auto dev = cudautils::currentDevice(); + const auto dev = currentDevice(); auto event = makeOrGet(dev); // captured work has completed, or a just-created event if (eventWorkHasCompleted(event.get())) { @@ -58,7 +58,7 @@ namespace cudautils { // EventCache lives through multiple tests (and go through // multiple shutdowns of the framework). 
cache_.clear(); - cache_.resize(cudautils::deviceCount()); + cache_.resize(deviceCount()); } EventCache& getEventCache() { @@ -66,4 +66,4 @@ namespace cudautils { CMS_THREAD_SAFE static EventCache cache; return cache; } -} // namespace cudautils +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDAUtilities/src/MessageLogger.cc b/HeterogeneousCore/CUDAUtilities/src/MessageLogger.cc index ed1faf0d91e46..ffd9c1ca4f613 100644 --- a/HeterogeneousCore/CUDAUtilities/src/MessageLogger.cc +++ b/HeterogeneousCore/CUDAUtilities/src/MessageLogger.cc @@ -1,7 +1,7 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h" -namespace cudautils { +namespace cms::cuda { LogSystem::~LogSystem() { edm::LogSystem(category_) << message_.str(); } @@ -21,4 +21,4 @@ namespace cudautils { LogVerbatim::~LogVerbatim() { edm::LogVerbatim(category_) << message_.str(); } -} // namespace cudautils +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDAUtilities/src/StreamCache.cc b/HeterogeneousCore/CUDAUtilities/src/StreamCache.cc index 06a908b1cec1d..0f39a6ea720da 100644 --- a/HeterogeneousCore/CUDAUtilities/src/StreamCache.cc +++ b/HeterogeneousCore/CUDAUtilities/src/StreamCache.cc @@ -5,7 +5,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" #include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" -namespace cudautils { +namespace cms::cuda { void StreamCache::Deleter::operator()(cudaStream_t stream) const { if (device_ != -1) { ScopedSetDevice deviceGuard{device_}; @@ -15,10 +15,10 @@ namespace cudautils { // StreamCache should be constructed by the first call to // getStreamCache() only if we have CUDA devices present - StreamCache::StreamCache() : cache_(cudautils::deviceCount()) {} + StreamCache::StreamCache() : cache_(deviceCount()) {} SharedStreamPtr StreamCache::get() { - const auto dev = cudautils::currentDevice(); + const auto dev = currentDevice(); return cache_[dev].makeOrGet([dev]() { cudaStream_t stream; cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -33,7 +33,7 @@ namespace cudautils { // StreamCache lives through multiple tests (and go through // multiple shutdowns of the framework). 
cache_.clear(); - cache_.resize(cudautils::deviceCount()); + cache_.resize(deviceCount()); } StreamCache& getStreamCache() { @@ -41,4 +41,4 @@ namespace cudautils { CMS_THREAD_SAFE static StreamCache cache; return cache; } -} // namespace cudautils +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc index 2e7b7151dea38..c3a33fcee3553 100644 --- a/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc +++ b/HeterogeneousCore/CUDAUtilities/src/allocate_device.cc @@ -9,18 +9,18 @@ namespace { const size_t maxAllocationSize = - notcub::CachingDeviceAllocator::IntPow(cudautils::allocator::binGrowth, cudautils::allocator::maxBin); + notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); } -namespace cudautils { +namespace cms::cuda { void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) { void *ptr = nullptr; - if constexpr (cudautils::allocator::useCaching) { + if constexpr (allocator::useCaching) { if (UNLIKELY(nbytes > maxAllocationSize)) { throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); } - cudaCheck(cudautils::allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream)); + cudaCheck(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream)); } else { ScopedSetDevice setDeviceForThisScope(dev); cudaCheck(cudaMalloc(&ptr, nbytes)); @@ -29,12 +29,12 @@ namespace cudautils { } void free_device(int device, void *ptr) { - if constexpr (cudautils::allocator::useCaching) { - cudaCheck(cudautils::allocator::getCachingDeviceAllocator().DeviceFree(device, ptr)); + if constexpr (allocator::useCaching) { + cudaCheck(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr)); } else { ScopedSetDevice setDeviceForThisScope(device); cudaCheck(cudaFree(ptr)); } } -} // namespace cudautils +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc b/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc index 265b18671e654..1e8c8f9cd33c5 100644 --- a/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc +++ b/HeterogeneousCore/CUDAUtilities/src/allocate_host.cc @@ -8,18 +8,18 @@ namespace { const size_t maxAllocationSize = - notcub::CachingDeviceAllocator::IntPow(cudautils::allocator::binGrowth, cudautils::allocator::maxBin); + notcub::CachingDeviceAllocator::IntPow(cms::cuda::allocator::binGrowth, cms::cuda::allocator::maxBin); } -namespace cudautils { +namespace cms::cuda { void *allocate_host(size_t nbytes, cudaStream_t stream) { void *ptr = nullptr; - if constexpr (cudautils::allocator::useCaching) { + if constexpr (allocator::useCaching) { if (UNLIKELY(nbytes > maxAllocationSize)) { throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) + " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize)); } - cudaCheck(cudautils::allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream)); + cudaCheck(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream)); } else { cudaCheck(cudaMallocHost(&ptr, nbytes)); } @@ -27,11 +27,11 @@ namespace cudautils { } void free_host(void *ptr) { - if constexpr (cudautils::allocator::useCaching) { - cudaCheck(cudautils::allocator::getCachingHostAllocator().HostFree(ptr)); + if constexpr (allocator::useCaching) { + cudaCheck(allocator::getCachingHostAllocator().HostFree(ptr)); } 
else { cudaCheck(cudaFreeHost(ptr)); } } -} // namespace cudautils +} // namespace cms::cuda diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h index ad329b1168ec7..8158f414b07d4 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h @@ -9,72 +9,70 @@ #include -namespace cudautils { - namespace allocator { - // Use caching or not - constexpr bool useCaching = true; - // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 8; - // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 1; - // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 10; - // Total storage for the allocator. 0 means no limit. - constexpr size_t maxCachedBytes = 0; - // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. - constexpr double maxCachedFraction = 0.8; - constexpr bool debug = false; +namespace cms::cuda::allocator { + // Use caching or not + constexpr bool useCaching = true; + // Growth factor (bin_growth in cub::CachingDeviceAllocator) + constexpr unsigned int binGrowth = 8; + // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator) + constexpr unsigned int minBin = 1; + // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. + constexpr unsigned int maxBin = 10; + // Total storage for the allocator. 0 means no limit. + constexpr size_t maxCachedBytes = 0; + // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smaller of the two limits is taken.
+ constexpr double maxCachedFraction = 0.8; + constexpr bool debug = false; - inline size_t minCachedBytes() { - size_t ret = std::numeric_limits::max(); - int currentDevice; - cudaCheck(cudaGetDevice(¤tDevice)); - const int numberOfDevices = deviceCount(); - for (int i = 0; i < numberOfDevices; ++i) { - size_t freeMemory, totalMemory; - cudaCheck(cudaSetDevice(i)); - cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); - ret = std::min(ret, static_cast(maxCachedFraction * freeMemory)); - } - cudaCheck(cudaSetDevice(currentDevice)); - if (maxCachedBytes > 0) { - ret = std::min(ret, maxCachedBytes); - } - return ret; + inline size_t minCachedBytes() { + size_t ret = std::numeric_limits::max(); + int currentDevice; + cudaCheck(cudaGetDevice(¤tDevice)); + const int numberOfDevices = deviceCount(); + for (int i = 0; i < numberOfDevices; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + ret = std::min(ret, static_cast(maxCachedFraction * freeMemory)); + } + cudaCheck(cudaSetDevice(currentDevice)); + if (maxCachedBytes > 0) { + ret = std::min(ret, maxCachedBytes); } + return ret; + } - inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { - LogDebug("CachingDeviceAllocator").log([](auto& log) { - log << "cub::CachingDeviceAllocator settings\n" - << " bin growth " << binGrowth << "\n" - << " min bin " << minBin << "\n" - << " max bin " << maxBin << "\n" - << " resulting bins:\n"; - for (auto bin = minBin; bin <= maxBin; ++bin) { - auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); - if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { - log << " " << std::setw(8) << (binSize >> 30) << " GB\n"; - } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { - log << " " << std::setw(8) << (binSize >> 20) << " MB\n"; - } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { - log << " " << std::setw(8) << (binSize >> 10) << " kB\n"; - } else { - log << " " << std::setw(9) << binSize << " B\n"; - } + inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { + LogDebug("CachingDeviceAllocator").log([](auto& log) { + log << "cub::CachingDeviceAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + log << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + log << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + log << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + log << " " << std::setw(9) << binSize << " B\n"; } - log << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; - }); + } + log << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + }); - // the public interface is thread safe - CMS_THREAD_SAFE static notcub::CachingDeviceAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; - return allocator; - } - } // namespace allocator -} // namespace cudautils + // the public interface is thread safe + CMS_THREAD_SAFE static notcub::CachingDeviceAllocator allocator{binGrowth, + minBin, + maxBin, + minCachedBytes(), + 
false, // do not skip cleanup + debug}; + return allocator; + } +} // namespace cms::cuda::allocator #endif diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h index b9e31a78176f9..6e74648696dd8 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h @@ -10,40 +10,38 @@ #include -namespace cudautils { - namespace allocator { - inline notcub::CachingHostAllocator& getCachingHostAllocator() { - LogDebug("CachingHostAllocator").log([](auto& log) { - log << "cub::CachingHostAllocator settings\n" - << " bin growth " << binGrowth << "\n" - << " min bin " << minBin << "\n" - << " max bin " << maxBin << "\n" - << " resulting bins:\n"; - for (auto bin = minBin; bin <= maxBin; ++bin) { - auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); - if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { - log << " " << std::setw(8) << (binSize >> 30) << " GB\n"; - } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { - log << " " << std::setw(8) << (binSize >> 20) << " MB\n"; - } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { - log << " " << std::setw(8) << (binSize >> 10) << " kB\n"; - } else { - log << " " << std::setw(9) << binSize << " B\n"; - } +namespace cms::cuda::allocator { + inline notcub::CachingHostAllocator& getCachingHostAllocator() { + LogDebug("CachingHostAllocator").log([](auto& log) { + log << "cub::CachingHostAllocator settings\n" + << " bin growth " << binGrowth << "\n" + << " min bin " << minBin << "\n" + << " max bin " << maxBin << "\n" + << " resulting bins:\n"; + for (auto bin = minBin; bin <= maxBin; ++bin) { + auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin); + if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) { + log << " " << std::setw(8) << (binSize >> 30) << " GB\n"; + } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) { + log << " " << std::setw(8) << (binSize >> 20) << " MB\n"; + } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) { + log << " " << std::setw(8) << (binSize >> 10) << " kB\n"; + } else { + log << " " << std::setw(9) << binSize << " B\n"; } - log << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; - }); + } + log << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n"; + }); - // the public interface is thread safe - CMS_THREAD_SAFE static notcub::CachingHostAllocator allocator{binGrowth, - minBin, - maxBin, - minCachedBytes(), - false, // do not skip cleanup - debug}; - return allocator; - } - } // namespace allocator -} // namespace cudautils + // the public interface is thread safe + CMS_THREAD_SAFE static notcub::CachingHostAllocator allocator{binGrowth, + minBin, + maxBin, + minCachedBytes(), + false, // do not skip cleanup + debug}; + return allocator; + } +} // namespace cms::cuda::allocator #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index 3dbf853ce43a6..24a114db68ce1 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -16,13 +16,13 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Host to device") { SECTION("Single element") { - auto host_orig = cudautils::make_host_unique(stream); + auto host_orig = cms::cuda::make_host_unique(stream); *host_orig = 42; - auto device = 
cudautils::make_device_unique(stream); - auto host = cudautils::make_host_unique(stream); + auto device = cms::cuda::make_device_unique(stream); + auto host = cms::cuda::make_host_unique(stream); - cudautils::copyAsync(device, host_orig, stream); + cms::cuda::copyAsync(device, host_orig, stream); cudaCheck(cudaMemcpyAsync(host.get(), device.get(), sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); @@ -32,16 +32,16 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Multiple elements") { constexpr int N = 100; - auto host_orig = cudautils::make_host_unique(N, stream); + auto host_orig = cms::cuda::make_host_unique(N, stream); for (int i = 0; i < N; ++i) { host_orig[i] = i; } - auto device = cudautils::make_device_unique(N, stream); - auto host = cudautils::make_host_unique(N, stream); + auto device = cms::cuda::make_device_unique(N, stream); + auto host = cms::cuda::make_host_unique(N, stream); SECTION("Copy all") { - cudautils::copyAsync(device, host_orig, N, stream); + cms::cuda::copyAsync(device, host_orig, N, stream); cudaCheck(cudaMemcpyAsync(host.get(), device.get(), N * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { @@ -54,7 +54,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { } SECTION("Copy some") { - cudautils::copyAsync(device, host_orig, 42, stream); + cms::cuda::copyAsync(device, host_orig, 42, stream); cudaCheck(cudaMemcpyAsync(host.get(), device.get(), 42 * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { @@ -66,14 +66,14 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Device to host") { SECTION("Single element") { - auto host_orig = cudautils::make_host_unique(stream); + auto host_orig = cms::cuda::make_host_unique(stream); *host_orig = 42; - auto device = cudautils::make_device_unique(stream); - auto host = cudautils::make_host_unique(stream); + auto device = cms::cuda::make_device_unique(stream); + auto host = cms::cuda::make_host_unique(stream); cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), sizeof(int), cudaMemcpyHostToDevice, stream)); - cudautils::copyAsync(host, device, stream); + cms::cuda::copyAsync(host, device, stream); cudaCheck(cudaStreamSynchronize(stream)); REQUIRE(*host == 42); @@ -82,17 +82,17 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Multiple elements") { constexpr int N = 100; - auto host_orig = cudautils::make_host_unique(N, stream); + auto host_orig = cms::cuda::make_host_unique(N, stream); for (int i = 0; i < N; ++i) { host_orig[i] = i; } - auto device = cudautils::make_device_unique(N, stream); - auto host = cudautils::make_host_unique(N, stream); + auto device = cms::cuda::make_device_unique(N, stream); + auto host = cms::cuda::make_host_unique(N, stream); SECTION("Copy all") { cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), N * sizeof(int), cudaMemcpyHostToDevice, stream)); - cudautils::copyAsync(host, device, N, stream); + cms::cuda::copyAsync(host, device, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { CHECK(host[i] == i); @@ -105,7 +105,7 @@ TEST_CASE("copyAsync", "[cudaMemTools]") { SECTION("Copy some") { cudaCheck(cudaMemcpyAsync(device.get(), host_orig.get(), 42 * sizeof(int), cudaMemcpyHostToDevice, stream)); - cudautils::copyAsync(host, device, 42, stream); + cms::cuda::copyAsync(host, device, 42, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < 42; ++i) { 
CHECK(host[i] == 200 + i); diff --git a/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp index 27c5bd3b23f3d..4c8412f2cecc2 100644 --- a/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/device_unique_ptr_t.cpp @@ -13,12 +13,12 @@ TEST_CASE("device_unique_ptr", "[cudaMemTools]") { cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); SECTION("Single element") { - auto ptr = cudautils::make_device_unique(stream); + auto ptr = cms::cuda::make_device_unique(stream); REQUIRE(ptr != nullptr); } SECTION("Reset") { - auto ptr = cudautils::make_device_unique(stream); + auto ptr = cms::cuda::make_device_unique(stream); REQUIRE(ptr != nullptr); cudaCheck(cudaStreamSynchronize(stream)); @@ -27,7 +27,7 @@ TEST_CASE("device_unique_ptr", "[cudaMemTools]") { } SECTION("Multiple elements") { - auto ptr = cudautils::make_device_unique(10, stream); + auto ptr = cms::cuda::make_device_unique(10, stream); REQUIRE(ptr != nullptr); cudaCheck(cudaStreamSynchronize(stream)); @@ -37,9 +37,9 @@ TEST_CASE("device_unique_ptr", "[cudaMemTools]") { SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 - auto ptr = cudautils::make_device_unique(maxSize, stream); + auto ptr = cms::cuda::make_device_unique(maxSize, stream); ptr.reset(); - REQUIRE_THROWS(ptr = cudautils::make_device_unique(maxSize + 1, stream)); + REQUIRE_THROWS(ptr = cms::cuda::make_device_unique(maxSize + 1, stream)); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp index 5111936a07c90..62dcaaf1eba9e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp @@ -9,16 +9,16 @@ TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") { } SECTION("Single element") { - auto ptr1 = cudautils::make_host_noncached_unique(); + auto ptr1 = cms::cuda::make_host_noncached_unique(); REQUIRE(ptr1 != nullptr); - auto ptr2 = cudautils::make_host_noncached_unique(cudaHostAllocPortable | cudaHostAllocWriteCombined); + auto ptr2 = cms::cuda::make_host_noncached_unique(cudaHostAllocPortable | cudaHostAllocWriteCombined); REQUIRE(ptr2 != nullptr); } SECTION("Multiple elements") { - auto ptr1 = cudautils::make_host_noncached_unique(10); + auto ptr1 = cms::cuda::make_host_noncached_unique(10); REQUIRE(ptr1 != nullptr); - auto ptr2 = cudautils::make_host_noncached_unique(10, cudaHostAllocPortable | cudaHostAllocWriteCombined); + auto ptr2 = cms::cuda::make_host_noncached_unique(10, cudaHostAllocPortable | cudaHostAllocWriteCombined); REQUIRE(ptr2 != nullptr); } } diff --git a/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp index 230ad48945d69..8ab2bca86106b 100644 --- a/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/host_unique_ptr_t.cpp @@ -13,12 +13,12 @@ TEST_CASE("host_unique_ptr", "[cudaMemTools]") { cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); SECTION("Single element") { - auto ptr = cudautils::make_host_unique(stream); + auto ptr = cms::cuda::make_host_unique(stream); REQUIRE(ptr != nullptr); } SECTION("Reset") { - auto ptr = cudautils::make_host_unique(stream); + auto ptr = cms::cuda::make_host_unique(stream); 
REQUIRE(ptr != nullptr); cudaCheck(cudaStreamSynchronize(stream)); @@ -27,7 +27,7 @@ TEST_CASE("host_unique_ptr", "[cudaMemTools]") { } SECTION("Multiple elements") { - auto ptr = cudautils::make_host_unique(10, stream); + auto ptr = cms::cuda::make_host_unique(10, stream); REQUIRE(ptr != nullptr); cudaCheck(cudaStreamSynchronize(stream)); @@ -37,9 +37,9 @@ TEST_CASE("host_unique_ptr", "[cudaMemTools]") { SECTION("Allocating too much") { constexpr size_t maxSize = 1 << 30; // 8**10 - auto ptr = cudautils::make_host_unique(maxSize, stream); + auto ptr = cms::cuda::make_host_unique(maxSize, stream); ptr.reset(); - REQUIRE_THROWS(ptr = cudautils::make_host_unique(maxSize + 1, stream)); + REQUIRE_THROWS(ptr = cms::cuda::make_host_unique(maxSize + 1, stream)); } cudaCheck(cudaStreamDestroy(stream)); diff --git a/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp index ec30f4badea3e..1afa1e5806ff6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp @@ -16,14 +16,14 @@ TEST_CASE("memsetAsync", "[cudaMemTools]") { cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); SECTION("Single element") { - auto host_orig = cudautils::make_host_unique(stream); + auto host_orig = cms::cuda::make_host_unique(stream); *host_orig = 42; - auto device = cudautils::make_device_unique(stream); - auto host = cudautils::make_host_unique(stream); - cudautils::copyAsync(device, host_orig, stream); - cudautils::memsetAsync(device, 0, stream); - cudautils::copyAsync(host, device, stream); + auto device = cms::cuda::make_device_unique(stream); + auto host = cms::cuda::make_host_unique(stream); + cms::cuda::copyAsync(device, host_orig, stream); + cms::cuda::memsetAsync(device, 0, stream); + cms::cuda::copyAsync(host, device, stream); cudaCheck(cudaStreamSynchronize(stream)); REQUIRE(*host == 0); @@ -32,16 +32,16 @@ TEST_CASE("memsetAsync", "[cudaMemTools]") { SECTION("Multiple elements") { constexpr int N = 100; - auto host_orig = cudautils::make_host_unique(N, stream); + auto host_orig = cms::cuda::make_host_unique(N, stream); for (int i = 0; i < N; ++i) { host_orig[i] = i; } - auto device = cudautils::make_device_unique(N, stream); - auto host = cudautils::make_host_unique(N, stream); - cudautils::copyAsync(device, host_orig, N, stream); - cudautils::memsetAsync(device, 0, N, stream); - cudautils::copyAsync(host, device, N, stream); + auto device = cms::cuda::make_device_unique(N, stream); + auto host = cms::cuda::make_host_unique(N, stream); + cms::cuda::copyAsync(device, host_orig, N, stream); + cms::cuda::memsetAsync(device, 0, N, stream); + cms::cuda::copyAsync(host, device, N, stream); cudaCheck(cudaStreamSynchronize(stream)); for (int i = 0; i < N; ++i) { From 55bf9949b8c216694fef4fdf2c1e8ddf32a8e05c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 15 Jan 2020 17:41:25 +0100 Subject: [PATCH 29/29] Propagate cudautils->cms::cuda rename --- .../BeamSpot/interface/BeamSpotCUDA.h | 2 +- CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc | 2 +- .../Common/interface/HeterogeneousSoA.h | 40 ++--- .../Common/interface/HostProduct.h | 4 +- .../interface/SiPixelClustersCUDA.h | 10 +- .../SiPixelCluster/src/SiPixelClustersCUDA.cc | 14 +- .../interface/SiPixelDigiErrorsCUDA.h | 8 +- .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 24 +-- .../SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc | 16 +- .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 44 ++--- 
.../interface/TrackingRecHit2DHeterogeneous.h | 10 +- .../src/TrackingRecHit2DCUDA.cc | 10 +- .../test/gpuFrameTransformKernel.cu | 2 +- .../test/gpuFrameTransformTest.cpp | 16 +- DataFormats/Math/test/CholeskyInvert_t.cu | 8 +- DataFormats/Math/test/cudaAtan2Test.cu | 4 +- DataFormats/Math/test/cudaMathTest.cu | 10 +- .../plugins/SiPixelDigiErrorsSoAFromCUDA.cc | 2 +- .../plugins/SiPixelDigisSoAFromCUDA.cc | 8 +- .../CUDAUtilities/interface/HistoContainer.h | 164 +++++++++--------- .../CUDAUtilities/test/HistoContainer_t.cu | 10 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 4 +- .../CUDAUtilities/test/OneToManyAssoc_t.h | 40 ++--- .../CUDAUtilities/test/cudastdAlgorithm_t.cu | 2 +- .../CUDAUtilities/test/radixSort_t.cu | 12 +- .../SiPixelFedCablingMapGPUWrapper.h | 2 +- .../plugins/SiPixelRawToClusterCUDA.cc | 2 +- .../plugins/SiPixelRawToClusterGPUKernel.cu | 10 +- .../plugins/SiPixelRawToClusterGPUKernel.h | 8 +- .../src/SiPixelFedCablingMapGPUWrapper.cc | 6 +- .../SiPixelClusterizer/test/gpuClustering_t.h | 22 +-- .../SiPixelRecHits/plugins/PixelRecHits.cu | 4 +- .../plugins/SiPixelRecHitFromSOA.cc | 6 +- .../plugins/SiPixelRecHitSoAFromLegacy.cc | 2 +- .../plugins/PixelTrackSoAFromCUDA.cc | 2 +- .../plugins/BrokenLineFitOnGPU.cu | 6 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 8 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 12 +- .../CAHitNtupletGeneratorKernelsAlloc.h | 4 +- .../plugins/CAHitNtupletGeneratorOnGPU.cc | 2 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 8 +- .../src/PixelVertexSoAFromCUDA.cc | 2 +- .../src/gpuVertexFinderImpl.h | 4 +- .../PixelVertexFinding/test/VertexFinder_t.h | 18 +- .../plugins/BeamSpotToCUDA.cc | 4 +- 45 files changed, 300 insertions(+), 298 deletions(-) diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h index 36b152b64dfc1..800634d2f5270 100644 --- a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h +++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h @@ -26,7 +26,7 @@ class BeamSpotCUDA { Data const* data() const { return data_d_.get(); } private: - cudautils::device::unique_ptr data_d_; + cms::cuda::device::unique_ptr data_d_; }; #endif diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc index a297ae11dc327..575fcf63b8eaa 100644 --- a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc +++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc @@ -4,6 +4,6 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) { - data_d_ = cudautils::make_device_unique(stream); + data_d_ = cms::cuda::make_device_unique(stream); cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream)); } diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h index 907b7647a3452..6fec0026dfaa1 100644 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -19,8 +19,8 @@ class HeterogeneousSoA { HeterogeneousSoA(HeterogeneousSoA &&) = default; HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default; - explicit HeterogeneousSoA(cudautils::device::unique_ptr &&p) : dm_ptr(std::move(p)) {} - explicit HeterogeneousSoA(cudautils::host::unique_ptr &&p) : hm_ptr(std::move(p)) {} + explicit HeterogeneousSoA(cms::cuda::device::unique_ptr &&p) : dm_ptr(std::move(p)) {} + explicit 
HeterogeneousSoA(cms::cuda::host::unique_ptr &&p) : hm_ptr(std::move(p)) {} explicit HeterogeneousSoA(std::unique_ptr &&p) : std_ptr(std::move(p)) {} auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } @@ -36,17 +36,17 @@ class HeterogeneousSoA { auto *operator-> () { return get(); } // in reality valid only for GPU version... - cudautils::host::unique_ptr toHostAsync(cudaStream_t stream) const { + cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const { assert(dm_ptr); - auto ret = cudautils::make_host_unique(stream); + auto ret = cms::cuda::make_host_unique(stream); cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream)); return ret; } private: // a union wan't do it, a variant will not be more efficienct - cudautils::device::unique_ptr dm_ptr; //! - cudautils::host::unique_ptr hm_ptr; //! + cms::cuda::device::unique_ptr dm_ptr; //! + cms::cuda::host::unique_ptr hm_ptr; //! std::unique_ptr std_ptr; //! }; @@ -54,56 +54,56 @@ namespace cudaCompat { struct GPUTraits { template - using unique_ptr = cudautils::device::unique_ptr; + using unique_ptr = cms::cuda::device::unique_ptr; template static auto make_unique(cudaStream_t stream) { - return cudautils::make_device_unique(stream); + return cms::cuda::make_device_unique(stream); } template static auto make_unique(size_t size, cudaStream_t stream) { - return cudautils::make_device_unique(size, stream); + return cms::cuda::make_device_unique(size, stream); } template static auto make_host_unique(cudaStream_t stream) { - return cudautils::make_host_unique(stream); + return cms::cuda::make_host_unique(stream); } template static auto make_device_unique(cudaStream_t stream) { - return cudautils::make_device_unique(stream); + return cms::cuda::make_device_unique(stream); } template static auto make_device_unique(size_t size, cudaStream_t stream) { - return cudautils::make_device_unique(size, stream); + return cms::cuda::make_device_unique(size, stream); } }; struct HostTraits { template - using unique_ptr = cudautils::host::unique_ptr; + using unique_ptr = cms::cuda::host::unique_ptr; template static auto make_unique(cudaStream_t stream) { - return cudautils::make_host_unique(stream); + return cms::cuda::make_host_unique(stream); } template static auto make_host_unique(cudaStream_t stream) { - return cudautils::make_host_unique(stream); + return cms::cuda::make_host_unique(stream); } template static auto make_device_unique(cudaStream_t stream) { - return cudautils::make_device_unique(stream); + return cms::cuda::make_device_unique(stream); } template static auto make_device_unique(size_t size, cudaStream_t stream) { - return cudautils::make_device_unique(size, stream); + return cms::cuda::make_device_unique(size, stream); } }; @@ -158,7 +158,7 @@ class HeterogeneousSoAImpl { T *get() { return m_ptr.get(); } - cudautils::host::unique_ptr toHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const; private: unique_ptr m_ptr; //! @@ -171,8 +171,8 @@ HeterogeneousSoAImpl::HeterogeneousSoAImpl(cudaStream_t stream) { // in reality valid only for GPU version... 
template -cudautils::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cudaStream_t stream) const { - auto ret = cudautils::make_host_unique(stream); +cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(stream); cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream)); return ret; } diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h index 17ad98ba403a4..aefd7b61f9781 100644 --- a/CUDADataFormats/Common/interface/HostProduct.h +++ b/CUDADataFormats/Common/interface/HostProduct.h @@ -12,7 +12,7 @@ class HostProduct { HostProduct(HostProduct&&) = default; HostProduct& operator=(HostProduct&&) = default; - explicit HostProduct(cudautils::host::unique_ptr&& p) : hm_ptr(std::move(p)) {} + explicit HostProduct(cms::cuda::host::unique_ptr&& p) : hm_ptr(std::move(p)) {} explicit HostProduct(std::unique_ptr&& p) : std_ptr(std::move(p)) {} auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); } @@ -22,7 +22,7 @@ class HostProduct { auto const* operator-> () const { return get(); } private: - cudautils::host::unique_ptr hm_ptr; //! + cms::cuda::host::unique_ptr hm_ptr; //! std::unique_ptr std_ptr; //! }; diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index d3650e164d44e..dbfb5ff5e1761 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -58,14 +58,14 @@ class SiPixelClustersCUDA { DeviceConstView *view() const { return view_d.get(); } private: - cudautils::device::unique_ptr moduleStart_d; // index of the first pixel of each module - cudautils::device::unique_ptr clusInModule_d; // number of clusters found in each module - cudautils::device::unique_ptr moduleId_d; // module id of each module + cms::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module + cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module + cms::cuda::device::unique_ptr moduleId_d; // module id of each module // originally from rechits - cudautils::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module + cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module - cudautils::device::unique_ptr view_d; // "me" pointer + cms::cuda::device::unique_ptr view_d; // "me" pointer uint32_t nClusters_h; }; diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index c814cd4a2e131..7bef9d0d8a52f 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -5,17 +5,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) { - moduleStart_d = cudautils::make_device_unique(maxClusters + 1, stream); - clusInModule_d = cudautils::make_device_unique(maxClusters, stream); - moduleId_d = cudautils::make_device_unique(maxClusters, stream); - clusModuleStart_d = cudautils::make_device_unique(maxClusters + 1, stream); + moduleStart_d = cms::cuda::make_device_unique(maxClusters + 1, stream); + clusInModule_d = cms::cuda::make_device_unique(maxClusters, stream); + moduleId_d = cms::cuda::make_device_unique(maxClusters, 
stream); + clusModuleStart_d = cms::cuda::make_device_unique(maxClusters + 1, stream); - auto view = cudautils::make_host_unique(stream); + auto view = cms::cuda::make_host_unique(stream); view->moduleStart_ = moduleStart_d.get(); view->clusInModule_ = clusInModule_d.get(); view->moduleId_ = moduleId_d.get(); view->clusModuleStart_ = clusModuleStart_d.get(); - view_d = cudautils::make_device_unique(stream); - cudautils::copyAsync(view_d, view, stream); + view_d = cms::cuda::make_device_unique(stream); + cms::cuda::copyAsync(view_d, view, stream); } diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h index 7c18d58a3fc12..1557fd64750e7 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h @@ -26,15 +26,15 @@ class SiPixelDigiErrorsCUDA { GPU::SimpleVector const* c_error() const { return error_d.get(); } using HostDataError = - std::pair, cudautils::host::unique_ptr>; + std::pair, cms::cuda::host::unique_ptr>; HostDataError dataErrorToHostAsync(cudaStream_t stream) const; void copyErrorToHostAsync(cudaStream_t stream); private: - cudautils::device::unique_ptr data_d; - cudautils::device::unique_ptr> error_d; - cudautils::host::unique_ptr> error_h; + cms::cuda::device::unique_ptr data_d; + cms::cuda::device::unique_ptr> error_d; + cms::cuda::host::unique_ptr> error_h; PixelFormatterErrors formatterErrors_h; }; diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index 47efe634ad93d..04207f3e0b385 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -50,10 +50,10 @@ class SiPixelDigisCUDA { uint32_t const *c_pdigi() const { return pdigi_d.get(); } uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } - cudautils::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; - cudautils::host::unique_ptr clusToHostAsync(cudaStream_t stream) const; - cudautils::host::unique_ptr pdigiToHostAsync(cudaStream_t stream) const; - cudautils::host::unique_ptr rawIdArrToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr clusToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr pdigiToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr rawIdArrToHostAsync(cudaStream_t stream) const; class DeviceConstView { public: @@ -79,17 +79,17 @@ class SiPixelDigisCUDA { private: // These are consumed by downstream device code - cudautils::device::unique_ptr xx_d; // local coordinates of each pixel - cudautils::device::unique_ptr yy_d; // - cudautils::device::unique_ptr adc_d; // ADC of each pixel - cudautils::device::unique_ptr moduleInd_d; // module id of each pixel - cudautils::device::unique_ptr clus_d; // cluster id of each pixel - cudautils::device::unique_ptr view_d; // "me" pointer + cms::cuda::device::unique_ptr xx_d; // local coordinates of each pixel + cms::cuda::device::unique_ptr yy_d; // + cms::cuda::device::unique_ptr adc_d; // ADC of each pixel + cms::cuda::device::unique_ptr moduleInd_d; // module id of each pixel + cms::cuda::device::unique_ptr clus_d; // cluster id of each pixel + cms::cuda::device::unique_ptr view_d; // "me" pointer // These are for CPU output; should we (eventually) place them to a // separate product? 
- cudautils::device::unique_ptr pdigi_d; - cudautils::device::unique_ptr rawIdArr_d; + cms::cuda::device::unique_ptr pdigi_d; + cms::cuda::device::unique_ptr rawIdArr_d; uint32_t nModules_h = 0; uint32_t nDigis_h = 0; diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc index 7640348c15f08..ffef71092f6c9 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc @@ -9,32 +9,32 @@ SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream) : formatterErrors_h(std::move(errors)) { - error_d = cudautils::make_device_unique>(stream); - data_d = cudautils::make_device_unique(maxFedWords, stream); + error_d = cms::cuda::make_device_unique>(stream); + data_d = cms::cuda::make_device_unique(maxFedWords, stream); - cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream); + cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream); - error_h = cudautils::make_host_unique>(stream); + error_h = cms::cuda::make_host_unique>(stream); GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); assert(error_h->empty()); assert(error_h->capacity() == static_cast(maxFedWords)); - cudautils::copyAsync(error_d, error_h, stream); + cms::cuda::copyAsync(error_d, error_h, stream); } void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) { - cudautils::copyAsync(error_h, error_d, stream); + cms::cuda::copyAsync(error_h, error_d, stream); } SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const { // On one hand size() could be sufficient. On the other hand, if // someone copies the SimpleVector<>, (s)he might expect the data // buffer to actually have space for capacity() elements. 
- auto data = cudautils::make_host_unique(error_h->capacity(), stream); + auto data = cms::cuda::make_host_unique(error_h->capacity(), stream); // but transfer only the required amount if (not error_h->empty()) { - cudautils::copyAsync(data, data_d, error_h->size(), stream); + cms::cuda::copyAsync(data, data_d, error_h->size(), stream); } auto err = *error_h; err.set_data(data.get()); diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index a8aab7ab5a4b8..664364b6ff25a 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -5,46 +5,46 @@ #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) { - xx_d = cudautils::make_device_unique(maxFedWords, stream); - yy_d = cudautils::make_device_unique(maxFedWords, stream); - adc_d = cudautils::make_device_unique(maxFedWords, stream); - moduleInd_d = cudautils::make_device_unique(maxFedWords, stream); - clus_d = cudautils::make_device_unique(maxFedWords, stream); + xx_d = cms::cuda::make_device_unique(maxFedWords, stream); + yy_d = cms::cuda::make_device_unique(maxFedWords, stream); + adc_d = cms::cuda::make_device_unique(maxFedWords, stream); + moduleInd_d = cms::cuda::make_device_unique(maxFedWords, stream); + clus_d = cms::cuda::make_device_unique(maxFedWords, stream); - pdigi_d = cudautils::make_device_unique(maxFedWords, stream); - rawIdArr_d = cudautils::make_device_unique(maxFedWords, stream); + pdigi_d = cms::cuda::make_device_unique(maxFedWords, stream); + rawIdArr_d = cms::cuda::make_device_unique(maxFedWords, stream); - auto view = cudautils::make_host_unique(stream); + auto view = cms::cuda::make_host_unique(stream); view->xx_ = xx_d.get(); view->yy_ = yy_d.get(); view->adc_ = adc_d.get(); view->moduleInd_ = moduleInd_d.get(); view->clus_ = clus_d.get(); - view_d = cudautils::make_device_unique(stream); - cudautils::copyAsync(view_d, view, stream); + view_d = cms::cuda::make_device_unique(stream); + cms::cuda::copyAsync(view_d, view, stream); } -cudautils::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const { - auto ret = cudautils::make_host_unique(nDigis(), stream); - cudautils::copyAsync(ret, adc_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, adc_d, nDigis(), stream); return ret; } -cudautils::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const { - auto ret = cudautils::make_host_unique(nDigis(), stream); - cudautils::copyAsync(ret, clus_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, clus_d, nDigis(), stream); return ret; } -cudautils::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const { - auto ret = cudautils::make_host_unique(nDigis(), stream); - cudautils::copyAsync(ret, pdigi_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, pdigi_d, nDigis(), stream); return ret; } -cudautils::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const { - auto ret = 
cudautils::make_host_unique(nDigis(), stream); - cudautils::copyAsync(ret, rawIdArr_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(nDigis(), stream); + cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), stream); return ret; } diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index aa551f21b4aad..955f97ca6bd54 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -37,9 +37,9 @@ class TrackingRecHit2DHeterogeneous { auto iphi() { return m_iphi; } // only the local coord and detector index - cudautils::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - cudautils::host::unique_ptr detIndexToHostAsync(cudaStream_t stream) const; - cudautils::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr detIndexToHostAsync(cudaStream_t stream) const; + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; private: static constexpr uint32_t n16 = 4; @@ -89,7 +89,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH constexpr #endif (std::is_same::value) { - cudautils::copyAsync(m_view, view, stream); + cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version } @@ -136,7 +136,7 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH constexpr #endif (std::is_same::value) { - cudautils::copyAsync(m_view, view, stream); + cms::cuda::copyAsync(m_view, view, stream); } else { m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version } diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc index e6f223bfec4e3..7b04ed2d530a0 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -5,15 +5,15 @@ #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" template <> -cudautils::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const { - auto ret = cudautils::make_host_unique(4 * nHits(), stream); - cudautils::copyAsync(ret, m_store32, 4 * nHits(), stream); +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream); return ret; } template <> -cudautils::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { - auto ret = cudautils::make_host_unique(2001, stream); +cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(2001, stream); cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream)); return ret; } diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu index 9af9f5bef600a..c24510146fb59 100644 --- a/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu +++ 
b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu @@ -36,5 +36,5 @@ void toGlobalWrapper(SOAFrame const* frame, std::cout << "CUDA toGlobal kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; - cudautils::launch(toGlobal, {blocksPerGrid, threadsPerBlock}, frame, xl, yl, x, y, z, le, ge, n); + cms::cuda::launch(toGlobal, {blocksPerGrid, threadsPerBlock}, frame, xl, yl, x, y, z, le, ge, n); } diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp index e0d305964cc65..ad62b7a1d131c 100644 --- a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -49,15 +49,15 @@ int main(void) { float le[3 * size]; float ge[6 * size]; - auto d_xl = cudautils::make_device_unique(size, nullptr); - auto d_yl = cudautils::make_device_unique(size, nullptr); + auto d_xl = cms::cuda::make_device_unique(size, nullptr); + auto d_yl = cms::cuda::make_device_unique(size, nullptr); - auto d_x = cudautils::make_device_unique(size, nullptr); - auto d_y = cudautils::make_device_unique(size, nullptr); - auto d_z = cudautils::make_device_unique(size, nullptr); + auto d_x = cms::cuda::make_device_unique(size, nullptr); + auto d_y = cms::cuda::make_device_unique(size, nullptr); + auto d_z = cms::cuda::make_device_unique(size, nullptr); - auto d_le = cudautils::make_device_unique(3 * size, nullptr); - auto d_ge = cudautils::make_device_unique(6 * size, nullptr); + auto d_le = cms::cuda::make_device_unique(3 * size, nullptr); + auto d_ge = cms::cuda::make_device_unique(6 * size, nullptr); double a = 0.01; double ca = std::cos(a); @@ -70,7 +70,7 @@ int main(void) { SFrame sf1(f1.position().x(), f1.position().y(), f1.position().z(), f1.rotation()); - auto d_sf = cudautils::make_device_unique(sizeof(SFrame), nullptr); + auto d_sf = cms::cuda::make_device_unique(sizeof(SFrame), nullptr); cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); for (auto i = 0U; i < size; ++i) { diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index ae7116ddf09ce..55df4ed23f20d 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -123,7 +123,7 @@ void go(bool soa) { std::cout << mm[SIZE / 2](1, 1) << std::endl; - auto m_d = cudautils::make_device_unique(DIM * DIM * stride(), nullptr); + auto m_d = cms::cuda::make_device_unique(DIM * DIM * stride(), nullptr); cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice)); constexpr int NKK = @@ -139,9 +139,9 @@ void go(bool soa) { delta -= (std::chrono::high_resolution_clock::now() - start); if (soa) - cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); + cms::cuda::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); else - cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + cms::cuda::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); @@ -154,7 +154,7 @@ void go(bool soa) { delta1 -= (std::chrono::high_resolution_clock::now() - start); #ifndef DOPROF - cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + cms::cuda::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); 
cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); #endif delta1 += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index 70a818021ed53..731447fe826e4 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -68,7 +68,7 @@ void go() { // atan2 delta -= (std::chrono::high_resolution_clock::now() - start); - auto diff_d = cudautils::make_device_unique(3, nullptr); + auto diff_d = cms::cuda::make_device_unique(3, nullptr); int diffs[3]; cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4)); @@ -80,7 +80,7 @@ void go() { std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y << " threads\n"; - cudautils::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); + cms::cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index f19be00100c7f..dd6576de46c1c 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -101,9 +101,9 @@ void go() { std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); }); delta -= (std::chrono::high_resolution_clock::now() - start); - auto d_A = cudautils::make_device_unique(numElements, nullptr); - auto d_B = cudautils::make_device_unique(numElements, nullptr); - auto d_C = cudautils::make_device_unique(numElements, nullptr); + auto d_A = cms::cuda::make_device_unique(numElements, nullptr); + auto d_B = cms::cuda::make_device_unique(numElements, nullptr); + auto d_C = cms::cuda::make_device_unique(numElements, nullptr); cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); @@ -117,14 +117,14 @@ void go() { std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; delta -= (std::chrono::high_resolution_clock::now() - start); - cudautils::launch( + cms::cuda::launch( vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cudautils::launch( + cms::cuda::launch( vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc index 8dfe536bb1555..be4cc5d9a3336 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -27,7 +27,7 @@ class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer> digiErrorGetToken_; edm::EDPutTokenT digiErrorPutToken_; - cudautils::host::unique_ptr data_; + cms::cuda::host::unique_ptr data_; GPU::SimpleVector error_; const PixelFormatterErrors* formatterErrors_ = nullptr; }; 
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index a41fdf91fe978..dbec74585998f 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -27,10 +27,10 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cudautils::host::unique_ptr pdigi_; - cudautils::host::unique_ptr rawIdArr_; - cudautils::host::unique_ptr adc_; - cudautils::host::unique_ptr clus_; + cms::cuda::host::unique_ptr pdigi_; + cms::cuda::host::unique_ptr rawIdArr_; + cms::cuda::host::unique_ptr adc_; + cms::cuda::host::unique_ptr clus_; int nDigis_; }; diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 6fbaced1858dd..67b2b46f45101 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -19,112 +19,114 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" -namespace cudautils { - - template - __global__ void countFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).count(v[i], ih); +namespace cms { + namespace cuda { + + template + __global__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); + } } - } - template - __global__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).fill(v[i], i, ih); + template + __global__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } } - } - template - inline void launchZero(Histo *__restrict__ h, - cudaStream_t stream + template + inline void launchZero(Histo *__restrict__ h, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif - ) { - uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + ) { + uint32_t *off = 
(uint32_t *)((char *)(h) + offsetof(Histo, off)); #ifdef __CUDACC__ - cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); + cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); #else - ::memset(off, 0, 4 * Histo::totbins()); + ::memset(off, 0, 4 * Histo::totbins()); #endif - } + } - template - inline void launchFinalize(Histo *__restrict__ h, - uint8_t *__restrict__ ws + template + inline void launchFinalize(Histo *__restrict__ h, + uint8_t *__restrict__ ws #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif - , - cudaStream_t stream + , + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif - ) { + ) { #ifdef __CUDACC__ - assert(ws); - uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); - size_t wss = Histo::wsSize(); - assert(wss > 0); - CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + assert(ws); + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + size_t wss = Histo::wsSize(); + assert(wss > 0); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); #else - h->finalize(); + h->finalize(); #endif - } + } - template - inline void fillManyFromVector(Histo *__restrict__ h, - uint8_t *__restrict__ ws, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - uint32_t totSize, - int nthreads, - cudaStream_t stream + template + inline void fillManyFromVector(Histo *__restrict__ h, + uint8_t *__restrict__ ws, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif - ) { - launchZero(h, stream); + ) { + launchZero(h, stream); #ifdef __CUDACC__ - auto nblocks = (totSize + nthreads - 1) / nthreads; - countFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); - launchFinalize(h, ws, stream); - fillFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); + auto nblocks = (totSize + nthreads - 1) / nthreads; + countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(h, ws, stream); + fillFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); #else - countFromVector(h, nh, v, offsets); - h->finalize(); - fillFromVector(h, nh, v, offsets); + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); #endif - } + } - template - __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { - assoc->bulkFinalizeFill(*apc); - } + template + __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { + assoc->bulkFinalizeFill(*apc); + } -} // namespace cudautils + } // namespace cuda +} // namespace cms // iteratate over N bins left and right of the one containing "v" template diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 772c0b64bd892..aa7eb245c350d 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -16,7 +16,7 @@ void go() { constexpr int N = 12000; T v[N]; - auto v_d = cudautils::make_device_unique(N, nullptr); + auto v_d = cms::cuda::make_device_unique(N, nullptr); cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); @@ -30,10 +30,10 @@ void go() { << (std::numeric_limits::max() - 
std::numeric_limits::min()) / Hist::nbins() << std::endl; Hist h; - auto h_d = cudautils::make_device_unique(1, nullptr); - auto ws_d = cudautils::make_device_unique(Hist::wsSize(), nullptr); + auto h_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(Hist::wsSize(), nullptr); - auto off_d = cudautils::make_device_unique(nParts + 1, nullptr); + auto off_d = cms::cuda::make_device_unique(nParts + 1, nullptr); for (int it = 0; it < 5; ++it) { offsets[0] = 0; @@ -68,7 +68,7 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); - cudautils::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); + cms::cuda::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 940de878709d1..020d69268a420 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -106,7 +106,7 @@ void go() { constexpr int N = 12000; T v[N]; - auto v_d = cudautils::make_device_unique(N, nullptr); + auto v_d = cms::cuda::make_device_unique(N, nullptr); assert(v_d.get()); using Hist = HistoContainer; @@ -125,7 +125,7 @@ void go() { assert(v); cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); assert(v_d.get()); - cudautils::launch(mykernel, {1, 256}, v_d.get(), N); + cms::cuda::launch(mykernel, {1, 256}, v_d.get(), N); } } diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 2cdafd0a876cb..92bb359115a02 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -100,7 +100,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co int main() { #ifdef __CUDACC__ cms::cudatest::requireDevices(); - auto current_device = cudautils::currentDevice(); + auto current_device = cms::cuda::currentDevice(); #else // make sure cuda emulation is working std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl; @@ -167,11 +167,11 @@ int main() { std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; #ifdef __CUDACC__ - auto v_d = cudautils::make_device_unique[]>(N, nullptr); + auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); assert(v_d.get()); - auto a_d = cudautils::make_device_unique(1, nullptr); - auto sa_d = cudautils::make_device_unique(1, nullptr); - auto ws_d = cudautils::make_device_unique(Assoc::wsSize(), nullptr); + auto a_d = cms::cuda::make_device_unique(1, nullptr); + auto sa_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr); cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); #else @@ -180,7 +180,7 @@ int main() { auto v_d = tr.data(); #endif - cudautils::launchZero(a_d.get(), 0); + cms::cuda::launchZero(a_d.get(), 0); #ifdef __CUDACC__ auto nThreads = 256; @@ -188,12 +188,12 @@ int main() { count<<>>(v_d.get(), a_d.get(), N); - cudautils::launchFinalize(a_d.get(), ws_d.get(), 0); + 
cms::cuda::launchFinalize(a_d.get(), ws_d.get(), 0); verify<<<1, 1>>>(a_d.get()); fill<<>>(v_d.get(), a_d.get(), N); #else count(v_d, a_d.get(), N); - cudautils::launchFinalize(a_d.get()); + cms::cuda::launchFinalize(a_d.get()); verify(a_d.get()); fill(v_d, a_d.get(), N); #endif @@ -231,7 +231,7 @@ int main() { cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); nBlocks = (N + nThreads - 1) / nThreads; fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); - cudautils::finalizeBulk<<>>(dc_d, a_d.get()); + cms::cuda::finalizeBulk<<>>(dc_d, a_d.get()); verifyBulk<<<1, 1>>>(a_d.get(), dc_d); cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); @@ -239,19 +239,19 @@ int main() { cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); - cudautils::finalizeBulk<<>>(dc_d, sa_d.get()); + cms::cuda::finalizeBulk<<>>(dc_d, sa_d.get()); verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); #else dc_d = &dc; fillBulk(dc_d, v_d, a_d.get(), N); - cudautils::finalizeBulk(dc_d, a_d.get()); + cms::cuda::finalizeBulk(dc_d, a_d.get()); verifyBulk(a_d.get(), dc_d); memcpy(&la, a_d.get(), sizeof(Assoc)); AtomicPairCounter sdc(0); fillBulk(&sdc, v_d, sa_d.get(), N); - cudautils::finalizeBulk(&sdc, sa_d.get()); + cms::cuda::finalizeBulk(&sdc, sa_d.get()); verifyBulk(sa_d.get(), &sdc); #endif @@ -274,14 +274,14 @@ int main() { // here verify use of block local counters #ifdef __CUDACC__ - auto m1_d = cudautils::make_device_unique(1, nullptr); - auto m2_d = cudautils::make_device_unique(1, nullptr); + auto m1_d = cms::cuda::make_device_unique(1, nullptr); + auto m2_d = cms::cuda::make_device_unique(1, nullptr); #else auto m1_d = std::make_unique(); auto m2_d = std::make_unique(); #endif - cudautils::launchZero(m1_d.get(), 0); - cudautils::launchZero(m2_d.get(), 0); + cms::cuda::launchZero(m1_d.get(), 0); + cms::cuda::launchZero(m2_d.get(), 0); #ifdef __CUDACC__ nBlocks = (4 * N + nThreads - 1) / nThreads; @@ -289,8 +289,8 @@ int main() { countMultiLocal<<>>(v_d.get(), m2_d.get(), N); verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); - cudautils::launchFinalize(m1_d.get(), ws_d.get(), 0); - cudautils::launchFinalize(m2_d.get(), ws_d.get(), 0); + cms::cuda::launchFinalize(m1_d.get(), ws_d.get(), 0); + cms::cuda::launchFinalize(m2_d.get(), ws_d.get(), 0); verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); cudaCheck(cudaGetLastError()); @@ -300,8 +300,8 @@ int main() { countMultiLocal(v_d, m2_d.get(), N); verifyMulti(m1_d.get(), m2_d.get()); - cudautils::launchFinalize(m1_d.get()); - cudautils::launchFinalize(m2_d.get()); + cms::cuda::launchFinalize(m1_d.get()); + cms::cuda::launchFinalize(m2_d.get()); verifyMulti(m1_d.get(), m2_d.get()); #endif return 0; diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu index 834ac9446a4c5..7b8ed6219392e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu @@ -21,7 +21,7 @@ __global__ void testBinaryFind() { assert(data2 + 6 == cuda_std::binary_find(data2, data2 + 6, 5)); } -void wrapper() { cudautils::launch(testBinaryFind, {32, 64}); } +void wrapper() { cms::cuda::launch(testBinaryFind, {32, 64}); } int main() { cms::cudatest::requireDevices(); diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index 2b5b439c85598..febdb9c92b0a7 100644 --- 
a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -89,10 +89,10 @@ void go(bool useShared) { std::random_shuffle(v, v + N); - auto v_d = cudautils::make_device_unique(N, nullptr); - auto ind_d = cudautils::make_device_unique(N, nullptr); - auto ws_d = cudautils::make_device_unique(N, nullptr); - auto off_d = cudautils::make_device_unique(blocks + 1, nullptr); + auto v_d = cms::cuda::make_device_unique(N, nullptr); + auto ind_d = cms::cuda::make_device_unique(N, nullptr); + auto ws_d = cms::cuda::make_device_unique(N, nullptr); + auto off_d = cms::cuda::make_device_unique(blocks + 1, nullptr); cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); @@ -105,10 +105,10 @@ void go(bool useShared) { delta -= (std::chrono::high_resolution_clock::now() - start); constexpr int MaxSize = 256 * 32; if (useShared) - cudautils::launch( + cms::cuda::launch( radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); else - cudautils::launch( + cms::cuda::launch( radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); if (i == 0) diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index 681354767a7a3..2f9eb092bc648 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -29,7 +29,7 @@ class SiPixelFedCablingMapGPUWrapper { // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; - cudautils::device::unique_ptr getModToUnpRegionalAsync(std::set const &modules, + cms::cuda::device::unique_ptr getModToUnpRegionalAsync(std::set const &modules, cudaStream_t cudaStream) const; private: diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index feb07fe0e686e..95aac36dbd197 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -143,7 +143,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, // get the GPU product already here so that the async transfer can begin const auto* gpuGains = hgains->getGPUProductAsync(ctx.stream()); - cudautils::device::unique_ptr modulesToUnpackRegional; + cms::cuda::device::unique_ptr modulesToUnpackRegional; const unsigned char* gpuModulesToUnpack; if (regions_) { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 8e0d5123e6ecc..53af26ac7527d 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -47,8 +47,8 @@ namespace pixelgpudetails { constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() { - word_ = cudautils::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); - fedId_ = cudautils::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + word_ = 
cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); + fedId_ = cms::cuda::make_host_noncached_unique(MAX_FED_WORDS, cudaHostAllocWriteCombined); } void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, @@ -549,7 +549,7 @@ namespace pixelgpudetails { } clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); - nModules_Clusters_h = cudautils::make_host_unique(2, stream); + nModules_Clusters_h = cms::cuda::make_host_unique(2, stream); if (wordCounter) // protect in case of empty event.... { @@ -558,8 +558,8 @@ namespace pixelgpudetails { assert(0 == wordCounter % 2); // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = cudautils::make_device_unique(wordCounter, stream); - auto fedId_d = cudautils::make_device_unique(wordCounter, stream); + auto word_d = cms::cuda::make_device_unique(wordCounter, stream); + auto fedId_d = cms::cuda::make_device_unique(wordCounter, stream); cudaCheck( cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream)); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 8a4e0b6f78696..767c5a1e92ad0 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -156,8 +156,8 @@ namespace pixelgpudetails { const unsigned char* fedId() const { return fedId_.get(); } private: - cudautils::host::noncached::unique_ptr word_; - cudautils::host::noncached::unique_ptr fedId_; + cms::cuda::host::noncached::unique_ptr word_; + cms::cuda::host::noncached::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; @@ -187,7 +187,7 @@ namespace pixelgpudetails { // stream is still alive // // technically the statement above is not true anymore now that - // the CUDA streams are cached within the cudautils::StreamCache, but it is + // the CUDA streams are cached within the cms::cuda::StreamCache, but it is // still better to release as early as possible nModules_Clusters_h.reset(); return std::make_pair(std::move(digis_d), std::move(clusters_d)); @@ -199,7 +199,7 @@ namespace pixelgpudetails { uint32_t nDigis = 0; // Data to be put in the event - cudautils::host::unique_ptr nModules_Clusters_h; + cms::cuda::host::unique_ptr nModules_Clusters_h; SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; SiPixelDigiErrorsCUDA digiErrors_d; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index d4b8e40dea76b..7d3a9aa8d9b07 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -127,10 +127,10 @@ const unsigned char* SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cudaStr return data.modToUnpDefault; } -cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync( +cms::cuda::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync( std::set const& modules, cudaStream_t cudaStream) const { - auto modToUnpDevice = cudautils::make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); - auto modToUnpHost = cudautils::make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream); + auto modToUnpDevice = 
cms::cuda::make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); + auto modToUnpHost = cms::cuda::make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream); std::vector const& fedIds = cablingMap_->fedIds(); std::unique_ptr const& cabling = cablingMap_->cablingTree(); diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h index 4db03da324ada..8ec665f8960b6 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h @@ -37,14 +37,14 @@ int main(void) { auto h_clus = std::make_unique(numElements); #ifdef __CUDACC__ - auto d_id = cudautils::make_device_unique(numElements, nullptr); - auto d_x = cudautils::make_device_unique(numElements, nullptr); - auto d_y = cudautils::make_device_unique(numElements, nullptr); - auto d_adc = cudautils::make_device_unique(numElements, nullptr); - auto d_clus = cudautils::make_device_unique(numElements, nullptr); - auto d_moduleStart = cudautils::make_device_unique(MaxNumModules + 1, nullptr); - auto d_clusInModule = cudautils::make_device_unique(MaxNumModules, nullptr); - auto d_moduleId = cudautils::make_device_unique(MaxNumModules, nullptr); + auto d_id = cms::cuda::make_device_unique(numElements, nullptr); + auto d_x = cms::cuda::make_device_unique(numElements, nullptr); + auto d_y = cms::cuda::make_device_unique(numElements, nullptr); + auto d_adc = cms::cuda::make_device_unique(numElements, nullptr); + auto d_clus = cms::cuda::make_device_unique(numElements, nullptr); + auto d_moduleStart = cms::cuda::make_device_unique(MaxNumModules + 1, nullptr); + auto d_clusInModule = cms::cuda::make_device_unique(MaxNumModules, nullptr); + auto d_moduleId = cms::cuda::make_device_unique(MaxNumModules, nullptr); #else auto h_moduleStart = std::make_unique(MaxNumModules + 1); @@ -255,7 +255,7 @@ int main(void) { std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; - cudautils::launch(countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n); + cms::cuda::launch(countModules, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n); blocksPerGrid = MaxNumModules; //nModules; @@ -263,7 +263,7 @@ int main(void) { << " threads\n"; cudaCheck(cudaMemset(d_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t))); - cudautils::launch(findClus, + cms::cuda::launch(findClus, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_x.get(), @@ -289,7 +289,7 @@ int main(void) { if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0)) std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; - cudautils::launch(clusterChargeCut, + cms::cuda::launch(clusterChargeCut, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_adc.get(), diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 1342ab916e472..4e4f38f329d01 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -63,8 +63,8 @@ namespace pixelgpudetails { } if (nHits) { - auto hws = cudautils::make_device_unique(TrackingRecHit2DSOAView::Hist::wsSize(), stream); - cudautils::fillManyFromVector( + auto hws = cms::cuda::make_device_unique(TrackingRecHit2DSOAView::Hist::wsSize(), stream); + cms::cuda::fillManyFromVector( hits_d.phiBinner(), hws.get(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, stream); cudaCheck(cudaGetLastError()); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc index 5dbf0da75dc42..7b072abc1dd47 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSOA.cc @@ -41,9 +41,9 @@ class SiPixelRecHitFromSOA : public edm::stream::EDProducer { edm::EDGetTokenT clusterToken_; // Legacy Clusters uint32_t m_nHits; - cudautils::host::unique_ptr m_store16; - cudautils::host::unique_ptr m_store32; - cudautils::host::unique_ptr m_hitsModuleStart; + cms::cuda::host::unique_ptr m_store16; + cms::cuda::host::unique_ptr m_store32; + cms::cuda::host::unique_ptr m_hitsModuleStart; }; SiPixelRecHitFromSOA::SiPixelRecHitFromSOA(const edm::ParameterSet& iConfig) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 1b4b483ad8ffc..fbe0fd13b84a4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -250,7 +250,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv for (auto i = 0; i < 11; ++i) { output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; } - cudautils::fillManyFromVector( + cms::cuda::fillManyFromVector( output->phiBinner(), nullptr, 10, output->iphi(), output->hitsLayerStart(), numberOfHits, 256, nullptr); // std::cout << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets" << std::endl; diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 4ed23b7dc5394..c8310bc645db3 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -33,7 +33,7 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cudautils::host::unique_ptr m_soa; + cms::cuda::host::unique_ptr m_soa; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index 660cf75e1f460..6fc537237286f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -11,11 +11,11 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( + auto hitsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( + auto hits_geGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( + auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75066458dc170..05106a1bfed41 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -67,7 +67,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * assert(tuples_d && quality_d); // zero tuples - cudautils::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); assert(nhits <= pixelGPUConstants::maxNumberOfHits); @@ -108,13 +108,13 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * if (m_params.doStats_) kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); - cudautils::finalizeBulk(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && m_params.lateFishbone_) { @@ -154,7 +154,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); // remove duplicates (tracks that share a hit) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index aaf882633f17d..7bfee1c8d557f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -21,7 +21,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * auto *quality_d = (Quality *)(&tracks_d->m_quality); // zero tuples - cudautils::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(tuples_d, cudaStream); auto nhits = hh.nHits(); assert(nhits 
<= pixelGPUConstants::maxNumberOfHits); @@ -96,7 +96,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; - cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; @@ -108,7 +108,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); - cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); kernel_fillMultiplicity<<>>( tuples_d, quality_d, device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); @@ -160,7 +160,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_ = cudautils::make_device_unique(std::max(1U, nhits), stream); + device_isOuterHitOfCell_ = cms::cuda::make_device_unique(std::max(1U, nhits), stream); assert(device_isOuterHitOfCell_.get()); { int threadsPerBlock = 128; @@ -175,7 +175,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr cudaCheck(cudaGetLastError()); } - device_theCells_ = cudautils::make_device_unique(m_params.maxNumberOfDoublets_, stream); + device_theCells_ = cms::cuda::make_device_unique(m_params.maxNumberOfDoublets_, stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -252,7 +252,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA kernel_countHitInTracks<<>>( tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); - cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cms::cuda::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h index b91911c66924e..592aee9770ae4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.h @@ -46,6 +46,6 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(cudaStream_t stream) { } else { *device_nCells_ = 0; } - cudautils::launchZero(device_tupleMultiplicity_.get(), stream); - cudautils::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(device_hitToTuple_.get(), stream); // we may wish to keep it in the edm... 
} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 2e875caba7130..4a8240706efc2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -162,7 +162,7 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, float bfield, cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cudautils::make_device_unique(stream)); + PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); auto* soa = tracks.get(); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index cb5d32b47aea3..1077bb7736667 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -11,14 +11,14 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cudautils::make_device_unique( + auto hitsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cudautils::make_device_unique( + auto hits_geGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cudautils::make_device_unique( + auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream); auto circle_fit_resultsGPU_holder = - cudautils::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); + cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc index aee9be2326572..0cadf24580cf7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexSoAFromCUDA.cc @@ -33,7 +33,7 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cudautils::host::unique_ptr m_soa; + cms::cuda::host::unique_ptr m_soa; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h index d6e63227ccf85..76bf8a5d4e978 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinderImpl.h @@ -87,7 +87,7 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { // std::cout << "producing Vertices on GPU" << std::endl; - ZVertexHeterogeneous vertices(cudautils::make_device_unique(stream)); + ZVertexHeterogeneous 
vertices(cms::cuda::make_device_unique(stream)); #else ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { // std::cout << "producing Vertices on CPU" << std::endl; @@ -98,7 +98,7 @@ namespace gpuVertexFinder { assert(soa); #ifdef __CUDACC__ - auto ws_d = cudautils::make_device_unique(stream); + auto ws_d = cms::cuda::make_device_unique(stream); #else auto ws_d = std::make_unique(); #endif diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index de3a9a2316238..5b7a1b6eadd0c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -116,8 +116,8 @@ int main() { #ifdef __CUDACC__ cms::cudatest::requireDevices(); - auto onGPU_d = cudautils::make_device_unique(1, nullptr); - auto ws_d = cudautils::make_device_unique(1, nullptr); + auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(1, nullptr); #else auto onGPU_d = std::make_unique(); auto ws_d = std::make_unique(); @@ -174,16 +174,16 @@ int main() { cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cudautils::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); #else - cudautils::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); #endif print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); - cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); @@ -245,7 +245,7 @@ int main() { } #ifdef __CUDACC__ - cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); @@ -265,7 +265,7 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! 
- cudautils::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); + cms::cuda::launch(splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else gridDim.x = 1; @@ -277,10 +277,10 @@ int main() { std::cout << "after split " << nv << std::endl; #ifdef __CUDACC__ - cudautils::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cms::cuda::launch(fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); cudaCheck(cudaGetLastError()); - cudautils::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cms::cuda::launch(sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else diff --git a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc index 2eefc648a4c6e..72edd10cb031d 100644 --- a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc +++ b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc @@ -17,11 +17,11 @@ namespace { class BSHost { public: - BSHost() : bs{cudautils::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} + BSHost() : bs{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} BeamSpotCUDA::Data* get() { return bs.get(); } private: - cudautils::host::noncached::unique_ptr bs; + cms::cuda::host::noncached::unique_ptr bs; }; } // namespace
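
The series moves the host-side memory helpers under cms::cuda as well. A minimal sketch of the two flavours seen above, assuming make_host_unique(size, stream) and make_host_noncached_unique(size, flags) keep the signatures shown in the hunks; the header paths, stageAndReadBack, the buffer names, and the element count are illustrative only:

    #include <algorithm>
    #include <cuda_runtime.h>
    #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"            // assumed path
    #include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"  // assumed path
    #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"                  // assumed path

    // Read n floats back from the device and stage n floats for upload.
    void stageAndReadBack(const float *d_in, float *d_out, int n, cudaStream_t stream) {
      // cached pinned host memory, used for device-to-host readback
      // (the pattern of the *SoAFromCUDA producers above)
      auto h_result = cms::cuda::make_host_unique<float[]>(n, stream);
      cudaCheck(cudaMemcpyAsync(h_result.get(), d_in, n * sizeof(float), cudaMemcpyDeviceToHost, stream));

      // write-combined, non-cached pinned host memory, used as a host-to-device
      // staging area (the pattern of WordFedAppender and BSHost above)
      auto h_staging = cms::cuda::make_host_noncached_unique<float[]>(n, cudaHostAllocWriteCombined);
      std::fill(h_staging.get(), h_staging.get() + n, 0.f);
      cudaCheck(cudaMemcpyAsync(d_out, h_staging.get(), n * sizeof(float), cudaMemcpyHostToDevice, stream));

      cudaCheck(cudaStreamSynchronize(stream));  // ensure the host buffers outlive the async copies
    }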