diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
new file mode 100644
index 0000000000000..1046b76eef0f7
--- /dev/null
+++ b/CUDADataFormats/Common/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h
new file mode 100644
index 0000000000000..ca07a344ba2d5
--- /dev/null
+++ b/CUDADataFormats/Common/interface/CUDAProduct.h
@@ -0,0 +1,51 @@
+#ifndef CUDADataFormats_Common_CUDAProduct_h
+#define CUDADataFormats_Common_CUDAProduct_h
+
+#include <memory>
+
+#include <cuda/api_wrappers.h>
+
+#include "CUDADataFormats/Common/interface/CUDAProductBase.h"
+
+namespace edm {
+  template <typename T> class Wrapper;
+}
+
+/**
+ * The purpose of this class is to wrap CUDA data to edm::Event in a
+ * way which forces correct use of various utilities.
+ *
+ * The non-default construction has to be done with CUDAScopedContext
+ * (in order to properly register the CUDA event).
+ *
+ * The default constructor is needed only for the ROOT dictionary generation.
+ *
+ * The CUDA event is in practice needed only for stream-stream
+ * synchronization, but someone with a long-enough lifetime has to own
+ * it. Here is a somewhat natural place. If the overhead is too much, we
+ * can e.g. make CUDAService own them (creating them on demand) and
+ * use them only where synchronization between streams is needed.
+ */
+template <typename T>
+class CUDAProduct: public CUDAProductBase {
+public:
+  CUDAProduct() = default; // Needed only for ROOT dictionary generation
+
+  CUDAProduct(const CUDAProduct&) = delete;
+  CUDAProduct& operator=(const CUDAProduct&) = delete;
+  CUDAProduct(CUDAProduct&&) = default;
+  CUDAProduct& operator=(CUDAProduct&&) = default;
+
+private:
+  friend class CUDAScopedContext;
+  friend class edm::Wrapper<CUDAProduct<T>>;
+
+  explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, T data):
+    CUDAProductBase(device, std::move(stream)),
+    data_(std::move(data))
+  {}
+
+  T data_; //!
+};
+
+#endif
diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h
new file mode 100644
index 0000000000000..eb6fdae0e5abf
--- /dev/null
+++ b/CUDADataFormats/Common/interface/CUDAProductBase.h
@@ -0,0 +1,40 @@
+#ifndef CUDADataFormats_Common_CUDAProductBase_h
+#define CUDADataFormats_Common_CUDAProductBase_h
+
+#include <memory>
+
+#include <cuda/api_wrappers.h>
+
+/**
+ * Base class for all instantiations of CUDAProduct<T> to hold the
+ * non-T-dependent members.
+ */
+class CUDAProductBase {
+public:
+  CUDAProductBase() = default; // Needed only for ROOT dictionary generation
+
+  bool isValid() const { return stream_.get() != nullptr; }
+
+  int device() const { return device_; }
+
+  const cuda::stream_t<>& stream() const { return *stream_; }
+  cuda::stream_t<>& stream() { return *stream_; }
+  const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }
+
+  const cuda::event_t& event() const { return *event_; }
+  cuda::event_t& event() { return *event_; }
+
+protected:
+  explicit CUDAProductBase(int device, std::shared_ptr<cuda::stream_t<>> stream);
+
+private:
+  // The cuda::stream_t<> is really shared among edm::Event products, so
+  // using shared_ptr also here
+  std::shared_ptr<cuda::stream_t<>> stream_; //!
+  // shared_ptr because of caching in CUDAService
+  std::shared_ptr<cuda::event_t> event_; //!
+
+  int device_ = -1; //!
+};
+
+#endif
diff --git a/CUDADataFormats/Common/interface/device_unique_ptr.h b/CUDADataFormats/Common/interface/device_unique_ptr.h
deleted file mode 100644
index 1282c52125fa6..0000000000000
--- a/CUDADataFormats/Common/interface/device_unique_ptr.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h
-#define CUDADataFormats_Common_interface_device_unique_ptr_h
-
-#include <memory>
-#include <functional>
-
-namespace edm {
-  namespace cuda {
-    namespace device {
-      template <typename T>
-      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
-    }
-  }
-}
-
-#endif
diff --git a/CUDADataFormats/Common/interface/host_unique_ptr.h b/CUDADataFormats/Common/interface/host_unique_ptr.h
deleted file mode 100644
index c945d9b0aa027..0000000000000
--- a/CUDADataFormats/Common/interface/host_unique_ptr.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h
-#define CUDADataFormats_Common_interface_host_unique_ptr_h
-
-#include <memory>
-#include <functional>
-
-namespace edm {
-  namespace cuda {
-    namespace host {
-      template <typename T>
-      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
-    }
-  }
-}
-
-#endif
diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc
new file mode 100644
index 0000000000000..c034b4f7295f8
--- /dev/null
+++ b/CUDADataFormats/Common/src/CUDAProductBase.cc
@@ -0,0 +1,19 @@
+#include "CUDADataFormats/Common/interface/CUDAProductBase.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+CUDAProductBase::CUDAProductBase(int device, std::shared_ptr<cuda::stream_t<>> stream):
+  stream_(std::move(stream)),
+  device_(device)
+{
+  edm::Service<CUDAService> cs;
+  event_ = cs->getCUDAEvent();
+
+  // Record CUDA event to the CUDA stream. The event will become
+  // "occurred" after all work queued to the stream before this
+  // point has been finished.
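+  // (The event is what lets a consumer running on another CUDA stream
+  // synchronize with this product via cudaStreamWaitEvent() instead of
+  // blocking the CPU; see HeterogeneousCore/CUDACore/README.md.)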
+  event_->record(stream_->id());
+}
+
diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml
new file mode 100644
index 0000000000000..5e804fe80a736
--- /dev/null
+++ b/CUDADataFormats/Common/test/BuildFile.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc
new file mode 100644
index 0000000000000..bd5ddf7f512fe
--- /dev/null
+++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc
@@ -0,0 +1,63 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+#include <cuda_runtime_api.h>
+
+namespace cudatest {
+  class TestCUDAScopedContext {
+  public:
+    static
+    CUDAScopedContext make(int dev) {
+      auto device = cuda::device::get(dev);
+      return CUDAScopedContext(dev, std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)));
+    }
+  };
+}
+
+TEST_CASE("Use of CUDAProduct template", "[CUDACore]") {
+  SECTION("Default constructed") {
+    auto foo = CUDAProduct<int>();
+    REQUIRE(!foo.isValid());
+
+    auto bar = std::move(foo);
+  }
+
+  exitSansCUDADevices();
+
+  constexpr int defaultDevice = 0;
+  {
+    auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice);
+    std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+    auto& data = *dataPtr;
+
+    SECTION("Construct from CUDAScopedContext") {
+      REQUIRE(data.isValid());
+      REQUIRE(data.device() == defaultDevice);
+      REQUIRE(data.stream().id() == ctx.stream().id());
+      REQUIRE(&data.event() != nullptr);
+    }
+
+    SECTION("Move constructor") {
+      auto data2 = CUDAProduct<int>(std::move(data));
+      REQUIRE(data2.isValid());
+      REQUIRE(!data.isValid());
+    }
+
+    SECTION("Move assignment") {
+      CUDAProduct<int> data2;
+      data2 = std::move(data);
+      REQUIRE(data2.isValid());
+      REQUIRE(!data.isValid());
+    }
+  }
+
+  // Destroy and clean up all resources so that the next test can
+  // assume to start from a clean state.
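+  // (cudaDeviceReset() destroys the primary context of the device,
+  // releasing all allocations made during the test.)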
+ cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + cudaDeviceReset(); +} diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/CUDADataFormats/Common/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml index 21c527e7b2f0d..d34658faa2573 100644 --- a/CUDADataFormats/SiPixelCluster/BuildFile.xml +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -1,6 +1,7 @@ + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index ca8a75d178b6c..f25a8a25f0808 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -1,14 +1,15 @@ #ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include class SiPixelClustersCUDA { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream); + explicit SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream); ~SiPixelClustersCUDA() = default; SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete; @@ -16,20 +17,23 @@ class SiPixelClustersCUDA { SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default; SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default; + void setNClusters(uint32_t nClusters) { + nClusters_h = nClusters; + } + + uint32_t nClusters() const { return nClusters_h; } + uint32_t *moduleStart() { return moduleStart_d.get(); } - int32_t *clus() { return clus_d.get(); } uint32_t *clusInModule() { return clusInModule_d.get(); } uint32_t *moduleId() { return moduleId_d.get(); } uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } uint32_t const *moduleStart() const { return moduleStart_d.get(); } - int32_t const *clus() const { return clus_d.get(); } uint32_t const *clusInModule() const { return clusInModule_d.get(); } uint32_t const *moduleId() const { return moduleId_d.get(); } uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } - int32_t const *c_clus() const { return clus_d.get(); } uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } uint32_t const *c_moduleId() const { return moduleId_d.get(); } uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } @@ -40,7 +44,6 @@ class SiPixelClustersCUDA { #ifdef __CUDACC__ __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); } - __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); } __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); } __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); } @@ -50,7 +53,6 @@ class SiPixelClustersCUDA { private: uint32_t const *moduleStart_; - int32_t const 
*clus_; uint32_t const *clusInModule_; uint32_t const *moduleId_; uint32_t const *clusModuleStart_; @@ -59,15 +61,16 @@ class SiPixelClustersCUDA { DeviceConstView *view() const { return view_d.get(); } private: - edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel - edm::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - edm::cuda::device::unique_ptr moduleId_d; // module id of each module + cudautils::device::unique_ptr moduleStart_d; // index of the first pixel of each module + cudautils::device::unique_ptr clusInModule_d; // number of clusters found in each module + cudautils::device::unique_ptr moduleId_d; // module id of each module // originally from rechits - edm::cuda::device::unique_ptr clusModuleStart_d; + cudautils::device::unique_ptr clusModuleStart_d; + + cudautils::device::unique_ptr view_d; // "me" pointer - edm::cuda::device::unique_ptr view_d; // "me" pointer + uint32_t nClusters_h; }; #endif diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index 7363c2fd364af..d88a1b0a6370b 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -2,23 +2,22 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) { +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream) { edm::Service cs; - moduleStart_d = cs->make_device_unique(nelements+1, stream); - clus_d = cs->make_device_unique< int32_t[]>(feds, stream); - clusInModule_d = cs->make_device_unique(nelements, stream); - moduleId_d = cs->make_device_unique(nelements, stream); - clusModuleStart_d = cs->make_device_unique(nelements+1, stream); + moduleStart_d = cs->make_device_unique(maxClusters+1, stream); + clusInModule_d = cs->make_device_unique(maxClusters, stream); + moduleId_d = cs->make_device_unique(maxClusters, stream); + clusModuleStart_d = cs->make_device_unique(maxClusters+1, stream); auto view = cs->make_host_unique(stream); view->moduleStart_ = moduleStart_d.get(); - view->clus_ = clus_d.get(); view->clusInModule_ = clusInModule_d.get(); view->moduleId_ = moduleId_d.get(); view->clusModuleStart_ = clusModuleStart_d.get(); view_d = cs->make_device_unique(stream); - cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); + cudautils::copyAsync(view_d, view, stream); } diff --git a/CUDADataFormats/SiPixelCluster/src/classes.h b/CUDADataFormats/SiPixelCluster/src/classes.h new file mode 100644 index 0000000000000..08d46244adc7d --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_SiPixelCluster_classes_h +#define CUDADataFormats_SiPixelCluster_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelCluster/src/classes_def.xml b/CUDADataFormats/SiPixelCluster/src/classes_def.xml new file mode 100644 index 0000000000000..ba0706ac4b8aa --- /dev/null +++ 
b/CUDADataFormats/SiPixelCluster/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml index 259aa9f08d054..29ec13098819c 100644 --- a/CUDADataFormats/SiPixelDigi/BuildFile.xml +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -1,6 +1,8 @@ + + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h new file mode 100644 index 0000000000000..e9c8c0f644722 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h @@ -0,0 +1,40 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h + +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" + +#include + +class SiPixelDigiErrorsCUDA { +public: + SiPixelDigiErrorsCUDA() = default; + explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream); + ~SiPixelDigiErrorsCUDA() = default; + + SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA& operator=(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default; + SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default; + + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + + GPU::SimpleVector *error() { return error_d.get(); } + GPU::SimpleVector const *error() const { return error_d.get(); } + GPU::SimpleVector const *c_error() const { return error_d.get(); } + + using HostDataError = std::pair, cudautils::host::unique_ptr>; + HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const; + + void copyErrorToHostAsync(cuda::stream_t<>& stream); + +private: + cudautils::device::unique_ptr data_d; + cudautils::device::unique_ptr> error_d; + cudautils::host::unique_ptr> error_h; + PixelFormatterErrors formatterErrors_h; +}; + +#endif diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index 66ca680effd19..6a52545483eb8 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -1,15 +1,15 @@ #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "FWCore/Utilities/interface/propagate_const.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include class SiPixelDigisCUDA { public: SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream); ~SiPixelDigisCUDA() = default; SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete; @@ -17,20 +17,42 @@ class SiPixelDigisCUDA { SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default; SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default; + void setNModulesDigis(uint32_t nModules, uint32_t nDigis) { + nModules_h = nModules; + nDigis_h = nDigis; + } + + uint32_t nModules() const { return nModules_h; 
} + uint32_t nDigis() const { return nDigis_h; } + uint16_t * xx() { return xx_d.get(); } uint16_t * yy() { return yy_d.get(); } uint16_t * adc() { return adc_d.get(); } uint16_t * moduleInd() { return moduleInd_d.get(); } + int32_t * clus() { return clus_d.get(); } + uint32_t * pdigi() { return pdigi_d.get(); } + uint32_t * rawIdArr() { return rawIdArr_d.get(); } uint16_t const *xx() const { return xx_d.get(); } uint16_t const *yy() const { return yy_d.get(); } uint16_t const *adc() const { return adc_d.get(); } uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } uint16_t const *c_xx() const { return xx_d.get(); } uint16_t const *c_yy() const { return yy_d.get(); } uint16_t const *c_adc() const { return adc_d.get(); } uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } + + cudautils::host::unique_ptr adcToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr pdigiToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr rawIdArrToHostAsync(cuda::stream_t<>& stream) const; class DeviceConstView { public: @@ -41,6 +63,7 @@ class SiPixelDigisCUDA { __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); } __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); } __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); } + __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } #endif friend class SiPixelDigisCUDA; @@ -50,16 +73,27 @@ class SiPixelDigisCUDA { uint16_t const *yy_; uint16_t const *adc_; uint16_t const *moduleInd_; + int32_t const *clus_; }; const DeviceConstView *view() const { return view_d.get(); } private: - edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel - edm::cuda::device::unique_ptr yy_d; // - edm::cuda::device::unique_ptr adc_d; // ADC of each pixel - edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel - edm::cuda::device::unique_ptr view_d; // "me" pointer + // These are consumed by downstream device code + cudautils::device::unique_ptr xx_d; // local coordinates of each pixel + cudautils::device::unique_ptr yy_d; // + cudautils::device::unique_ptr adc_d; // ADC of each pixel + cudautils::device::unique_ptr moduleInd_d; // module id of each pixel + cudautils::device::unique_ptr clus_d; // cluster id of each pixel + cudautils::device::unique_ptr view_d; // "me" pointer + + // These are for CPU output; should we (eventually) place them to a + // separate product? 
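+  // (They are read back on the host with the pdigiToHostAsync() and
+  // rawIdArrToHostAsync() helpers declared above.)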
+ cudautils::device::unique_ptr pdigi_d; + cudautils::device::unique_ptr rawIdArr_d; + + uint32_t nModules_h = 0; + uint32_t nDigis_h = 0; }; #endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc new file mode 100644 index 0000000000000..92aab1ec9d578 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc @@ -0,0 +1,44 @@ +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream): + formatterErrors_h(std::move(errors)) +{ + edm::Service cs; + + error_d = cs->make_device_unique>(stream); + data_d = cs->make_device_unique(maxFedWords, stream); + + cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream); + + error_h = cs->make_host_unique>(stream); + GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + assert(error_h->size() == 0); + assert(error_h->capacity() == static_cast(maxFedWords)); + + cudautils::copyAsync(error_d, error_h, stream); +} + +void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) { + cudautils::copyAsync(error_h, error_d, stream); +} + +SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + // On one hand size() could be sufficient. On the other hand, if + // someone copies the SimpleVector<>, (s)he might expect the data + // buffer to actually have space for capacity() elements. 
+ auto data = cs->make_host_unique(error_h->capacity(), stream); + + // but transfer only the required amount + if(error_h->size() > 0) { + cudautils::copyAsync(data, data_d, error_h->size(), stream); + } + auto err = *error_h; + err.set_data(data.get()); + return HostDataError(std::move(err), std::move(data)); +} diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 7e3d876ac8bdc..ef13ed9612dbf 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -2,24 +2,55 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream) { edm::Service cs; - xx_d = cs->make_device_unique(nelements, stream); - yy_d = cs->make_device_unique(nelements, stream); - adc_d = cs->make_device_unique(nelements, stream); - moduleInd_d = cs->make_device_unique(nelements, stream); + xx_d = cs->make_device_unique(maxFedWords, stream); + yy_d = cs->make_device_unique(maxFedWords, stream); + adc_d = cs->make_device_unique(maxFedWords, stream); + moduleInd_d = cs->make_device_unique(maxFedWords, stream); + clus_d = cs->make_device_unique< int32_t[]>(maxFedWords, stream); + + pdigi_d = cs->make_device_unique(maxFedWords, stream); + rawIdArr_d = cs->make_device_unique(maxFedWords, stream); auto view = cs->make_host_unique(stream); view->xx_ = xx_d.get(); view->yy_ = yy_d.get(); view->adc_ = adc_d.get(); view->moduleInd_ = moduleInd_d.get(); + view->clus_ = clus_d.get(); view_d = cs->make_device_unique(stream); - cudaCheck(cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id())); + cudautils::copyAsync(view_d, view, stream); +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, adc_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, clus_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, pdigi_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, rawIdArr_d, nDigis(), stream); + return ret; } diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h new file mode 100644 index 0000000000000..41b135640b883 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes.h @@ -0,0 +1,9 @@ +#ifndef CUDADataFormats_SiPixelDigi_classes_h +#define CUDADataFormats_SiPixelDigi_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include 
"CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml new file mode 100644 index 0000000000000..9d6816ed3b14c --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py index 96aaeebbfaacd..ed10c78a40c9b 100644 --- a/Configuration/StandardSequences/python/RawToDigi_cff.py +++ b/Configuration/StandardSequences/python/RawToDigi_cff.py @@ -3,7 +3,7 @@ # This object is used to selectively make changes for different running # scenarios. In this case it makes changes for Run 2. -from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import * +from EventFilter.SiPixelRawToDigi.siPixelDigis_cff import * from EventFilter.SiStripRawToDigi.SiStripDigis_cfi import * @@ -49,7 +49,7 @@ from EventFilter.CTPPSRawToDigi.ctppsRawToDigi_cff import * RawToDigiTask = cms.Task(L1TRawToDigiTask, - siPixelDigis, + siPixelDigisTask, siStripDigis, ecalDigis, ecalPreshowerDigis, @@ -64,15 +64,14 @@ ) RawToDigi = cms.Sequence(RawToDigiTask) -RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigis, siStripDigis]) +RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigisTask, siStripDigis]) RawToDigi_noTk = cms.Sequence(RawToDigiTask_noTk) -RawToDigiTask_pixelOnly = cms.Task(siPixelDigis) +RawToDigiTask_pixelOnly = cms.Task(siPixelDigisTask) RawToDigi_pixelOnly = cms.Sequence(RawToDigiTask_pixelOnly) scalersRawToDigi.scalersInputTag = 'rawDataCollector' -from Configuration.ProcessModifiers.gpu_cff import gpu -(~gpu).toModify(siPixelDigis, InputLabel = 'rawDataCollector') +siPixelDigis.cpu.InputLabel = 'rawDataCollector' #false by default anyways ecalDigis.DoRegional = False ecalDigis.InputLabel = 'rawDataCollector' ecalPreshowerDigis.sourceTag = 'rawDataCollector' diff --git a/DataFormats/SiPixelDigi/interface/PixelErrors.h b/DataFormats/SiPixelDigi/interface/PixelErrors.h new file mode 100644 index 0000000000000..5231b7d1f372a --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/PixelErrors.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_SiPixelDigi_PixelErrors_h +#define DataFormats_SiPixelDigi_PixelErrors_h + +#include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h" +#include "FWCore/Utilities/interface/typedefs.h" + +#include +#include + +// Better ideas for the placement of these? 
+
+struct PixelErrorCompact {
+  uint32_t rawId;
+  uint32_t word;
+  unsigned char errorType;
+  unsigned char fedId;
+};
+
+using PixelFormatterErrors = std::map<cms_uint32_t, std::vector<SiPixelRawDataError>>;
+
+#endif
diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h
new file mode 100644
index 0000000000000..1a7710b4fb3c6
--- /dev/null
+++ b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h
@@ -0,0 +1,28 @@
+#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h
+#define DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h
+
+#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
+
+#include <cstdint>
+#include <vector>
+
+class SiPixelDigiErrorsSoA {
+public:
+  SiPixelDigiErrorsSoA() = default;
+  explicit SiPixelDigiErrorsSoA(size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err);
+  ~SiPixelDigiErrorsSoA() = default;
+
+  auto size() const { return error_.size(); }
+
+  const PixelFormatterErrors *formatterErrors() const { return formatterErrors_; }
+
+  const PixelErrorCompact& error(size_t i) const { return error_[i]; }
+
+  const std::vector<PixelErrorCompact>& errorVector() const { return error_; }
+
+private:
+  std::vector<PixelErrorCompact> error_;
+  const PixelFormatterErrors *formatterErrors_ = nullptr;
+};
+
+#endif
diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h
new file mode 100644
index 0000000000000..df249a3790cd2
--- /dev/null
+++ b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h
@@ -0,0 +1,32 @@
+#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
+#define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
+
+#include <cstdint>
+#include <vector>
+
+class SiPixelDigisSoA {
+public:
+  SiPixelDigisSoA() = default;
+  explicit SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus);
+  ~SiPixelDigisSoA() = default;
+
+  auto size() const { return pdigi_.size(); }
+
+  uint32_t pdigi(size_t i) const { return pdigi_[i]; }
+  uint32_t rawIdArr(size_t i) const { return rawIdArr_[i]; }
+  uint16_t adc(size_t i) const { return adc_[i]; }
+  int32_t clus(size_t i) const { return clus_[i]; }
+
+  const std::vector<uint32_t>& pdigiVector() const { return pdigi_; }
+  const std::vector<uint32_t>& rawIdArrVector() const { return rawIdArr_; }
+  const std::vector<uint16_t>& adcVector() const { return adc_; }
+  const std::vector<int32_t>& clusVector() const { return clus_; }
+
+private:
+  std::vector<uint32_t> pdigi_;
+  std::vector<uint32_t> rawIdArr_;
+  std::vector<uint16_t> adc_;
+  std::vector<int32_t> clus_;
+};
+
+#endif
diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc
new file mode 100644
index 0000000000000..ef2b4581fc46e
--- /dev/null
+++ b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc
@@ -0,0 +1,10 @@
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h"
+
+#include <cassert>
+
+SiPixelDigiErrorsSoA::SiPixelDigiErrorsSoA(size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err):
+  error_(error, error+nErrors),
+  formatterErrors_(err)
+{
+  assert(error_.size() == nErrors);
+}
diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc
new file mode 100644
index 0000000000000..ebc8ba2055f78
--- /dev/null
+++ b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc
@@ -0,0 +1,12 @@
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h"
+
+#include <cassert>
+
+SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc,
const int32_t *clus): + pdigi_(pdigi, pdigi+nDigis), + rawIdArr_(rawIdArr, rawIdArr+nDigis), + adc_(adc, adc+nDigis), + clus_(clus, clus+nDigis) +{ + assert(pdigi_.size() == nDigis); +} diff --git a/DataFormats/SiPixelDigi/src/classes.h b/DataFormats/SiPixelDigi/src/classes.h index 0c6a09d852959..256ca41ad1867 100644 --- a/DataFormats/SiPixelDigi/src/classes.h +++ b/DataFormats/SiPixelDigi/src/classes.h @@ -5,6 +5,8 @@ #include "DataFormats/SiPixelDigi/interface/PixelDigiCollection.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigi.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigiError.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" #include "DataFormats/Common/interface/Wrapper.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" diff --git a/DataFormats/SiPixelDigi/src/classes_def.xml b/DataFormats/SiPixelDigi/src/classes_def.xml index de7779a5c00ea..8cabbd3f3f06e 100755 --- a/DataFormats/SiPixelDigi/src/classes_def.xml +++ b/DataFormats/SiPixelDigi/src/classes_def.xml @@ -49,4 +49,10 @@ + + + + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml index f92aa68373927..4d2b5ebf45542 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml @@ -1,4 +1,7 @@ + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc new file mode 100644 index 0000000000000..9e998b92fc403 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc @@ -0,0 +1,183 @@ +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetIdCollection.h" +#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include + +class SiPixelDigiErrorsFromSoA: public edm::stream::EDProducer<> { +public: + explicit SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT digiErrorSoAGetToken_; + + edm::EDPutTokenT> errorPutToken_; + edm::EDPutTokenT tkErrorPutToken_; + edm::EDPutTokenT userErrorPutToken_; + edm::EDPutTokenT> disabledChannelPutToken_; + + 
edm::ESWatcher cablingWatcher_; + std::unique_ptr cabling_; + const std::string cablingMapLabel_; + + const std::vector tkerrorlist_; + const std::vector usererrorlist_; + + const bool usePhase1_; +}; + +SiPixelDigiErrorsFromSoA::SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig): + digiErrorSoAGetToken_{consumes(iConfig.getParameter("digiErrorSoASrc"))}, + errorPutToken_{produces>()}, + tkErrorPutToken_{produces()}, + userErrorPutToken_{produces("UserErrorModules")}, + disabledChannelPutToken_{produces>()}, + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + tkerrorlist_(iConfig.getParameter>("ErrorList")), + usererrorlist_(iConfig.getParameter>("UserErrorList")), + usePhase1_(iConfig.getParameter ("UsePhase1")) +{} + +void SiPixelDigiErrorsFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("digiErrorSoASrc", edm::InputTag("siPixelDigiErrorsSoA")); + desc.add("CablingMapLabel","")->setComment("CablingMap label"); + desc.add("UsePhase1",false)->setComment("## Use phase1"); + desc.add >("ErrorList", std::vector{29})->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); + desc.add >("UserErrorList", std::vector{40})->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // pack errors into collection + + // initialize cabling map or update if necessary + if (cablingWatcher_.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); + cabling_ = cablingMap->cablingTree(); + LogDebug("map version:")<< cabling_->version(); + } + + const auto& digiErrors = iEvent.get(digiErrorSoAGetToken_); + + + edm::DetSetVector errorcollection{}; + DetIdCollection tkerror_detidcollection{}; + DetIdCollection usererror_detidcollection{}; + edmNew::DetSetVector disabled_channelcollection{}; + + PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0 + const PixelDataFormatter::Errors *formatterErrors = digiErrors.formatterErrors(); + assert(formatterErrors != nullptr); + auto errors = *formatterErrors; // make a copy + PixelDataFormatter::DetErrors nodeterrors; + + auto size = digiErrors.size(); + for (auto i = 0U; i < size; i++) { + PixelErrorCompact err = digiErrors.error(i); + if (err.errorType != 0) { + SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); + errors[err.rawId].push_back(error); + } + } + + constexpr uint32_t dummydetid = 0xffffffff; + typedef PixelDataFormatter::Errors::iterator IE; + for (IE is = errors.begin(); is != errors.end(); is++) { + + uint32_t errordetid = is->first; + if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed + nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); + } + else { + edm::DetSet& errorDetSet = errorcollection.find_or_insert(errordetid); + errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); + // Fill detid of the detectors where there is error AND the error number is listed + // in the configurable error list in the job option cfi. 
+ // Code needs to be here, because there can be a set of errors for each + // entry in the for loop over PixelDataFormatter::Errors + + std::vector disabledChannelsDetSet; + + for (auto const& aPixelError : errorDetSet) { + // For the time being, we extend the error handling functionality with ErrorType 25 + // In the future, we should sort out how the usage of tkerrorlist can be generalized + if (aPixelError.getType() == 25) { + int fedId = aPixelError.getFedId(); + const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); + if (fed) { + cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); + const sipixelobjects::PixelFEDLink* link = fed->link(linkId); + if (link) { + // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it + // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme + PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; + for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { + const sipixelobjects::PixelROC * roc = link->roc(iRoc); + if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); + if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); + } + if (ch.roc_first& errorDetSet = errorcollection.find_or_insert(dummydetid); + errorDetSet.data = nodeterrors; + + iEvent.emplace(errorPutToken_, std::move(errorcollection)); + iEvent.emplace(tkErrorPutToken_, std::move(tkerror_detidcollection)); + iEvent.emplace(userErrorPutToken_, std::move(usererror_detidcollection)); + iEvent.emplace(disabledChannelPutToken_, std::move(disabled_channelcollection)); +} + +DEFINE_FWK_MODULE(SiPixelDigiErrorsFromSoA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc new file mode 100644 index 0000000000000..d47542528ed86 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -0,0 +1,75 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +class SiPixelDigiErrorsSoAFromCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiErrorGetToken_; + edm::EDPutTokenT digiErrorPutToken_; + + cudautils::host::unique_ptr data_; + GPU::SimpleVector error_; + const PixelFormatterErrors *formatterErrors_ = nullptr; +}; + +SiPixelDigiErrorsSoAFromCUDA::SiPixelDigiErrorsSoAFromCUDA(const 
edm::ParameterSet& iConfig): + digiErrorGetToken_(consumes>(iConfig.getParameter("src"))), + digiErrorPutToken_(produces()) +{} + +void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_); + + auto tmp = gpuDigiErrors.dataErrorToHostAsync(ctx.stream()); + error_ = std::move(tmp.first); + data_ = std::move(tmp.second); + formatterErrors_ = &(gpuDigiErrors.formatterErrors()); +} + +void SiPixelDigiErrorsSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... + iEvent.emplace(digiErrorPutToken_, error_.size(), error_.data(), formatterErrors_); + + error_ = GPU::make_SimpleVector(0, nullptr); + data_.reset(); + formatterErrors_ = nullptr; +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigiErrorsSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc new file mode 100644 index 0000000000000..068701f0bcf07 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -0,0 +1,81 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + + +class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigisSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiGetToken_; + edm::EDPutTokenT digiPutToken_; + + cudautils::host::unique_ptr 
pdigi_; + cudautils::host::unique_ptr rawIdArr_; + cudautils::host::unique_ptr adc_; + cudautils::host::unique_ptr< int32_t[]> clus_; + + int nDigis_; +}; + +SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig): + digiGetToken_(consumes>(iConfig.getParameter("src"))), + digiPutToken_(produces()) +{} + +void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + + nDigis_ = gpuDigis.nDigis(); + pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); + rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); + adc_ = gpuDigis.adcToHostAsync(ctx.stream()); + clus_ = gpuDigis.clusToHostAsync(ctx.stream()); +} + +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... 
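+  // (The copy happens in the SiPixelDigisSoA constructor, which builds
+  // std::vectors from these pinned buffers; see
+  // DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc.)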
+ iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); + + pdigi_.reset(); + rawIdArr_.reset(); + adc_.reset(); + clus_.reset(); +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigisSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py index c2479af1f60bd..50c8f0fcabd3c 100644 --- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py +++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py @@ -1,14 +1,24 @@ import FWCore.ParameterSet.Config as cms -import EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi +from EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi import siPixelRawToDigi as _siPixelRawToDigi -siPixelDigis = EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi.siPixelRawToDigi.clone() +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelDigis = SwitchProducerCUDA( + cpu = _siPixelRawToDigi.clone() +) from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel -phase1Pixel.toModify(siPixelDigis, UsePhase1=True) - -import RecoLocalTracker.SiPixelClusterizer.siPixelDigiHeterogeneousConverter_cfi -_siPixelDigis_gpu = RecoLocalTracker.SiPixelClusterizer.siPixelDigiHeterogeneousConverter_cfi.siPixelDigiHeterogeneousConverter.clone() -_siPixelDigis_gpu.includeErrors = cms.bool(True) +phase1Pixel.toModify(siPixelDigis.cpu, UsePhase1=True) from Configuration.ProcessModifiers.gpu_cff import gpu -gpu.toReplaceWith(siPixelDigis, _siPixelDigis_gpu) +gpu.toModify(siPixelDigis, + cuda = cms.EDAlias( + siPixelDigiErrors = cms.VPSet( + cms.PSet(type = cms.string("DetIdedmEDCollection")), + cms.PSet(type = cms.string("SiPixelRawDataErroredmDetSetVector")), + cms.PSet(type = cms.string("PixelFEDChanneledmNewDetSetVector")) + ), + siPixelDigisClustersPreSplitting = cms.VPSet( + cms.PSet(type = cms.string("PixelDigiedmDetSetVector")) + ) + ) +) diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py new file mode 100644 index 0000000000000..31ba8596bddc6 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py @@ -0,0 +1,30 @@ +import FWCore.ParameterSet.Config as cms + +from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import siPixelDigis +from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsFromSoA_cfi import siPixelDigiErrorsFromSoA as _siPixelDigiErrorsFromSoA + +siPixelDigisTask = cms.Task(siPixelDigis) + +siPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigiErrorsSoA = _siPixelDigiErrorsSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone() + +from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel +phase1Pixel.toModify(siPixelDigiErrors, UsePhase1=True) + +siPixelDigisTaskCUDA = cms.Task( + siPixelDigisSoA, + siPixelDigiErrorsSoA, + siPixelDigiErrors +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelDigisTask_gpu = siPixelDigisTask.copy() +_siPixelDigisTask_gpu.add(siPixelDigisTaskCUDA) +gpu.toReplaceWith(siPixelDigisTask, _siPixelDigisTask_gpu) diff --git 
a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml index ba6b35c6d0ce7..dc0066701ece3 100644 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -1,6 +1,8 @@ + + diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md new file mode 100644 index 0000000000000..a2bdb67cfba11 --- /dev/null +++ b/HeterogeneousCore/CUDACore/README.md @@ -0,0 +1,636 @@ +# CUDA algorithms in CMSSW + +## Outline + +* [Introduction](#introduction) + * [Design goals](#design-goals) + * [Overall guidelines](#overall-guidelines) +* [Sub-packages](#sub-packages) +* [Examples](#examples) + * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output) + * [Producer with CUDA input](#producer-with-cuda-output) + * [Producer with CUDA output](#producer-with-cuda-input) + * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork) + * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork) + * [Configuration](#configuration) + * [GPU-only configuration](#gpu-only-configuration) + * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules) +* [More details](#more-details) + * [Device choice](#device-choice) + * [Data model](#data-model) + * [CUDA EDProducer](#cuda-edproducer) + * [Class declaration](#class-declaration) + * [Memory allocation](#memory-allocation) + * [Caching allocator](#caching-allocator) + * [CUDA API](#cuda-api) + * [Setting the current device](#setting-the-current-device) + * [Getting input](#getting-input) + * [Calling the CUDA kernels](#calling-the-cuda-kernels) + * [Putting output](#putting-output) + * [`ExternalWork` extension](#externalwork-extension) + * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu) + * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams) + + +## Introduction + +This page documents the CUDA integration within CMSSW + +### Design goals + +1. Provide a mechanism for a chain of modules to share a resource + * Resource can be e.g. CUDA device memory or a CUDA stream +2. Minimize data movements between the CPU and the device +3. Support multiple devices +4. Allow the same job configuration to be used on all hardware combinations + +### Overall guidelines + +1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. + * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc. + * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event + * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator)) + * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` + * With the latter the `assert()` calls in CUDA code are disabled by + default, but can be enabled by defining a `GPU_DEBUG` macro + (before the aforementioned include) +2. Synchronization needs should be fulfilled with + [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) + extension to EDProducers + * `ExternalWork` can be used to replace one synchronization point + (e.g. between device kernels and copying a known amount of data + back to CPU). 
+ * For further synchronization points (e.g. copying data whose + amount is known only at the device side), split the work to + multiple `ExternalWork` producers. This approach has the added + benefit that e.g. data transfers to CPU become on-demand automatically + * A general breakdown of the possible steps: + * Convert input legacy CPU data format to CPU SoA + * Transfer input CPU SoA to GPU + * Launch kernels + * Transfer the number of output elements to CPU + * Transfer the output data from GPU to CPU SoA + * Convert the output SoA to legacy CPU data formats +3. Within `acquire()`/`produce()`, the current CUDA device is set + implicitly and the CUDA stream is provided by the system (with + `CUDAScopedContext`) + * It is strongly recommended to use the provided CUDA stream for all operations + * If that is not feasible for some reason, the provided CUDA + stream must synchronize with the work queued on other CUDA + streams (with CUDA events and `cudaStreamWaitEvent()`) +4. Outside of `acquire()`/`produce()`, CUDA API functions may be + called only if `CUDAService::enabled()` returns `true`. + * With point 3 it follows that in these cases multiple devices have + to be dealt with explicitly, as well as CUDA streams + +## Sub-packages +* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components +* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations +* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data + +## Examples + +### Isolated producer (no CUDA input nor output) + +```cpp +class IsolatedProducerCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + IsolatedProducerGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT outputToken_; +}; +... +void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContext::stream() + gpuAlgo_.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +### Producer with CUDA output + +```cpp +class ProducerOutputCUDA: public edm::stream::EDProducer { +public: + ... 
+
+### Producer with CUDA output
+
+```cpp
+class ProducerOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  ...
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  ...
+private:
+  ...
+  ProducerOutputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<InputData> inputToken_;
+  edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
+  CUDAContextToken ctxTmp_;
+};
+...
+void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  // Sets the current device and creates a CUDA stream
+  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  auto const& inputData = iEvent.get(inputToken_);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream()
+  gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+  // Passes the current device and CUDA stream to produce()
+  // Feels a bit silly, and will hopefully get improved in the future
+  ctxTmp_ = ctx.toToken();
+
+  // Destructor of ctx queues a callback to the CUDA stream notifying
+  // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // Sets again the current device, uses the CUDA stream created in acquire()
+  CUDAScopedContext ctx{std::move(ctxTmp_)};
+
+  // Now getResult() returns data in GPU memory that is passed to the
+  // constructor of OutputData. CUDAScopedContext::emplace() wraps the
+  // OutputData to CUDAProduct<OutputData>. CUDAProduct<T> stores also
+  // the current device and the CUDA stream since those will be needed
+  // on the consumer side.
+  ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
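+
+The constructor is elided above; a minimal sketch of how the tokens
+could be registered there (the `"src"` parameter name is illustrative)
+is:
+
+```cpp
+// Sketch of the constructor: note that the put token is registered
+// for the wrapped product type CUDAProduct<OutputData>.
+ProducerOutputCUDA::ProducerOutputCUDA(edm::ParameterSet const& iConfig):
+  inputToken_{consumes<InputData>(iConfig.getParameter<edm::InputTag>("src"))},
+  outputToken_{produces<CUDAProduct<OutputData>>()}
+{}
+```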
+
+### Producer with CUDA input
+
+```cpp
+class ProducerInputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  ...
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  ...
+private:
+  ...
+  ProducerInputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
+  edm::EDGetTokenT<CUDAProduct<OtherInputData>> otherInputToken_;
+  edm::EDPutTokenT<OutputData> outputToken_;
+};
+...
+void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+  // Set the current device to the same that was used to produce
+  // InputData, and also use the same CUDA stream
+  CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+
+  // Alternatively a new CUDA stream can be created here. This is for
+  // a case where there are two (or more) consumers of
+  // CUDAProduct<InputData> whose work is independent and thus can be
+  // run in parallel.
+  //CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  // Grab the real input data. Checks that the input data is on the
+  // current device. If the input data was produced in a different CUDA
+  // stream than the CUDAScopedContext holds, create an inter-stream
+  // synchronization point with CUDA event and cudaStreamWaitEvent()
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  // Input data from another producer
+  auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
+  // or
+  auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream()
+  gpuAlgo_.makeAsync(inputData, otherInputData, ctx.stream());
+
+  // Destructor of ctx queues a callback to the CUDA stream notifying
+  // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // Real life is likely more complex than this simple example. Here
+  // getResult() returns some data in CPU memory that is passed
+  // directly to the OutputData constructor.
+  iEvent.emplace(outputToken_, gpuAlgo_.getResult());
+}
+```
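+
+As a variant sketch of the alternative mentioned in the comments above,
+a second, independent consumer of the same `CUDAProduct<InputData>`
+could create its own CUDA stream so that the two consumers can run in
+parallel (the module and member names are hypothetical):
+
+```cpp
+// Hypothetical second consumer of the same CUDAProduct<InputData>.
+void IndependentConsumerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup,
+                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  // New CUDA stream, independent of the one InputData was produced with
+  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  // get() inserts the necessary inter-stream synchronization, because
+  // the product was produced in a different CUDA stream
+  auto const& inputData = ctx.get(iEvent, inputToken_);
+
+  gpuAlgo_.makeAsync(inputData, ctx.stream());
+}
+```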
+
+### Producer with CUDA input and output (with ExternalWork)
+
+```cpp
+class ProducerInputOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  ...
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  ...
+private:
+  ...
+  ProducerInputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
+  edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
+  CUDAContextToken ctxTmp_;
+};
+...
+void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+  // Set the current device to the same that was used to produce
+  // InputData, and also use the same CUDA stream
+  CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+
+  // Grab the real input data. Checks that the input data is on the
+  // current device. If the input data was produced in a different CUDA
+  // stream than the CUDAScopedContext holds, create an inter-stream
+  // synchronization point with CUDA event and cudaStreamWaitEvent()
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream()
+  gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+  // Passes the current device and CUDA stream to produce()
+  // Feels a bit silly, and will hopefully get improved in the future
+  ctxTmp_ = ctx.toToken();
+
+  // Destructor of ctx queues a callback to the CUDA stream notifying
+  // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // Sets again the current device, uses the CUDA stream created in acquire()
+  CUDAScopedContext ctx{std::move(ctxTmp_)};
+
+  // Now getResult() returns data in GPU memory that is passed to the
+  // constructor of OutputData. CUDAScopedContext::emplace() wraps the
+  // OutputData to CUDAProduct<OutputData>. CUDAProduct<T> stores also
+  // the current device and the CUDA stream since those will be needed
+  // on the consumer side.
+  ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+### Producer with CUDA input and output (without ExternalWork)
+
+If the producer does not need to transfer anything back to CPU (like
+the number of output elements), the `ExternalWork` extension is not
+needed as there is no need to synchronize.
+
+```cpp
+class ProducerInputOutputCUDA: public edm::global::EDProducer<> {
+public:
+  ...
+  void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const override;
+  ...
+private:
+  ...
+  ProducerInputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
+  edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
+};
+...
+void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const {
+  CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+  // Set the current device to the same that was used to produce
+  // InputData, and also use the same CUDA stream
+  CUDAScopedContext ctx{inputDataWrapped};
+
+  // Grab the real input data. Checks that the input data is on the
+  // current device. If the input data was produced in a different CUDA
+  // stream than the CUDAScopedContext holds, create an inter-stream
+  // synchronization point with CUDA event and cudaStreamWaitEvent()
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream(). Here makeAsync() also
+  // returns data in GPU memory that is passed to the constructor of
+  // OutputData. CUDAScopedContext::emplace() wraps the OutputData to
+  // CUDAProduct<OutputData>. CUDAProduct<T> stores also the current
+  // device and the CUDA stream since those will be needed on the
+  // consumer side.
+  ctx.emplace(iEvent, outputToken_, gpuAlgo_.makeAsync(inputData, ctx.stream()));
+
+  // Without ExternalWork there is no waitingTaskHolder to notify, so
+  // the destructor of ctx does not queue a callback here
+}
+```
+
+### Configuration
+
+#### GPU-only configuration
+
+For a GPU-only configuration there is nothing special to be done, just
+construct the Paths/Sequences/Tasks from the GPU modules.
+
+#### Automatic switching between CPU and GPU modules
+
+The `SwitchProducer` mechanism can be used to switch automatically
+between CPU and GPU modules based on the availability of GPUs on the
+machine where the configuration is done. The framework decides at the
+beginning of the job which of the modules to run for a given module
+label.
+
+The framework requires that the modules in the switch produce the
+same types of output products (the closer the actual results are the
+better, but the framework cannot enforce that). This means that for a
+chain of GPU modules, it is the module that transforms the SoA data
+format back to the legacy data formats (possibly, but not necessarily,
+transferring the SoA data from GPU to CPU) that should be switched
+with the legacy CPU module. The rest of the GPU modules should be
+placed in a `Task`, in which case the framework runs them only if their
+output is needed by another module.
+
+```python
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+process.foo = SwitchProducerCUDA(
+    cpu = cms.EDProducer("FooProducer"), # legacy CPU
+    cuda = cms.EDProducer("FooProducerFromCUDA",
+        src = cms.InputTag("fooCUDA")
+    )
+)
+process.fooCUDA = cms.EDProducer("FooProducerCUDA")
+
+process.fooTaskCUDA = cms.Task(process.fooCUDA)
+process.fooTask = cms.Task(
+    process.foo,
+    process.fooTaskCUDA
+)
+```
+
+For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py).
+
+
+
+
+## More details
+
+### Device choice
+
+As discussed above, with `SwitchProducer` the choice between CPU and
+GPU modules is done at the beginning of the job.
+
+For a multi-GPU setup the device is chosen in the first CUDA module in a
+chain of modules by one of the constructors of `CUDAScopedContext`
+```cpp
+CUDAScopedContext ctx{iEvent.streamID()};
+```
+As the choice is currently a static assignment of EDM streams to
+devices, the EDM stream ID is needed. The logic will likely evolve in
+the future to be more dynamic, and likely the device choice has to be
+made for the full event.
+
+### Data model
+
+The "GPU data product" should be a class/struct containing smart
+pointer(s) to device data (see [Memory allocation](#memory-allocation)).
+When putting the data to the event, the data is wrapped to the
+`CUDAProduct<T>` template, which holds
+* the GPU data product
+  * must be movable, but no other restrictions
+* the current device where the data was produced, and the CUDA stream the data was produced with
+* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams)
+
+Note that the `CUDAProduct<T>` wrapper can be constructed only with
+`CUDAScopedContext::wrap()`, and the data `T` can be obtained from it
+only with `CUDAScopedContext::get()`, as described further below. When
+putting the data product directly to `edm::Event`, also
+`CUDAScopedContext::emplace()` can be used.
+
+The GPU data products that depend on the CUDA runtime should be placed
+under the `CUDADataFormats` package, using the same name for the
+sub-package that would be used in `DataFormats`. Everything else, e.g.
+SoA for CPU, should go under `DataFormats` as usual.
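+
+A minimal sketch of such a GPU data product, assuming illustrative
+class and member names, could look like the following:
+
+```cpp
+// Illustrative GPU data product: only smart pointers to device
+// memory, movable, and no CUDA API calls in the interface.
+class GPUClusters {
+public:
+  GPUClusters() = default;
+  GPUClusters(cudautils::device::unique_ptr<float[]> x, uint32_t nClusters):
+    x_(std::move(x)), nClusters_(nClusters) {}
+
+  float const* x() const { return x_.get(); }
+  uint32_t nClusters() const { return nClusters_; }
+
+private:
+  cudautils::device::unique_ptr<float[]> x_; // device memory
+  uint32_t nClusters_ = 0;
+};
+```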
+
+### CUDA EDProducer
+
+#### Class declaration
+
+The CUDA producers are normal EDProducers. The `ExternalWork`
+extension should be used if a synchronization between the GPU and CPU
+is needed, e.g. when transferring data from GPU to CPU.
+
+#### Memory allocation
+
+##### Caching allocator
+
+The memory allocations should be done dynamically with `CUDAService`
+```cpp
+edm::Service<CUDAService> cs;
+cudautils::device::unique_ptr<float[]> device_buffer = cs->make_device_unique<float[]>(50, cudaStream);
+cudautils::host::unique_ptr<float[]> host_buffer = cs->make_host_unique<float[]>(50, cudaStream);
+```
+
+in the `acquire()` and `produce()` functions. The same
+`cuda::stream_t<>` object that is used for transfers and kernels
+should be passed to the allocator.
+
+The allocator is based on `cub::CachingDeviceAllocator`. The memory is
+guaranteed to be reserved
+* for the host: up to the destructor of the `unique_ptr`
+* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished
+
+##### CUDA API
+
+The `cudaMalloc()` etc. may be used outside of the event loop, but that
+should be limited to only relatively small allocations in order to
+allow as much re-use of device memory as possible.
+
+If really needed, `cudaMalloc()` etc. may be used also within the
+event loop, but then the cost of allocation and implicit
+synchronization should be explicitly amortized e.g. by caching.
+
+#### Setting the current device
+
+A CUDA producer should construct `CUDAScopedContext` in `acquire()`
+(`produce()` if not using `ExternalWork`) either with `edm::StreamID`,
+or with a `CUDAProduct<T>` read as an input.
+
+```cpp
+// From edm::StreamID
+CUDAScopedContext ctx{iEvent.streamID()};
+
+// From CUDAProduct<T>
+CUDAProduct<GPUClusters> const& cclus = iEvent.get(srcToken_);
+CUDAScopedContext ctx{cclus};
+```
+
+`CUDAScopedContext` works in the RAII way and does the following
+* Sets the current device for the current scope
+  - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream
+  - If constructed from the `CUDAProduct<T>`, uses the same device and CUDA stream as was used to produce the `CUDAProduct<T>`
+* Gives access to the CUDA stream the algorithm should use to queue asynchronous work
+* Calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary
+* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams)
+* Needed to get/put `CUDAProduct<T>` from/to the event
+
+In case of multiple input products, from possibly different CUDA
+streams and/or CUDA devices, this approach gives the developer full
+control over which of them the kernels of the algorithm are run.
+
+#### Getting input
+
+The real product (`T`) can be obtained from `CUDAProduct<T>` only with
+the help of `CUDAScopedContext`.
+
+```cpp
+// From CUDAProduct<T>
+CUDAProduct<GPUClusters> const& cclus = iEvent.get(srcToken_);
+GPUClusters const& clus = ctx.get(cclus);
+
+// Directly from Event
+GPUClusters const& clus = ctx.get(iEvent, srcToken_);
+```
+
+This step is needed to
+* check that the data are on the same CUDA device
+  * if not, throw an exception (with unified memory could prefetch instead)
+* if the CUDA streams are different, synchronize between them
+
+#### Calling the CUDA kernels
+
+It is usually best to wrap the CUDA kernel calls to a separate class,
+and then call methods of that class from the EDProducer. The only
+requirement is that the CUDA stream where to queue the operations
+should be the one from the `CUDAScopedContext`
+
+```cpp
+gpuAlgo.makeClustersAsync(..., ctx.stream());
+```
+
+If necessary, different CUDA streams may be used internally, but they
+should be made to synchronize with the provided CUDA stream with
+CUDA events and `cudaStreamWaitEvent()`, as sketched below.
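+
+A minimal sketch of such an internal synchronization with the raw CUDA
+API; `kernelA`, `kernelB` and `internalStream` are hypothetical
+placeholders:
+
+```cpp
+// Make the provided stream wait for work queued to an internal stream.
+cudaEvent_t ev;
+cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
+kernelA<<<blocks, threadsPerBlock, 0, internalStream>>>(/* ... */);
+cudaEventRecord(ev, internalStream);
+// All work queued to the provided stream after this point waits for kernelA
+cudaStreamWaitEvent(ctx.stream().id(), ev, 0);
+kernelB<<<blocks, threadsPerBlock, 0, ctx.stream().id()>>>(/* ... */);
+cudaEventDestroy(ev); // safe: the wait has already been enqueued
+```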
+
+#### Putting output
+
+The GPU data needs to be wrapped to the `CUDAProduct<T>` template with
+`CUDAScopedContext::wrap()` or `CUDAScopedContext::emplace()`
+
+```cpp
+GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream());
+std::unique_ptr<CUDAProduct<GPUClusters>> ret = ctx.wrap(clusters);
+iEvent.put(std::move(ret));
+
+// or with one line
+iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream())));
+
+// or avoid one unique_ptr with emplace
+edm::EDPutTokenT<CUDAProduct<GPUClusters>> putToken_ = produces<CUDAProduct<GPUClusters>>(); // in constructor
+...
+ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream()));
+```
+
+This step is needed to
+* store the current device and CUDA stream into `CUDAProduct<T>`
+* record the CUDA event needed for CUDA stream synchronization
+
+#### `ExternalWork` extension
+
+Everything above works both with and without `ExternalWork`.
+
+Without `ExternalWork` the `EDProducer`s act similarly to TBB
+flowgraph's "streaming node". In other words, they just queue more
+asynchronous work to the CUDA stream in their `produce()`.
+
+The `ExternalWork` is needed when one would otherwise call
+`cudaStreamSynchronize()`, for example when transferring something to
+CPU that is needed for downstream DQM, or when queueing more
+asynchronous work. With `ExternalWork` an `acquire()` method needs to
+be implemented that gets an `edm::WaitingTaskWithArenaHolder`
+parameter. The `edm::WaitingTaskWithArenaHolder` should then be passed
+to the constructor of `CUDAScopedContext` along the lines of the
+following example
+
+```cpp
+void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAProduct<InputData> const& cclus = iEvent.get(token_);
+  CUDAScopedContext ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well
+  ...
+```
+
+When constructed this way, `CUDAScopedContext` registers a callback
+function to the CUDA stream in its destructor to call
+`waitingTaskHolder.doneWaiting()`.
+
+A GPU->GPU producer needs a `CUDAScopedContext` also in its
+`produce()`. Currently the best way is to store the state of
+`CUDAScopedContext` to a `CUDAContextToken` member variable:
+
+```cpp
+class FooProducerCUDA ... {
+  ...
+  CUDAContextToken ctxTmp_;
+};
+
+void acquire(...) {
+  ...
+  ctxTmp_ = ctx.toToken();
+}
+
+void produce(...) {
+  ...
+  CUDAScopedContext ctx{std::move(ctxTmp_)};
+}
+```
+
+Ideas for improvements are welcome.
+
+
+#### Transferring GPU data to CPU
+
+The GPU->CPU data transfer needs synchronization to ensure that the
+CPU memory has all the data before it is put into the event. This
+means the `ExternalWork` needs to be used along the following lines:
+* In `acquire()`
+  * (allocate CPU memory buffers)
+  * Queue all GPU->CPU transfers asynchronously
+* In `produce()`
+  * If needed, read additional CPU products (e.g. from `edm::Ref`s)
+  * Reformat data back to legacy data formats
+  * Note: `CUDAScopedContext` is **not** needed in `produce()`
+
+#### Synchronizing between CUDA streams
+
+In case the producer needs input data that were produced in two (or
+more) CUDA streams, these streams have to be synchronized. Here this
+synchronization is achieved with CUDA events.
+
+Each `CUDAProduct<T>` contains also a CUDA event object. The call to
+`CUDAScopedContext::wrap()` will *record* the event in the CUDA stream.
+This means that when all work queued to the CUDA stream up to that
+point has been finished, the CUDA event becomes *occurred*. Then, in
+`CUDAScopedContext::get()`, if the `CUDAProduct<T>` to get from has a
+different CUDA stream than the `CUDAScopedContext`,
+`cudaStreamWaitEvent(stream, event)` is called. This means that all
+subsequent work queued to the CUDA stream will wait for the CUDA event
+to become occurred. Therefore this subsequent work can assume that the
+CUDA product being obtained exists.
+
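+In code, this logic condenses to roughly the following; it is a
+simplified sketch of what `CUDAScopedContext::get()` does via
+`CUDAScopedContext::synchronizeStreams()`:
+
+```cpp
+// Simplified sketch of the synchronization done in CUDAScopedContext::get(data)
+if (data.device() != currentDevice_) {
+  throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
+}
+if (data.stream().id() != stream_->id() && !data.event().has_occurred()) {
+  // Subsequent work queued to this stream waits until the product is ready
+  cudaStreamWaitEvent(stream_->id(), data.event().id(), 0);
+}
+```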
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h
new file mode 100644
index 0000000000000..1a599132d13f1
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h
@@ -0,0 +1,38 @@
+#ifndef HeterogeneousCore_CUDACore_CUDAContextToken_h
+#define HeterogeneousCore_CUDACore_CUDAContextToken_h
+
+#include <cuda/api_wrappers.h>
+
+/**
+ * The purpose of this class is to deliver the device and CUDA stream
+ * information from ExternalWork's acquire() to produce() via a
+ * member/StreamCache variable.
+ */
+class CUDAContextToken {
+public:
+  CUDAContextToken() = default;
+  ~CUDAContextToken() = default;
+
+  CUDAContextToken(const CUDAContextToken&) = delete;
+  CUDAContextToken& operator=(const CUDAContextToken&) = delete;
+  CUDAContextToken(CUDAContextToken&&) = default;
+  CUDAContextToken& operator=(CUDAContextToken&& other) = default;
+
+private:
+  friend class CUDAScopedContext;
+
+  explicit CUDAContextToken(int device, std::shared_ptr<cuda::stream_t<>> stream):
+    stream_(std::move(stream)),
+    device_(device)
+  {}
+
+  int device() { return device_; }
+  std::shared_ptr<cuda::stream_t<>>&& streamPtr() {
+    return std::move(stream_);
+  }
+
+  std::shared_ptr<cuda::stream_t<>> stream_;
+  int device_;
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
new file mode 100644
index 0000000000000..ef87d017373f8
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
@@ -0,0 +1,109 @@
+#ifndef HeterogeneousCore_CUDACore_CUDAScopedContext_h
+#define HeterogeneousCore_CUDACore_CUDAScopedContext_h
+
+#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/EDPutToken.h"
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
+
+#include <cuda/api_wrappers.h>
+
+#include <optional>
+
+namespace cudatest {
+  class TestCUDAScopedContext;
+}
+
+/**
+ * The aim of this class is to do necessary per-event "initialization":
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
+class CUDAScopedContext {
+public:
+  explicit CUDAScopedContext(edm::StreamID streamID);
+
+  explicit CUDAScopedContext(CUDAContextToken&& token):
+    currentDevice_(token.device()),
+    setDeviceForThisScope_(currentDevice_),
+    stream_(std::move(token.streamPtr()))
+  {}
+
+  template <typename T>
+  explicit CUDAScopedContext(const CUDAProduct<T>& data):
+    currentDevice_(data.device()),
+    setDeviceForThisScope_(currentDevice_),
+    stream_(data.streamPtr())
+  {}
+
+  explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
+    CUDAScopedContext(streamID)
+  {
+    waitingTaskHolder_ = std::move(waitingTaskHolder);
+  }
+
+  template <typename T>
+  explicit CUDAScopedContext(const CUDAProduct<T>& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
+    CUDAScopedContext(data)
+  {
+    waitingTaskHolder_ = std::move(waitingTaskHolder);
+  }
+
+  ~CUDAScopedContext();
+
+  int device() const { return currentDevice_; }
+
+  cuda::stream_t<>& stream() { return *stream_; }
+  const cuda::stream_t<>& stream() const { return *stream_; }
+  const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }
+
+  CUDAContextToken toToken() {
+    return CUDAContextToken(currentDevice_, stream_);
+  }
+
+  template <typename T>
+  const T& get(const CUDAProduct<T>& data) {
+    synchronizeStreams(data.device(), data.stream(), data.event());
+    return data.data_;
+  }
+
+  template <typename T>
+  const T& get(const edm::Event& iEvent, edm::EDGetTokenT<CUDAProduct<T>> token) {
+    return get(iEvent.get(token));
+  }
+
+  template <typename T>
+  std::unique_ptr<CUDAProduct<T>> wrap(T data) const {
+    // make_unique doesn't work because of private constructor
+    //
+    // CUDAProduct<T> constructor records CUDA event to the CUDA
+    // stream.
+    // The event will become "occurred" after all work queued
+    // to the stream before this point has been finished.
+    return std::unique_ptr<CUDAProduct<T>>(new CUDAProduct<T>(device(), streamPtr(), std::move(data)));
+  }
+
+  template <typename T, typename... Args>
+  auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) const {
+    return iEvent.emplace(token, device(), streamPtr(), std::forward<Args>(args)...);
+  }
+
+private:
+  friend class cudatest::TestCUDAScopedContext;
+
+  // This constructor is only meant for testing
+  explicit CUDAScopedContext(int device, std::unique_ptr<cuda::stream_t<>> stream);
+
+  void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent);
+
+  int currentDevice_;
+  std::optional<edm::WaitingTaskWithArenaHolder> waitingTaskHolder_;
+  cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
+  std::shared_ptr<cuda::stream_t<>> stream_;
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
new file mode 100644
index 0000000000000..ded114e2fddfe
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
@@ -0,0 +1,34 @@
+import FWCore.ParameterSet.Config as cms
+
+_cuda_enabled_cached = None
+
+def _switch_cuda():
+    global _cuda_enabled_cached
+    if _cuda_enabled_cached is None:
+        import os
+        _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0)
+    return (_cuda_enabled_cached, 2)
+
+class SwitchProducerCUDA(cms.SwitchProducer):
+    def __init__(self, **kargs):
+        super(SwitchProducerCUDA,self).__init__(
+            dict(cpu = cms.SwitchProducer.getCpu(),
+                 cuda = _switch_cuda),
+            **kargs
+        )
+cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA")
+
+if __name__ == "__main__":
+    import unittest
+
+    class TestSwitchProducerCUDA(unittest.TestCase):
+        def testPickle(self):
+            import pickle
+            sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar"))
+            pkl = pickle.dumps(sp)
+            unpkl = pickle.loads(pkl)
+            self.assertEqual(unpkl.cpu.type_(), "Foo")
+            self.assertEqual(unpkl.cuda.type_(), "Bar")
+
+    unittest.main()
+
diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
new file mode 100644
index 0000000000000..a29fbee36865f
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
@@ -0,0 +1,67 @@
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/Exception.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseCUDADevice.h"
+
+
+CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID):
+  currentDevice_(cudacore::chooseCUDADevice(streamID)),
+  setDeviceForThisScope_(currentDevice_)
+{
+  edm::Service<CUDAService> cs;
+  stream_ = cs->getCUDAStream();
+}
+
+CUDAScopedContext::CUDAScopedContext(int device, std::unique_ptr<cuda::stream_t<>> stream):
+  currentDevice_(device),
+  setDeviceForThisScope_(device),
+  stream_(std::move(stream))
+{}
+
+CUDAScopedContext::~CUDAScopedContext() {
+  if(waitingTaskHolder_.has_value()) {
+    stream_->enqueue.callback([device=currentDevice_,
+                               waitingTaskHolder=*waitingTaskHolder_]
+                              (cuda::stream::id_t streamId, cuda::status_t status) mutable {
+      if(cuda::is_success(status)) {
+        LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " << streamId;
+        waitingTaskHolder.doneWaiting(nullptr);
+      }
+      else {
+        // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+        try {
+          auto error = cudaGetErrorName(status);
+          auto message = cudaGetErrorString(status);
+          throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << message;
+        } catch(cms::Exception&) {
+          waitingTaskHolder.doneWaiting(std::current_exception());
+        }
+      }
+    });
+  }
+}
+
+void CUDAScopedContext::synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent) {
+  if(dataDevice != currentDevice_) {
+    // Eventually replace with prefetch to current device (assuming unified memory works)
+    // If we won't go to unified memory, need to figure out something else...
+    throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
+  }
+
+  if(dataStream.id() != stream_->id()) {
+    // Different streams, need to synchronize
+    if(!dataEvent.has_occurred()) {
+      // Event has not yet occurred, so need to add synchronization
+      // here. Synchronization is done by making the CUDA stream wait
+      // for an event, so all subsequent work in the stream will run
+      // only after the event has "occurred" (i.e. the data product
+      // became available).
+      auto ret = cudaStreamWaitEvent(stream_->id(), dataEvent.id(), 0);
+      cuda::throw_if_error(ret, "Failed to make a stream to wait for an event");
+    }
+  }
+}
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
new file mode 100644
index 0000000000000..a582ed2f72866
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -0,0 +1,18 @@
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseCUDADevice.h"
+
+namespace cudacore {
+  int chooseCUDADevice(edm::StreamID id) {
+    edm::Service<CUDAService> cudaService;
+
+    // For starters we "statically" assign the device based on
+    // edm::Stream number. This is suboptimal if the number of
+    // edm::Streams is not a multiple of the number of CUDA devices
+    // (and even then there is no load balancing).
+    //
+    // TODO: improve the "assignment" logic
+    return id % cudaService->numberOfDevices();
+  }
+}
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h
new file mode 100644
index 0000000000000..bb09c302af7f5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h
@@ -0,0 +1,10 @@
+#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h
+#define HeterogeneousCore_CUDACore_chooseCUDADevice_h
+
+#include "FWCore/Utilities/interface/StreamID.h"
+
+namespace cudacore {
+  int chooseCUDADevice(edm::StreamID id);
+}
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
new file mode 100644
index 0000000000000..cd2c3b094243c
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
new file mode 100644
index 0000000000000..eda2b94f5dfb4
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -0,0 +1,119 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+#include "test_CUDAScopedContextKernels.h"
+
+namespace cudatest {
+  class TestCUDAScopedContext {
+  public:
+    static
+    CUDAScopedContext make(int dev) {
+      auto device = cuda::device::get(dev);
+      return CUDAScopedContext(dev, std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)));
+    }
+  };
+}
+
+namespace {
+  std::unique_ptr<CUDAProduct<int *>> produce(int device, int *d, int *h) {
+    auto ctx = cudatest::TestCUDAScopedContext::make(device);
+
+    cuda::memory::async::copy(d, h, sizeof(int), ctx.stream().id());
+    testCUDAScopedContextKernels_single(d, ctx.stream());
+    return ctx.wrap(d);
+  }
+}
+
+TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
+  exitSansCUDADevices();
+
+  constexpr int defaultDevice = 0;
+  {
+    auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice);
+
+    SECTION("Construct from device ID") {
+      REQUIRE(cuda::device::current::get().id() == defaultDevice);
+    }
+
+    SECTION("Wrap T to CUDAProduct<T>") {
+      std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+      REQUIRE(dataPtr.get() != nullptr);
+      REQUIRE(dataPtr->device() == ctx.device());
+      REQUIRE(dataPtr->stream().id() == ctx.stream().id());
+    }
+
+    SECTION("Construct from CUDAProduct<T>") {
+      std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+      const auto& data = *dataPtr;
+
+      CUDAScopedContext ctx2{data};
+      REQUIRE(cuda::device::current::get().id() == data.device());
+      REQUIRE(ctx2.stream().id() == data.stream().id());
+    }
+
+    SECTION("Storing state as CUDAContextToken") {
+      CUDAContextToken ctxtok;
+      { // acquire
+        std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+        const auto& data = *dataPtr;
+        CUDAScopedContext ctx2{data};
+        ctxtok = ctx2.toToken();
+      }
+
+      { // produce
+        CUDAScopedContext ctx2{std::move(ctxtok)};
+        REQUIRE(cuda::device::current::get().id() == ctx.device());
+        REQUIRE(ctx2.stream().id() == ctx.stream().id());
+      }
+    }
+
+    SECTION("Joining multiple CUDA streams") {
+      cuda::device::current::scoped_override_t<> setDeviceForThisScope(defaultDevice);
+      auto current_device = cuda::device::current::get();
+
+      // Mimic a producer on the first CUDA stream
+      int h_a1 = 1;
+      auto d_a1 = cuda::memory::device::make_unique<int>(current_device);
+      auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1);
+
+      // Mimic a producer on the second CUDA stream
+      int h_a2 = 2;
+      auto d_a2 = cuda::memory::device::make_unique<int>(current_device);
+      auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2);
+
+      REQUIRE(wprod1->stream().id() != wprod2->stream().id());
+
+      // Mimic a third producer "joining" the two streams
+      CUDAScopedContext ctx2{*wprod1};
+
+      auto prod1 = ctx2.get(*wprod1);
+      auto prod2 = ctx2.get(*wprod2);
+
+      auto d_a3 = cuda::memory::device::make_unique<int>(current_device);
+      testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream());
+      ctx2.stream().synchronize();
+      REQUIRE(wprod2->event().has_occurred());
+
+      h_a1 = 0;
+      h_a2 = 0;
+      int h_a3 = 0;
+      cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx2.stream().id());
+      cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx2.stream().id());
+      cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx2.stream().id());
+      // Wait for the asynchronous copies to finish before checking the results
+      ctx2.stream().synchronize();
+
+      REQUIRE(h_a1 == 2);
+      REQUIRE(h_a2 == 4);
+      REQUIRE(h_a3 == 6);
+    }
+  }
+
+  // Destroy and clean up all resources so that the next test can
+  // assume to start from a clean state.
+  cudaCheck(cudaSetDevice(defaultDevice));
+  cudaCheck(cudaDeviceSynchronize());
+  cudaDeviceReset();
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu
new file mode 100644
index 0000000000000..18bdf50abeaa5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu
@@ -0,0 +1,24 @@
+#include "test_CUDAScopedContextKernels.h"
+
+#include <cuda/api_wrappers.h>
+#include <cuda_runtime.h>
+
+namespace {
+  __global__
+  void single_mul(int *d) {
+    d[0] = d[0]*2;
+  }
+
+  __global__
+  void join_add(const int *d1, const int *d2, int *d3) {
+    d3[0] = d1[0] + d2[0];
+  }
+}
+
+void testCUDAScopedContextKernels_single(int *d, cuda::stream_t<>& stream) {
+  single_mul<<<1, 1, 0, stream.id()>>>(d);
+}
+
+void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cuda::stream_t<>& stream) {
+  join_add<<<1, 1, 0, stream.id()>>>(d1, d2, d3);
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h
new file mode 100644
index 0000000000000..9d3f9ce33bc97
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h
@@ -0,0 +1,9 @@
+#ifndef HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h
+#define HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h
+
+#include <cuda/api_wrappers.h>
+
+void testCUDAScopedContextKernels_single(int *d, cuda::stream_t<>& stream);
+void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cuda::stream_t<>& stream);
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_main.cc
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
index a116ca8d78d33..041ed25ba134a 100644
--- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
+++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -1,3 +1,7 @@
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
new file mode 100644
index 0000000000000..b24f05adb2213
---
/dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#include + +int main() { + int devices = 0; + auto status = cudaGetDeviceCount(& devices); + if (status != cudaSuccess) { + return EXIT_FAILURE; + } + + int minimumMajor = 6; // min minor is implicitly 0 + + // This approach (requiring all devices are supported) is rather + // conservative. In principle we could consider just dropping the + // unsupported devices. Currently that would be easiest to achieve + // in CUDAService though. + for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + + if(properties.major < minimumMajor) { + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 9555321f5153a..a7c416c17ed63 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -8,8 +8,8 @@ #include "FWCore/Utilities/interface/StreamID.h" -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" namespace edm { class ParameterSet; @@ -19,16 +19,16 @@ namespace edm { namespace cudaserviceimpl { template - struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using non_array = cudautils::device::unique_ptr; }; template - struct make_device_unique_selector { using unbounded_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using unbounded_array = cudautils::device::unique_ptr; }; template struct make_device_unique_selector { struct bounded_array {}; }; template - struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr; }; + struct make_host_unique_selector { using non_array = cudautils::host::unique_ptr; }; template - struct make_host_unique_selector { using unbounded_array = edm::cuda::host::unique_ptr; }; + struct make_host_unique_selector { using unbounded_array = cudautils::host::unique_ptr; }; template struct make_host_unique_selector { struct bounded_array {}; }; } @@ -76,9 +76,9 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = allocate_device(dev, sizeof(T), stream); return typename cudaserviceimpl::make_device_unique_selector::non_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free_device(dev, ptr); - }); + cudautils::device::impl::DeviceDeleter([this, dev](void *ptr) { + this->free_device(dev, ptr); + })); } template @@ -89,9 +89,9 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = allocate_device(dev, n*sizeof(element_type), stream); return typename cudaserviceimpl::make_device_unique_selector::unbounded_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free_device(dev, ptr); - }); + cudautils::device::impl::DeviceDeleter([this, dev](void *ptr) { + this->free_device(dev, ptr); + })); } template @@ -105,9 +105,9 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(sizeof(T), stream); return typename cudaserviceimpl::make_host_unique_selector::non_array(reinterpret_cast(mem), 
- [this](void *ptr) { - this->free_host(ptr); - }); + cudautils::host::impl::HostDeleter([this](void *ptr) { + this->free_host(ptr); + })); } template @@ -117,9 +117,9 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(n*sizeof(element_type), stream); return typename cudaserviceimpl::make_host_unique_selector::unbounded_array(reinterpret_cast(mem), - [this](void *ptr) { - this->free_host(ptr); - }); + cudautils::host::impl::HostDeleter([this](void *ptr) { + this->free_host(ptr); + })); } template @@ -132,6 +132,14 @@ class CUDAService { // Free pinned host memory (to be called from unique_ptr) void free_host(void *ptr); + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + std::shared_ptr> getCUDAStream(); + + // Gets a (cached) CUDA event for the current device. The event + // will be returned to the cache by the shared_ptr destructor. + std::shared_ptr getCUDAEvent(); + private: // PIMPL to hide details of allocator struct Allocator; @@ -139,6 +147,14 @@ class CUDAService { void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream); void *allocate_host(size_t nbytes, cuda::stream_t<>& stream); + // PIMPL to hide details of the CUDA stream cache + struct CUDAStreamCache; + std::unique_ptr cudaStreamCache_; + + // PIMPL to hide details of the CUDA event cache + struct CUDAEventCache; + std::unique_ptr cudaEventCache_; + int numberOfDevices_ = 0; unsigned int numberOfStreamsTotal_ = 0; std::vector> computeCapabilities_; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index babe062f9bab2..e776c349f2e6c 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -9,6 +9,7 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ReusableObjectHolder.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -98,7 +99,7 @@ namespace { cudaCheck(cudaGetDevice(&device)); for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { return cs.make_device_unique(size, stream); }, bufferSizes); } @@ -106,7 +107,7 @@ namespace { } void hostPreallocate(CUDAService& cs, const std::vector& bufferSizes) { - preallocate([&](size_t size, cuda::stream_t<>& stream) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { return cs.make_host_unique(size, stream); }, bufferSizes); } @@ -336,6 +337,11 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << "cub::CachingDeviceAllocator disabled\n"; } + cudaStreamCache_ = std::make_unique(numberOfDevices_); + cudaEventCache_ = std::make_unique(numberOfDevices_); + + log << "\n"; + log << "CUDAService fully initialized"; enabled_ = true; @@ -350,6 +356,8 @@ CUDAService::~CUDAService() { if(allocator_) { allocator_.reset(); } + cudaEventCache_.reset(); + cudaStreamCache_.reset(); for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); @@ -490,3 +498,36 @@ void CUDAService::free_host(void *ptr) { cuda::throw_if_error(cudaFreeHost(ptr)); } } + + +// 
CUDA stream cache
+struct CUDAService::CUDAStreamCache {
+  explicit CUDAStreamCache(int ndev): cache(ndev) {}
+
+  // Separate caches for each device for fast lookup
+  std::vector<edm::ReusableObjectHolder<cuda::stream_t<>>> cache;
+};
+
+std::shared_ptr<cuda::stream_t<>> CUDAService::getCUDAStream() {
+  return cudaStreamCache_->cache[getCurrentDevice()].makeOrGet([](){
+    auto current_device = cuda::device::current::get();
+    return std::make_unique<cuda::stream_t<>>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream));
+  });
+}
+
+// CUDA event cache
+struct CUDAService::CUDAEventCache {
+  explicit CUDAEventCache(int ndev): cache(ndev) {}
+
+  // Separate caches for each device for fast lookup
+  std::vector<edm::ReusableObjectHolder<cuda::event_t>> cache;
+};
+
+std::shared_ptr<cuda::event_t> CUDAService::getCUDAEvent() {
+  return cudaEventCache_->cache[getCurrentDevice()].makeOrGet([](){
+    auto current_device = cuda::device::current::get();
+    // We should not return a recorded, but not-yet-occurred event
+    return std::make_unique<cuda::event_t>(current_device.create_event(cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much?
+                                                                       cuda::event::dont_record_timings)); // it should be a bit faster to ignore timings
+  });
+}
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
index d0a1afcc8203f..4764d74c427c2 100644
--- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -179,8 +179,8 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
     ps.addUntrackedParameter("allocator", alloc);
     auto cs = makeCUDAService(ps, ar);
     cs.setCurrentDevice(0);
-    auto current_device = cuda::device::current::get();
-    auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
+    auto cudaStreamPtr = cs.getCUDAStream();
+    auto& cudaStream = *cudaStreamPtr;
     SECTION("Destructor") {
       auto ptr = cs.make_device_unique(cudaStream);
@@ -214,8 +214,8 @@
     ps.addUntrackedParameter("allocator", alloc);
     auto cs = makeCUDAService(ps, ar);
     cs.setCurrentDevice(0);
-    auto current_device = cuda::device::current::get();
-    auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
+    auto cudaStreamPtr = cs.getCUDAStream();
+    auto& cudaStream = *cudaStreamPtr;
     SECTION("Destructor") {
       auto ptr = cs.make_host_unique(cudaStream);
diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml
new file mode 100644
index 0000000000000..112c200812d98
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/BuildFile.xml
@@ -0,0 +1,3 @@
+
+
diff --git a/HeterogeneousCore/CUDATest/interface/CUDAThing.h b/HeterogeneousCore/CUDATest/interface/CUDAThing.h
new file mode 100644
index 0000000000000..ecda1f2aafdf6
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/interface/CUDAThing.h
@@ -0,0 +1,19 @@
+#ifndef HeterogeneousCore_CUDATest_CUDAThing_H
+#define HeterogeneousCore_CUDATest_CUDAThing_H
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+class CUDAThing {
+public:
+  CUDAThing() = default;
+  CUDAThing(cudautils::device::unique_ptr<float[]> ptr):
+    ptr_(std::move(ptr))
+  {}
+
+  const float *get() const { return ptr_.get(); }
+
+private:
+  cudautils::device::unique_ptr<float[]> ptr_;
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
new file mode 100644
index
0000000000000..09a8fb844d4c9 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc new file mode 100644 index 0000000000000..30131f796a32c --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc @@ -0,0 +1,65 @@ +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include +#include +#include + +class TestCUDAProducerCPU: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerCPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerCPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; +private: + std::string label_; + edm::EDGetTokenT srcToken_; + edm::EDPutTokenT dstToken_; +}; + +TestCUDAProducerCPU::TestCUDAProducerCPU(const edm::ParameterSet& iConfig): + label_{iConfig.getParameter("@module_label")}, + dstToken_{produces()} +{ + auto srcTag = iConfig.getParameter("src"); + if(!srcTag.label().empty()) { + srcToken_ = consumes(srcTag); + } +} + +void TestCUDAProducerCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Optional source of another TestCUDAProducerCPU."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. 
It models a CPU algorithm."); +} + +void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce begin event " << iEvent.id().event() << " stream " << id; + + int input = 0; + if(!srcToken_.isUninitialized()) { + input = iEvent.get(srcToken_); + } + + std::random_device r; + std::mt19937 gen(r()); + auto dist = std::uniform_real_distribution<>(0.2, 1.5); + auto dur = dist(gen); + edm::LogVerbatim("TestCUDAProducerCPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << id << " will take " << dur << " seconds"; + std::this_thread::sleep_for(std::chrono::seconds(1)*dur); + + const unsigned int output = input + id*100 + iEvent.id().event(); + + iEvent.emplace(dstToken_, output); + + edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce end event " << iEvent.id().event() << " stream " << id << " result " << output; +} + +DEFINE_FWK_MODULE(TestCUDAProducerCPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc new file mode 100644 index 0000000000000..fc74b714f22c5 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -0,0 +1,54 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPU: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerGPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; +private: + std::string label_; + edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; + TestCUDAProducerGPUKernel gpuAlgo_; +}; + +TestCUDAProducerGPU::TestCUDAProducerGPU(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")), + srcToken_(consumes>(iConfig.getParameter("src"))), + dstToken_(produces>()) +{} + +void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Source of CUDAProduct."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first algorithm in the chain of the GPU EDProducers. 
Produces CUDAProduct."); +} + +void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in}; + const CUDAThing& input = ctx.get(in); + + ctx.emplace(iEvent, dstToken_, CUDAThing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())}); + + edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc new file mode 100644 index 0000000000000..b084ad8920c20 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -0,0 +1,74 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUEW: public edm::stream::EDProducer { +public: + explicit TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUEW() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; +private: + std::string label_; + edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; + TestCUDAProducerGPUKernel gpuAlgo_; + CUDAContextToken ctxTmp_; + cudautils::device::unique_ptr devicePtr_; + float hostData_ = 0.f; +}; + +TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig): + label_{iConfig.getParameter("@module_label")}, + srcToken_{consumes>(iConfig.getParameter("src"))}, + dstToken_{produces>()} +{} + +void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag()); + descriptions.addWithDefaultLabel(desc); +} + +void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; + const CUDAThing& input = ctx.get(in); + + devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); + // Mimick the need to transfer some of the GPU data back to CPU to + // be used for something within this module, or to be put in the + // event. 
+ cuda::memory::async::copy(&hostData_, devicePtr_.get()+10, sizeof(float), ctx.stream().id()); + + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + ctxTmp_ = ctx.toToken(); +} + +void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; + + CUDAScopedContext ctx{std::move(ctxTmp_)}; + + ctx.emplace(iEvent, dstToken_, std::move(devicePtr_)); + + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUEW); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc new file mode 100644 index 0000000000000..2c78e2730be8b --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -0,0 +1,50 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUFirst: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUFirst() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID stream, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; +private: + std::string label_; + TestCUDAProducerGPUKernel gpuAlgo_; +}; + +TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")) +{ + produces>(); +} + +void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers. 
Produces CUDAProduct<CUDAThing>."); +} + +void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + CUDAScopedContext ctx{streamID}; + + cudautils::device::unique_ptr<float[]> output = gpuAlgo_.runAlgo(label_, ctx.stream()); + iEvent.put(ctx.wrap(CUDAThing(std::move(output)))); + + edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUFirst); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu new file mode 100644 index 0000000000000..0bffb6656f31c --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -0,0 +1,112 @@ +#include "TestCUDAProducerGPUKernel.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +namespace { + template <typename T> + __global__ + void vectorAdd(const T *a, const T *b, T *c, int numElements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < numElements) { c[i] = a[i] + b[i]; } + } + + template <typename T> + __global__ + void vectorProd(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + int col = blockIdx.x*blockDim.x + threadIdx.x; + + if(row < numElements && col < numElements) { + c[row*numElements + col] = a[row]*b[col]; + } + } + + template <typename T> + __global__ + void matrixMul(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + int col = blockIdx.x*blockDim.x + threadIdx.x; + + if(row < numElements && col < numElements) { + T tmp = 0; + for(int i=0; i<numElements; ++i) { + tmp += a[row*numElements + i] * b[i*numElements + col]; + } + c[row*numElements + col] = tmp; + } + } + + template <typename T> + __global__ + void matrixMulVector(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + + if(row < numElements) { + T tmp = 0; + for(int i=0; i<numElements; ++i) { + tmp += a[row*numElements + i] * b[i]; + } + c[row] = tmp; + } + } +} + +cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const { + // First make the sanity check + if(d_input != nullptr) { + auto h_check = std::make_unique<float[]>(NUM_VALUES); + cuda::memory::copy(h_check.get(), d_input, NUM_VALUES*sizeof(float)); + for(int i=0; i<NUM_VALUES; ++i) { + if(h_check[i] != i) { + throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " << h_check[i]; + } + } + } + + edm::Service<CUDAService> cs; + + auto h_a = cs->make_host_unique<float[]>(NUM_VALUES, stream); + auto h_b = cs->make_host_unique<float[]>(NUM_VALUES, stream); + + for (auto i=0; i<NUM_VALUES; i++) { + h_a[i] = i; + h_b[i] = i*i; + } + + auto d_a = cs->make_device_unique<float[]>(NUM_VALUES, stream); + auto d_b = cs->make_device_unique<float[]>(NUM_VALUES, stream); + + cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES*sizeof(float), stream.id()); + cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES*sizeof(float), stream.id()); + + int threadsPerBlock {32}; + int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; + + auto d_c = cs->make_device_unique<float[]>(NUM_VALUES, stream); + auto current_device = cuda::device::current::get(); + edm::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU launching kernels device " << current_device.id() << " CUDA stream " << stream.id(); + vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream.id()>>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); + + auto d_ma = cs->make_device_unique<float[]>(NUM_VALUES*NUM_VALUES, stream); + auto d_mb = cs->make_device_unique<float[]>(NUM_VALUES*NUM_VALUES, stream);
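  // The three NUM_VALUES x NUM_VALUES matrices allocated here and just
  // below are deliberate busywork: vectorProd fills d_ma and d_mb from the
  // input vectors, and the matrixMul/matrixMulVector launches that follow
  // grind through them so that the asynchronous execution shows up clearly
  // in the debug printouts.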
+ auto d_mc = cs->make_device_unique<float[]>(NUM_VALUES*NUM_VALUES, stream); + dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES}; + dim3 blocksPerGrid3{1,1}; + if(NUM_VALUES*NUM_VALUES > 32) { + threadsPerBlock3.x = 32; + threadsPerBlock3.y = 32; + blocksPerGrid3.x = ceil(double(NUM_VALUES)/double(threadsPerBlock3.x)); + blocksPerGrid3.y = ceil(double(NUM_VALUES)/double(threadsPerBlock3.y)); + } + vectorProd<<<blocksPerGrid3, threadsPerBlock3, 0, stream.id()>>>(d_a.get(), d_b.get(), d_ma.get(), NUM_VALUES); + vectorProd<<<blocksPerGrid3, threadsPerBlock3, 0, stream.id()>>>(d_a.get(), d_c.get(), d_mb.get(), NUM_VALUES); + matrixMul<<<blocksPerGrid3, threadsPerBlock3, 0, stream.id()>>>(d_ma.get(), d_mb.get(), d_mc.get(), NUM_VALUES); + + matrixMulVector<<<blocksPerGrid, threadsPerBlock, 0, stream.id()>>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES); + + edm::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU kernels launched, returning device pointer, device " << current_device.id() << " CUDA stream " << stream.id(); + return d_a; +} diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h new file mode 100644 index 0000000000000..d03c4c1e0dc70 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h @@ -0,0 +1,31 @@ +#ifndef HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h +#define HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +#include <cuda/api_wrappers.h> + +/** + * This class models the actual CUDA implementation of an algorithm. + * + * Memory is allocated dynamically with the allocator in CUDAService. + * + * The algorithm is intended to waste time with large matrix + * operations so that the asynchronous nature of the CUDA integration + * becomes visible with debug prints. + */ +class TestCUDAProducerGPUKernel { +public: + static constexpr int NUM_VALUES = 4000; + + TestCUDAProducerGPUKernel() = default; + ~TestCUDAProducerGPUKernel() = default; + + // returns (owning) pointer to device memory + cudautils::device::unique_ptr<float[]> runAlgo(const std::string& label, cuda::stream_t<>& stream) const { + return runAlgo(label, nullptr, stream); + } + cudautils::device::unique_ptr<float[]> runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const; +}; + +#endif diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc new file mode 100644 index 0000000000000..b5653d0af366d --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -0,0 +1,75 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUtoCPU: public edm::stream::EDProducer<edm::ExternalWork> { +public: + explicit TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUtoCPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; +private: + std::string label_; + edm::EDGetTokenT<CUDAProduct<CUDAThing>> srcToken_; + edm::EDPutTokenT<int> dstToken_; + cudautils::host::unique_ptr<float[]> buffer_; +}; + +TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig): + label_{iConfig.getParameter<std::string>("@module_label")}, + srcToken_{consumes<CUDAProduct<CUDAThing>>(iConfig.getParameter<edm::InputTag>("src"))}, + dstToken_{produces<int>()} +{} + +void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Source for CUDAProduct<CUDAThing>."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models the GPU->CPU data transfer and formatting of the data to legacy data format. Produces int, to be compatible with TestCUDAProducerCPU."); +} + +void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; + const CUDAThing& device = ctx.get(in); + + edm::Service<CUDAService> cs; + buffer_ = cs->make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); + // Enqueue async copy, continue in produce once finished + cuda::memory::async::copy(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES*sizeof(float), ctx.stream().id()); + + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + int counter = 0; + for(int i=0; i<TestCUDAProducerGPUKernel::NUM_VALUES; ++i) { + counter += buffer_[i]; + } + buffer_.reset(); + + iEvent.emplace(dstToken_, counter); + + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << counter; +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUtoCPU); diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml new file mode 100644 index 0000000000000..3287d65c14470 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/HeterogeneousCore/CUDATest/test/TestCUDATest.cc b/HeterogeneousCore/CUDATest/test/TestCUDATest.cc new file mode 100644 index 0000000000000..b2991bd18ae57 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/TestCUDATest.cc @@ -0,0 +1,3 @@ +#include "FWCore/Utilities/interface/TestHelper.h" + +RUNTEST() diff --git a/HeterogeneousCore/CUDATest/test/runtests.sh b/HeterogeneousCore/CUDATest/test/runtests.sh new file mode 100755 index 0000000000000..6817aa8d7ffab --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/runtests.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +function die { echo Failure $1: status $2 ; exit $2 ; } + +pushd ${LOCAL_TMP_DIR} + + echo "*************************************************" + echo "CUDA producer configuration with SwitchProducer" + cmsRun ${LOCAL_TEST_DIR}/testCUDASwitch_cfg.py || die "cmsRun testCUDASwitch_cfg.py 1" $?
+ +popd diff --git a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py new file mode 100644 index 0000000000000..8bac73608065d --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py @@ -0,0 +1,92 @@ +import FWCore.ParameterSet.Config as cms + +silent = True +#silent = False + +from Configuration.ProcessModifiers.gpu_cff import gpu +process = cms.Process("Test") +process.load("FWCore.MessageService.MessageLogger_cfi") +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(3) ) +if not silent: + process.maxEvents.input = 10 + process.MessageLogger.cerr.threshold = cms.untracked.string("INFO") + process.MessageLogger.cerr.INFO.limit = process.MessageLogger.cerr.default.limit + + +process.options = cms.untracked.PSet( +# numberOfThreads = cms.untracked.uint32(4), + numberOfStreams = cms.untracked.uint32(0) +) +#process.Tracer = cms.Service("Tracer") + +# Flow diagram of the modules +# +# 1 5 +# / \ | +# 2 4 6 +# | +# 3 + +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + +process.load("HeterogeneousCore.CUDATest.prod1Switch_cff") +process.load("HeterogeneousCore.CUDATest.prod5Switch_cff") +process.load("HeterogeneousCore.CUDATest.prod6Switch_cff") + +# GPU producers +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst +from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU +from HeterogeneousCore.CUDATest.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU + +process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") +process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") +process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") + +# CPU producers, switched with modules to copy data from GPU to CPU +# (as "on demand" as any other EDProducer, i.e. according to +# consumes() and prefetching). If a separate conversion step is needed +# to get the same data formats as the CPU modules, those are then ones +# that should be replaced-with here. 
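# (Hedged note) SwitchProducerCUDA declares one alternative per technology;
# when the job starts, the "cuda" branch is selected if the CUDAService
# reports a usable GPU, and the "cpu" branch otherwise. Downstream modules
# consume only the switch labels (prod2, prod3, prod4) and are insulated
# from that choice.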
+from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU +process.prod2 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod1"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA") +) +process.prod3 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod2"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA") +) +process.prod4 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod1"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA") +) + +process.out = cms.OutputModule("AsciiOutputModule", + outputCommands = cms.untracked.vstring( + "keep *_prod3_*_*", + "keep *_prod4_*_*", + "keep *_prod5_*_*", + ), + verbosity = cms.untracked.uint32(0), +) + +process.prod2Task = cms.Task(process.prod2, process.prod2CUDA) +process.prod3Task = cms.Task(process.prod3, process.prod3CUDA) +process.prod4Task = cms.Task(process.prod4, process.prod4CUDA) + +process.t = cms.Task( + process.prod1Task, + process.prod2Task, + process.prod3Task, + process.prod4Task, + process.prod5Task, + process.prod6Task +) +process.p = cms.Path() +process.p.associate(process.t) +process.ep = cms.EndPath(process.out) diff --git a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py new file mode 100644 index 0000000000000..5cb6c678402b4 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py @@ -0,0 +1,79 @@ +import FWCore.ParameterSet.Config as cms + +enableGPU = True + +from Configuration.ProcessModifiers.gpu_cff import gpu +process = cms.Process("Test", gpu) if enableGPU else cms.Process("Test") +process.load("FWCore.MessageService.MessageLogger_cfi") +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(10) ) + +process.options = cms.untracked.PSet( +# numberOfThreads = cms.untracked.uint32(4), + numberOfStreams = cms.untracked.uint32(0) +) +#process.Tracer = cms.Service("Tracer") + +# Flow diagram of the modules +# +# 1 5 +# / \ | +# 2 4 6 +# | +# 3 + +process.load("HeterogeneousCore.CUDATest.prod1_cff") +process.load("HeterogeneousCore.CUDATest.prod5_cff") +process.load("HeterogeneousCore.CUDATest.prod6_cff") + +# CPU producers +from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU +process.prod2 = testCUDAProducerCPU.clone(src = "prod1") +process.prod3 = testCUDAProducerCPU.clone(src = "prod2") +process.prod4 = testCUDAProducerCPU.clone(src = "prod1") + +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst +from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU +from HeterogeneousCore.CUDATest.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU + +# GPU producers +process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") +process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") +process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") + +# Modules to copy data from GPU to CPU (as "on demand" as any other +# EDProducer, i.e. according to consumes() and prefetching). If a +# separate conversion step is needed to get the same data formats as +# the CPU modules, those are then ones that should be replaced-with here. 
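# (Hedged note) In contrast to the SwitchProducer configuration above,
# gpu.toReplaceWith rewires the process statically: the replacements below
# take effect only when the "gpu" process modifier is active, so this same
# file runs CPU-only by default.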
+gpu.toReplaceWith(process.prod2, testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA")) +gpu.toReplaceWith(process.prod3, testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA")) +gpu.toReplaceWith(process.prod4, testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA")) + +process.out = cms.OutputModule("AsciiOutputModule", + outputCommands = cms.untracked.vstring( + "keep *_prod3_*_*", + "keep *_prod4_*_*", + "keep *_prod5_*_*", + ), + verbosity = cms.untracked.uint32(0), +) + +process.prod2Task = cms.Task(process.prod2, process.prod2CUDA) +process.prod3Task = cms.Task(process.prod3, process.prod3CUDA) +process.prod4Task = cms.Task(process.prod4, process.prod4CUDA) + +process.t = cms.Task( + process.prod1Task, + process.prod2Task, + process.prod3Task, + process.prod4Task, + process.prod5Task, + process.prod6Task +) +process.p = cms.Path() +process.p.associate(process.t) +process.ep = cms.EndPath(process.out) diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc new file mode 100644 index 0000000000000..f19bd24813fbf --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -0,0 +1,91 @@ +#include "catch.hpp" +#include "FWCore/TestProcessor/interface/TestProcessor.h" +#include "FWCore/Utilities/interface/Exception.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +#include + +static constexpr auto s_tag = "[TestCUDAProducerGPUFirst]"; + +TEST_CASE("Standard checks of TestCUDAProducerGPUFirst", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") +process.moduleToTest(process.toTest) +)_" + }; + + edm::test::TestProcessor::Config config{ baseConfig }; + SECTION("base configuration is OK") { + REQUIRE_NOTHROW(edm::test::TestProcessor(config)); + } + + SECTION("No event data") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.test()); + } + + SECTION("beginJob and endJob only") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); + } + + SECTION("Run with no LuminosityBlocks") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); + } + + SECTION("LuminosityBlock with no Events") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); + } + +} + +TEST_CASE("TestCUDAProducerGPUFirst operation", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") +process.moduleToTest(process.toTest) +)_" + }; + edm::test::TestProcessor::Config config{ baseConfig }; + + exitSansCUDADevices(); + + constexpr int defaultDevice = 0; + + SECTION("Produce") { + edm::test::TestProcessor tester{config}; + auto event = tester.test(); + auto prod = event.get >(); + REQUIRE(prod->device() == defaultDevice); + auto ctx = CUDAScopedContext(*prod); + const CUDAThing& thing = ctx.get(*prod); + const float *data = thing.get(); + REQUIRE(data != nullptr); + + float 
firstElements[10]; + cuda::memory::async::copy(firstElements, data, sizeof(float)*10, prod->stream().id()); + + std::cout << "Synchronizing with CUDA stream" << std::endl; + auto stream = prod->stream(); + stream.synchronize(); + std::cout << "Synchronized" << std::endl; + REQUIRE(firstElements[0] == 0.f); + REQUIRE(firstElements[1] == 1.f); + REQUIRE(firstElements[9] == 9.f); + } +}; diff --git a/HeterogeneousCore/CUDATest/test/test_main.cc b/HeterogeneousCore/CUDATest/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h new file mode 100644 index 0000000000000..fa0db60aea592 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -0,0 +1,43 @@ +#ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h +#define HeterogeneousCore_CUDAUtilities_copyAsync_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +#include <cuda/api_wrappers.h> + +#include <type_traits> + +namespace cudautils { + // Single element + template <typename T> + inline + void copyAsync(cudautils::device::unique_ptr<T>& dst, const cudautils::host::unique_ptr<T>& src, cuda::stream_t<>& stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); + } + + template <typename T> + inline + void copyAsync(cudautils::host::unique_ptr<T>& dst, const cudautils::device::unique_ptr<T>& src, cuda::stream_t<>& stream) { + static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); + } + + // Multiple elements + template <typename T> + inline + void copyAsync(cudautils::device::unique_ptr<T[]>& dst, const cudautils::host::unique_ptr<T[]>& src, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); + } + + template <typename T> + inline + void copyAsync(cudautils::host::unique_ptr<T[]>& dst, const cudautils::device::unique_ptr<T[]>& src, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h new file mode 100644 index 0000000000000..06a0424450983 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h @@ -0,0 +1,27 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h + +#include <memory> +#include <functional> + +namespace cudautils { + namespace device { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class DeviceDeleter { + public: + DeviceDeleter() = default; + explicit DeviceDeleter(std::function<void(void*)> f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function<void(void*)> f_; + }; + } + + template <typename T> + using unique_ptr = std::unique_ptr<T, impl::DeviceDeleter>; + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h new file mode 100644 index 0000000000000..2a39c475cbb91 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h @@ -0,0 +1,27 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h + +#include <memory> +#include <functional> + +namespace cudautils { + namespace host { + namespace impl { + // Additional layer of types to distinguish from device::unique_ptr + class HostDeleter { + public: + HostDeleter() = default; + explicit HostDeleter(std::function<void(void*)> f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function<void(void*)> f_; + }; + } + + template <typename T> + using unique_ptr = std::unique_ptr<T, impl::HostDeleter>; + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h new file mode 100644 index 0000000000000..d87c50b666b61 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -0,0 +1,33 @@ +#ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h +#define HeterogeneousCore_CUDAUtilities_memsetAsync_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +#include <cuda/api_wrappers.h> + +#include <type_traits> + +namespace cudautils { + template <typename T> + inline + void memsetAsync(cudautils::device::unique_ptr<T>& ptr, T value, cuda::stream_t<>& stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream.id()); + } + + /** + * The type of `value` is `int` because `cudaMemsetAsync()` takes + * it as an `int`. Note that `cudaMemsetAsync()` sets the value of + * each **byte** to `value`. This may lead to unexpected results if + * `sizeof(T) > 1` and `value != 0`.
+ */ + template + inline + void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::device::async::set(ptr.get(), value, nelements*sizeof(T), stream.id()); + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index d3f8d77a35d32..f6d6a4bb7e594 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -66,3 +66,7 @@ + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp new file mode 100644 index 0000000000000..adfe833c5dcb4 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -0,0 +1,132 @@ +#include "catch.hpp" + +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +namespace { + CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { + auto desc = edm::ConfigurationDescriptions("Service", "CUDAService"); + CUDAService::fillDescriptions(desc); + desc.validate(ps, "CUDAService"); + return CUDAService(ps, ar); + } +} + +TEST_CASE("copyAsync", "[cudaMemTools]") { + exitSansCUDADevices(); + + edm::ActivityRegistry ar; + edm::ParameterSet ps; + auto cs = makeCUDAService(ps, ar); + + auto current_device = cuda::device::current::get(); + auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Host to device") { + SECTION("Single element") { + auto host_orig = cs.make_host_unique(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + + cudautils::copyAsync(device, host_orig, stream); + cuda::memory::async::copy(host.get(), device.get(), sizeof(int), stream.id()); + stream.synchronize(); + + REQUIRE(*host == 42); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + + SECTION("Copy all") { + cudautils::copyAsync(device, host_orig, N, stream); + cuda::memory::async::copy(host.get(), device.get(), N*sizeof(int), stream.id()); + stream.synchronize(); + for(int i=0; i(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + + cuda::memory::async::copy(device.get(), host_orig.get(), sizeof(int), stream.id()); + cudautils::copyAsync(host, device, stream); + stream.synchronize(); + + REQUIRE(*host == 42); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + + SECTION("Copy all") { + cuda::memory::async::copy(device.get(), host_orig.get(), N*sizeof(int), stream.id()); + cudautils::copyAsync(host, device, N, stream); + stream.synchronize(); + for(int i=0; i(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + cudautils::copyAsync(device, host_orig, stream); + cudautils::memsetAsync(device, 0, stream); + 
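      // Zero is the safe choice here: as documented in memsetAsync.h the
      // fill is byte-wise, so value = 1 on an int buffer would yield
      // 0x01010101 (16843009) per element rather than 1; only 0 (or -1)
      // gives an obvious pattern for multi-byte types. The device-to-host
      // copy that follows is queued on the same stream, so after
      // synchronize() the host buffer must hold the zeroed values.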
cudautils::copyAsync(host, device, stream); + stream.synchronize(); + + REQUIRE(*host == 0); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + cudautils::copyAsync(device, host_orig, N, stream); + cudautils::memsetAsync(device, 0, N, stream); + cudautils::copyAsync(host, device, N, stream); + stream.synchronize(); + + for(int i=0; i < N; ++i) { + CHECK(host[i] == 0); + } + } + + //Fake the end-of-job signal. + ar.postEndJobSignal_(); +} + diff --git a/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp b/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py index b75e75e000d48..a486a83d178f4 100644 --- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py +++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py @@ -9,11 +9,11 @@ from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import * from RecoLocalTracker.SiStripZeroSuppression.SiStripZeroSuppression_cfi import * from RecoLocalTracker.SiStripClusterizer.SiStripClusterizer_cfi import * -from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import * +from RecoLocalTracker.SiPixelClusterizer.siPixelClustersPreSplitting_cff import * from RecoLocalTracker.SiPixelRecHits.SiPixelRecHits_cfi import * from RecoLocalTracker.SubCollectionProducers.clustersummaryproducer_cfi import * -pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplitting,siPixelRecHitsPreSplitting) +pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplitting) striptrackerlocalrecoTask = cms.Task(siStripZeroSuppression,siStripClusters,siStripMatchedRecHits) trackerlocalrecoTask = cms.Task(pixeltrackerlocalrecoTask,striptrackerlocalrecoTask,clusterSummaryProducer) diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index e5e5f41053e3d..f0d996bd7310b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -1,10 +1,9 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" #include @@ -30,7 +29,7 @@ class SiPixelFedCablingMapGPUWrapper { // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cuda::stream_t<>& cudaStream) const; - edm::cuda::device::unique_ptr getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const; + cudautils::device::unique_ptr getModToUnpRegionalAsync(std::set const& modules, 
cuda::stream_t<>& cudaStream) const; private: const SiPixelFedCablingMap *cablingMap_; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc new file mode 100644 index 0000000000000..4c405a8c85afd --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc @@ -0,0 +1,158 @@ +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetId.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" + +namespace { + struct AccretionCluster { + typedef unsigned short UShort; + static constexpr UShort MAXSIZE = 256; + UShort adc[MAXSIZE]; + UShort x[MAXSIZE]; + UShort y[MAXSIZE]; + UShort xmin=16000; + UShort ymin=16000; + unsigned int isize=0; + int charge=0; + + void clear() { + isize=0; + charge=0; + xmin=16000; + ymin=16000; + } + + bool add(SiPixelCluster::PixelPos const & p, UShort const iadc) { + if (isize==MAXSIZE) return false; + xmin=std::min(xmin,(unsigned short)(p.row())); + ymin=std::min(ymin,(unsigned short)(p.col())); + adc[isize]=iadc; + x[isize]=p.row(); + y[isize++]=p.col(); + charge+=iadc; + return true; + } + }; + + constexpr uint32_t dummydetid = 0xffffffff; +} + +class SiPixelDigisClustersFromSoA: public edm::global::EDProducer<> { +public: + explicit SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelDigisClustersFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT digiGetToken_; + + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT clusterPutToken_; + +}; + +SiPixelDigisClustersFromSoA::SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig): + digiGetToken_(consumes(iConfig.getParameter("src"))), + digiPutToken_(produces>()), + clusterPutToken_(produces()) +{} + +void SiPixelDigisClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelDigisSoA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + const auto& digis = iEvent.get(digiGetToken_); + + edm::ESHandle trackerTopologyHandle; + iSetup.get().get(trackerTopologyHandle); + const auto& ttopo = *trackerTopologyHandle; + + auto collection = std::make_unique>(); + auto outputClusters = std::make_unique(); + + const uint32_t nDigis = digis.size(); + edm::DetSet * detDigis=nullptr; + for (uint32_t i = 0; i < nDigis; i++) { + if (digis.pdigi(i)==0) continue; + detDigis = &collection->find_or_insert(digis.rawIdArr(i)); 
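    // This first pass only primes detDigis: it locates the first genuine
    // digi, creates the corresponding DetSet (the next lines reserve some
    // space and break out of the loop). The full loop over all digis below
    // re-seeds detDigis whenever the raw detector id changes.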
+ if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations + break; + } + + int32_t nclus=-1; + std::vector aclusters(1024); + auto totCluseFilled=0; + + auto fillClusters = [&](uint32_t detId){ + if (nclus<0) return; // this in reality should never happen + edmNew::DetSetVector::FastFiller spc(*outputClusters, detId); + auto layer = (DetId(detId).subdetId()==1) ? ttopo.pxbLayer(detId) : 0; + auto clusterThreshold = (layer==1) ? 2000 : 4000; + for (int32_t ic=0; ic9000) continue; // not in cluster; TODO add an assert for the size + assert(digis.rawIdArr(i) > 109999); + if ( (*detDigis).detId() != digis.rawIdArr(i)) + { + fillClusters((*detDigis).detId()); + assert(nclus==-1); + detDigis = &collection->find_or_insert(digis.rawIdArr(i)); + if ( (*detDigis).empty() ) + (*detDigis).data.reserve(32); // avoid the first relocations + else { std::cout << "Problem det present twice in input! " << (*detDigis).detId() << std::endl; } + } + (*detDigis).data.emplace_back(digis.pdigi(i)); + auto const & dig = (*detDigis).data.back(); + // fill clusters + assert(digis.clus(i)>=0); + assert(digis.clus(i)<1024); + nclus = std::max(digis.clus(i),nclus); + auto row = dig.row(); + auto col = dig.column(); + SiPixelCluster::PixelPos pix(row,col); + aclusters[digis.clus(i)].add(pix, digis.adc(i)); + } + + // fill final clusters + fillClusters((*detDigis).detId()); + //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; + + iEvent.put(digiPutToken_, std::move(collection)); + iEvent.put(clusterPutToken_, std::move(outputClusters)); +} + +DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoA); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc new file mode 100644 index 0000000000000..5dc04009f4832 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -0,0 +1,243 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" +#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/FEDRawData/interface/FEDNumbering.h" +#include "DataFormats/FEDRawData/interface/FEDRawData.h" +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include 
"FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" +#include "RecoTracker/Record/interface/CkfComponentsRecord.h" + +#include "SiPixelRawToClusterGPUKernel.h" + +#include +#include +#include + +class SiPixelRawToClusterCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig); + ~SiPixelRawToClusterCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT rawGetToken_; + + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> digiErrorPutToken_; + edm::EDPutTokenT> clusterPutToken_; + + CUDAContextToken ctxTmp_; + + edm::ESWatcher recordWatcher; + + std::string cablingMapLabel_; + std::unique_ptr cabling_; + std::vector fedIds_; + const SiPixelFedCablingMap *cablingMap_ = nullptr; + std::unique_ptr regions_; + + pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; + PixelDataFormatter::Errors errors_; + + const bool includeErrors_; + const bool useQuality_; + const bool usePilotBlade_; + const bool convertADCtoElectrons_; +}; + +SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig): + rawGetToken_(consumes(iConfig.getParameter("InputLabel"))), + digiPutToken_(produces>()), + clusterPutToken_(produces>()), + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + includeErrors_(iConfig.getParameter("IncludeErrors")), + useQuality_(iConfig.getParameter("UseQualityInfo")), + usePilotBlade_(iConfig.getParameter ("UsePilotBlade")), // Control the usage of pilot-blade data, FED=40 + convertADCtoElectrons_(iConfig.getParameter("ConvertADCtoElectrons")) +{ + if(includeErrors_) { + digiErrorPutToken_ = produces>(); + } + + // regions + if(!iConfig.getParameter("Regions").getParameterNames().empty()) { + regions_ = std::make_unique(iConfig, consumesCollector()); + } + + if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; +} + +void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("IncludeErrors",true); + desc.add("UseQualityInfo",false); + desc.add("UsePilotBlade",false)->setComment("## Use pilot blades"); + desc.add("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering"); + desc.add("InputLabel",edm::InputTag("rawDataCollector")); + { + edm::ParameterSetDescription psd0; + psd0.addOptional>("inputs"); + psd0.addOptional>("deltaPhi"); + psd0.addOptional>("maxZ"); + psd0.addOptional("beamSpot"); + desc.add("Regions",psd0)->setComment("## Empty Regions PSet means complete unpacking"); + } + desc.add("CablingMapLabel","")->setComment("CablingMap label"); //Tav + descriptions.addWithDefaultLabel(desc); +} + + +void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + edm::ESHandle hgpuMap; + iSetup.get().get(hgpuMap); + 
if(hgpuMap->hasQuality() != useQuality_) { + throw cms::Exception("LogicError") << "UseQuality of the module (" << useQuality_ << ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."; + } + // get the GPU product already here so that the async transfer can begin + const auto *gpuMap = hgpuMap->getGPUProductAsync(ctx.stream()); + + edm::ESHandle hgains; + iSetup.get().get(hgains); + // get the GPU product already here so that the async transfer can begin + const auto *gpuGains = hgains->getGPUProductAsync(ctx.stream()); + + cudautils::device::unique_ptr modulesToUnpackRegional; + const unsigned char *gpuModulesToUnpack; + + if(regions_) { + regions_->run(iEvent, iSetup); + LogDebug("SiPixelRawToCluster") << "region2unpack #feds: "<nFEDs(); + LogDebug("SiPixelRawToCluster") << "region2unpack #modules (BPIX,EPIX,total): "<nBarrelModules()<<" "<nForwardModules()<<" "<nModules(); + modulesToUnpackRegional = hgpuMap->getModToUnpRegionalAsync(*(regions_->modulesToUnpack()), ctx.stream()); + gpuModulesToUnpack = modulesToUnpackRegional.get(); + } + else { + gpuModulesToUnpack = hgpuMap->getModToUnpAllAsync(ctx.stream()); + } + + // initialize cabling map or update if necessary + if (recordWatcher.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); //Tav + cablingMap_ = cablingMap.product(); + fedIds_ = cablingMap->fedIds(); + cabling_ = cablingMap->cablingTree(); + LogDebug("map version:")<< cabling_->version(); + } + + const auto& buffers = iEvent.get(rawGetToken_); + + errors_.clear(); + + // GPU specific: Data extraction for RawToDigi GPU + unsigned int wordCounterGPU = 0; + unsigned int fedCounter = 0; + bool errorsInEvent = false; + + // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() + ErrorChecker errorcheck; + auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream()); + for(int fedId: fedIds_) { + if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data + if (regions_ && !regions_->mayUnpackFED(fedId)) continue; + + // for GPU + // first 150 index stores the fedId and next 150 will store the + // start index of word in that fed + assert(fedId>=1200); + fedCounter++; + + // get event data for this fed + const FEDRawData& rawData = buffers.FEDData( fedId ); + + // GPU specific + int nWords = rawData.size()/sizeof(cms_uint64_t); + if (nWords == 0) { + continue; + } + + // check CRC bit + const cms_uint64_t* trailer = reinterpret_cast(rawData.data())+(nWords-1); + if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) { + continue; + } + + // check headers + const cms_uint64_t* header = reinterpret_cast(rawData.data()); header--; + bool moreHeaders = true; + while (moreHeaders) { + header++; + bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_); + moreHeaders = headerStatus; + } + + // check trailers + bool moreTrailers = true; + trailer++; + while (moreTrailers) { + trailer--; + bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_); + moreTrailers = trailerStatus; + } + + const cms_uint32_t * bw = (const cms_uint32_t *)(header+1); + const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); + + assert(0 == (ew-bw)%2); + wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + wordCounterGPU+=(ew-bw); + + } // end of for 
loop + + gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains, + wordFedAppender, + std::move(errors_), + wordCounterGPU, fedCounter, convertADCtoElectrons_, + useQuality_, includeErrors_, + edm::MessageDrop::instance()->debugEnabled, + ctx.stream()); + + ctxTmp_ = ctx.toToken(); +} + +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + CUDAScopedContext ctx{std::move(ctxTmp_)}; + + auto tmp = gpuAlgo_.getResults(); + ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); + ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); + if(includeErrors_) { + ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + } +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelRawToClusterCUDA); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 1388ed4852b25..fead8e59a0db3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -44,13 +44,8 @@ namespace pixelgpudetails { - // data structures size - constexpr uint32_t vsize = sizeof(GPU::SimpleVector); - constexpr uint32_t esize = sizeof(pixelgpudetails::error_obj); - // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { edm::Service cs; @@ -397,7 +392,7 @@ namespace pixelgpudetails { const uint32_t wordCounter, const uint32_t *word, const uint8_t *fedIds, uint16_t *xx, uint16_t *yy, uint16_t *adc, uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId, - GPU::SimpleVector *err, + GPU::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) { //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, begin, end); @@ -432,7 +427,7 @@ namespace pixelgpudetails { if (includeErrors and skipROC) { uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); - err->push_back(pixelgpudetails::error_obj{rID, ww, errorType, fedId}); + err->push_back(PixelErrorCompact{rID, ww, errorType, fedId}); continue; } @@ -476,7 +471,7 @@ namespace pixelgpudetails { if (includeErrors) { if (not rocRowColIsValid(row, col)) { uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays - err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId}); + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); if(debug) printf("BPIX1 Error status: %i\n", error); continue; } @@ -491,7 +486,7 @@ namespace pixelgpudetails { localPix.col = col; if (includeErrors and not dcolIsValid(dcol, pxid)) { uint8_t error = conversionError(fedId, 3, debug); - err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId}); + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); continue; } @@ -514,19 +509,22 @@ namespace pixelgpudetails { const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, + bool useQualityInfo, bool 
includeErrors, bool debug, cuda::stream_t<>& stream) { nDigis = wordCounter; - constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); - clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); + digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream); + if(includeErrors) { + digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); + } + clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); edm::Service cs; - digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream); + nModules_Clusters_h = cs->make_host_unique(2, stream); { const int threadsPerBlock = 512; @@ -537,20 +535,8 @@ namespace pixelgpudetails { auto word_d = cs->make_device_unique(wordCounter, stream); auto fedId_d = cs->make_device_unique(wordCounter, stream); - auto error_d = cs->make_device_unique>(stream); - auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); - auto error_h_tmp = cs->make_host_unique>(stream); - GPU::make_SimpleVector(error_h_tmp.get(), MAX_FED_WORDS, data_d.get()); - assert(error_h_tmp->size() == 0); - assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); - cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); - - auto pdigi_d = cs->make_device_unique(wordCounter, stream); - auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); // Launch rawToDigi kernel RawToDigi_kernel<<>>( @@ -560,43 +546,17 @@ namespace pixelgpudetails { word_d.get(), fedId_d.get(), digis_d.xx(), digis_d.yy(), digis_d.adc(), - pdigi_d.get(), - rawIdArr_d.get(), + digis_d.pdigi(), + digis_d.rawIdArr(), digis_d.moduleInd(), - error_d.get(), + digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors, debug); cudaCheck(cudaGetLastError()); - // copy data to host variable - if(transferToCPU) { - digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream); - digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - digis_clusters_h.data = cs->make_host_unique(MAX_FED_WORDS, stream); - digis_clusters_h.error = cs->make_host_unique>(stream); - GPU::make_SimpleVector(digis_clusters_h.error.get(), MAX_FED_WORDS, digis_clusters_h.data.get()); - assert(digis_clusters_h.error->size() == 0); - assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS)); - - cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. 
A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). - // - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = digis_clusters_h.error->size(); - //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id())); - } + if(includeErrors) { + digiErrors_d.copyErrorToHostAsync(stream); } } // End of Raw2Digi and passing data for cluserisation @@ -614,12 +574,6 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); - // calibrated adc - if(transferToCPU) { - digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - } - #ifdef GPU_DEBUG std::cout << "CUDA countModules kernel launch with " << blocks @@ -628,11 +582,11 @@ namespace pixelgpudetails { cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), wordCounter); + countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -646,7 +600,7 @@ namespace pixelgpudetails { digis_d.c_xx(), digis_d.c_yy(), clusters_d.c_moduleStart(), clusters_d.clusInModule(), clusters_d.moduleId(), - clusters_d.clus(), + digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -656,12 +610,11 @@ namespace pixelgpudetails { digis_d.c_adc(), clusters_d.c_moduleStart(), clusters_d.clusInModule(), clusters_d.c_moduleId(), - clusters_d.clus(), + digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); - // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of @@ -681,15 +634,7 @@ namespace pixelgpudetails { clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - - // clusters - if(transferToCPU) { - digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - } + cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } // end clusterizer scope } - } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 
44bed9abc1e68..1ab8bc3fa5998 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -5,10 +5,13 @@ #include #include "cuda/api_wrappers.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "siPixelRawToClusterHeterogeneousProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" struct SiPixelFedCablingMapGPU; class SiPixelGainForHLTonGPU; @@ -152,34 +155,8 @@ namespace pixelgpudetails { } - using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj; - - class SiPixelRawToClusterGPUKernel { public: - - using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - - struct CPUData { - CPUData() = default; - ~CPUData() = default; - - CPUData(const CPUData&) = delete; - CPUData& operator=(const CPUData&) = delete; - CPUData(CPUData&&) = default; - CPUData& operator=(CPUData&&) = default; - - edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product - - edm::cuda::host::unique_ptr data; - edm::cuda::host::unique_ptr> error; - - edm::cuda::host::unique_ptr pdigi; - edm::cuda::host::unique_ptr rawIdArr; - edm::cuda::host::unique_ptr adc; - edm::cuda::host::unique_ptr clus; - }; - class WordFedAppender { public: WordFedAppender(cuda::stream_t<>& cudaStream); @@ -191,8 +168,8 @@ namespace pixelgpudetails { const unsigned char *fedId() const { return fedId_.get(); } private: - edm::cuda::host::unique_ptr word_; - edm::cuda::host::unique_ptr fedId_; + cudautils::host::unique_ptr word_; + cudautils::host::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; @@ -207,62 +184,38 @@ namespace pixelgpudetails { void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, + bool useQualityInfo, bool includeErrors, bool debug, cuda::stream_t<>& stream); - siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { - return siPixelRawToClusterHeterogeneousProduct::GPUProduct( - std::move(digis_d), std::move(clusters_d), - nDigis, - digis_clusters_h.nModules_Clusters[0], - digis_clusters_h.nModules_Clusters[1] - ); + std::pair getResults() { + digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis); + clusters_d.setNClusters(nModules_Clusters_h[1]); + // need to explicitly deallocate while the associated CUDA + // stream is still alive + // + // technically the statement above is not true anymore now that + // the CUDA streams are cached within the CUDAService, but it is + // still better to release as early as possible + nModules_Clusters_h.reset(); + return std::make_pair(std::move(digis_d), std::move(clusters_d)); } - CPUData&& getCPUData() { - // Set the vector data pointer to point to CPU - digis_clusters_h.error->set_data(digis_clusters_h.data.get()); - return 
std::move(digis_clusters_h); + SiPixelDigiErrorsCUDA&& getErrors() { + return std::move(digiErrors_d); } private: uint32_t nDigis = 0; - // CPU data - CPUData digis_clusters_h; - // Data to be put in the event + cudautils::host::unique_ptr nModules_Clusters_h; SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; + SiPixelDigiErrorsCUDA digiErrors_d; }; - // configuration and memory buffers allocated on the GPU - struct context { - uint32_t * word_d; - uint8_t * fedId_d; - uint32_t * pdigi_d; - uint16_t * xx_d; - uint16_t * yy_d; - uint16_t * adc_d; - uint16_t * moduleInd_d; - uint32_t * rawIdArr_d; - - GPU::SimpleVector * error_d; - error_obj * data_d; - - // these are for the clusterizer (to be moved) - uint32_t * moduleStart_d; - int32_t * clus_d; - uint32_t * clusInModule_d; - uint32_t * moduleId_d; - uint32_t * debug_d; - }; - - // void initCablingMap(); - context initDeviceMemory(); - void freeMemory(context &); - // see RecoLocalTracker/SiPixelClusterizer // all are runtime const, should be specified in python _cfg.py struct ADCThreshold { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc deleted file mode 100644 index 905bc297b394d..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ /dev/null @@ -1,739 +0,0 @@ -// C++ includes -#include -#include -#include -#include - -// CUDA includes -#include -#include - -// CMSSW includes -#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" -#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" -#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTService.h" -#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" -#include "CondFormats/DataRecord/interface/SiPixelQualityRcd.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelQuality.h" -#include "DataFormats/Common/interface/DetSetVector.h" -#include "DataFormats/Common/interface/Handle.h" -#include "DataFormats/DetId/interface/DetIdCollection.h" -#include "DataFormats/FEDRawData/interface/FEDNumbering.h" -#include "DataFormats/FEDRawData/interface/FEDRawData.h" -#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" -#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" -#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" -#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" -#include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h" -#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" -#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" -#include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h" -#include "FWCore/Framework/interface/ConsumesCollector.h" -#include "FWCore/Framework/interface/ESHandle.h" -#include "FWCore/Framework/interface/ESTransientHandle.h" -#include "FWCore/Framework/interface/ESWatcher.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include
"FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/PluginManager/interface/ModuleDef.h" -#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" -#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h" -#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" -#include "RecoTracker/Record/interface/CkfComponentsRecord.h" - -#include "SiPixelRawToClusterGPUKernel.h" -#include "siPixelRawToClusterHeterogeneousProduct.h" -#include "PixelThresholdClusterizer.h" - -namespace { - struct AccretionCluster { - typedef unsigned short UShort; - static constexpr UShort MAXSIZE = 256; - UShort adc[MAXSIZE]; - UShort x[MAXSIZE]; - UShort y[MAXSIZE]; - UShort xmin=16000; - UShort ymin=16000; - unsigned int isize=0; - int charge=0; - - void clear() { - isize=0; - charge=0; - xmin=16000; - ymin=16000; - } - - bool add(SiPixelCluster::PixelPos const & p, UShort const iadc) { - if (isize==MAXSIZE) return false; - xmin=std::min(xmin,(unsigned short)(p.row())); - ymin=std::min(ymin,(unsigned short)(p.col())); - adc[isize]=iadc; - x[isize]=p.row(); - y[isize++]=p.col(); - charge+=iadc; - return true; - } - }; - - constexpr uint32_t dummydetid = 0xffffffff; -} - -class SiPixelRawToClusterHeterogeneous: public HeterogeneousEDProducer > { -public: - using CPUProduct = siPixelRawToClusterHeterogeneousProduct::CPUProduct; - using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - using Output = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster; - - explicit SiPixelRawToClusterHeterogeneous(const edm::ParameterSet& iConfig); - ~SiPixelRawToClusterHeterogeneous() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - // CPU implementation - void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; - - // GPU implementation - void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& ev, unsigned int nDigis, pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; - - // Commonalities - const FEDRawDataCollection *initialize(const edm::Event& ev, const edm::EventSetup& es); - - std::unique_ptr cabling_; - const SiPixelQuality *badPixelInfo_ = nullptr; - const SiPixelFedCablingMap *cablingMap_ = nullptr; -std::unique_ptr regions_; - edm::EDGetTokenT tFEDRawDataCollection; - - bool includeErrors; - bool useQuality; - bool debug; - std::vector tkerrorlist; - std::vector usererrorlist; - std::vector fedIds; - - edm::ESWatcher recordWatcher; - edm::ESWatcher qualityWatcher; - - bool usePilotBlade; - bool usePhase1; - bool convertADCtoElectrons; - std::string cablingMapLabel; - - // clusterizer - PixelThresholdClusterizer clusterizer_; - const TrackerGeometry *geom_ = nullptr; - const TrackerTopology *ttopo_ = nullptr; - - // gain calib - SiPixelGainCalibrationForHLTService theSiPixelGainCalibration_; - - // GPU algo - 
pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; - PixelDataFormatter::Errors errors_; - - bool enableTransfer_; - bool enableConversion_; -}; - -SiPixelRawToClusterHeterogeneous::SiPixelRawToClusterHeterogeneous(const edm::ParameterSet& iConfig): - HeterogeneousEDProducer(iConfig), - clusterizer_(iConfig), - theSiPixelGainCalibration_(iConfig) { - includeErrors = iConfig.getParameter("IncludeErrors"); - useQuality = iConfig.getParameter("UseQualityInfo"); - tkerrorlist = iConfig.getParameter > ("ErrorList"); - usererrorlist = iConfig.getParameter > ("UserErrorList"); - tFEDRawDataCollection = consumes (iConfig.getParameter("InputLabel")); - - enableConversion_ = iConfig.getParameter("gpuEnableConversion"); - enableTransfer_ = enableConversion_ || iConfig.getParameter("gpuEnableTransfer"); - - clusterizer_.setSiPixelGainCalibrationService(&theSiPixelGainCalibration_); - - // Products in GPU - produces(); - // Products in CPU - if(enableConversion_) { - produces>(); - if(includeErrors) { - produces>(); - produces(); - produces("UserErrorModules"); - produces(); - produces>(); - } - } - - // regions - if(!iConfig.getParameter("Regions").getParameterNames().empty()) { - regions_ = std::make_unique(iConfig, consumesCollector()); - } - - // Control the usage of pilot-blade data, FED=40 - usePilotBlade = iConfig.getParameter ("UsePilotBlade"); - if(usePilotBlade) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; - - // Control the usage of phase1 - usePhase1 = iConfig.getParameter ("UsePhase1"); - if(usePhase1) edm::LogInfo("SiPixelRawToCluster") << " Using phase1"; - - //CablingMap could have a label //Tav - cablingMapLabel = iConfig.getParameter ("CablingMapLabel"); - - convertADCtoElectrons = iConfig.getParameter("ConvertADCtoElectrons"); -} - -void SiPixelRawToClusterHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("IncludeErrors",true); - desc.add("UseQualityInfo",false); - { - std::vector temp1; - temp1.reserve(1); - temp1.push_back(29); - desc.add >("ErrorList",temp1)->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); - } - { - std::vector temp1; - temp1.reserve(1); - temp1.push_back(40); - desc.add >("UserErrorList",temp1)->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); - } - desc.add("InputLabel",edm::InputTag("rawDataCollector")); - { - edm::ParameterSetDescription psd0; - psd0.addOptional>("inputs"); - psd0.addOptional>("deltaPhi"); - psd0.addOptional>("maxZ"); - psd0.addOptional("beamSpot"); - desc.add("Regions",psd0)->setComment("## Empty Regions PSet means complete unpacking"); - } - desc.add("UsePilotBlade",false)->setComment("## Use pilot blades"); - desc.add("UsePhase1",false)->setComment("## Use phase1"); - desc.add("CablingMapLabel","")->setComment("CablingMap label"); //Tav - desc.addOptional("CheckPixelOrder"); // never used, kept for back-compatibility - - desc.add("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, required for clustering"); - - // clusterizer - desc.add("ChannelThreshold", 1000); - desc.add("SeedThreshold", 1000); - desc.add("ClusterThreshold", 4000); - desc.add("ClusterThreshold_L1", 4000); - desc.add("VCaltoElectronGain", 65); - desc.add("VCaltoElectronGain_L1", 65); - desc.add("VCaltoElectronOffset", -414); - desc.add("VCaltoElectronOffset_L1", -414); - desc.add("MissCalibrate", true); -
desc.add("SplitClusters", false); - desc.add("ElectronPerADCGain", 135.); - // Phase 2 clusterizer - desc.add("Phase2Calibration", false); - desc.add("Phase2ReadoutMode", -1); - desc.add("Phase2DigiBaseline", 1200.); - desc.add("Phase2KinkADC", 8); - - desc.add("gpuEnableTransfer", true); - desc.add("gpuEnableConversion", true); - - HeterogeneousEDProducer::fillPSetDescription(desc); - - descriptions.add("siPixelClustersHeterogeneousDefault",desc); -} - -const FEDRawDataCollection *SiPixelRawToClusterHeterogeneous::initialize(const edm::Event& ev, const edm::EventSetup& es) { - debug = edm::MessageDrop::instance()->debugEnabled; - - // setup gain calibration service - theSiPixelGainCalibration_.setESObjects( es ); - - // initialize cabling map or update if necessary - if (recordWatcher.check( es )) { - // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) - edm::ESTransientHandle cablingMap; - es.get().get( cablingMapLabel, cablingMap ); //Tav - cablingMap_ = cablingMap.product(); - fedIds = cablingMap->fedIds(); - cabling_ = cablingMap->cablingTree(); - LogDebug("map version:")<< cabling_->version(); - } - // initialize quality record or update if necessary - if (qualityWatcher.check( es )&&useQuality) { - // quality info for dead pixel modules or ROCs - edm::ESHandle qualityInfo; - es.get().get( qualityInfo ); - badPixelInfo_ = qualityInfo.product(); - if (!badPixelInfo_) { - edm::LogError("SiPixelQualityNotPresent")<<" Configured to use SiPixelQuality, but SiPixelQuality not present"; - } - } - - // tracker geometry: to make sure numbering of DetId is consistent... - edm::ESHandle geom; - es.get().get(geom); - geom_ = geom.product(); - - edm::ESHandle trackerTopologyHandle; - es.get().get(trackerTopologyHandle); - ttopo_ = trackerTopologyHandle.product(); - - if (regions_) { - regions_->run(ev, es); - LogDebug("SiPixelRawToCluster") << "region2unpack #feds: "<nFEDs(); - LogDebug("SiPixelRawToCluster") << "region2unpack #modules (BPIX,EPIX,total): "<nBarrelModules()<<" "<nForwardModules()<<" "<nModules(); - } - - edm::Handle buffers; - ev.getByToken(tFEDRawDataCollection, buffers); - return buffers.product(); -} - - -// ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, const edm::EventSetup& es) -{ - const auto buffers = initialize(ev.event(), es); - - // create product (digis & errors) - auto collection = std::make_unique>(); - auto errorcollection = std::make_unique>(); - auto tkerror_detidcollection = std::make_unique(); - auto usererror_detidcollection = std::make_unique(); - auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); - auto outputClusters = std::make_unique(); - // output->collection.reserve(8*1024); - - - PixelDataFormatter formatter(cabling_.get(), usePhase1); // for phase 1 & 0 - formatter.setErrorStatus(includeErrors); - if (useQuality) formatter.setQualityStatus(useQuality, badPixelInfo_); - - bool errorsInEvent = false; - PixelDataFormatter::DetErrors nodeterrors; - - if (regions_) { - formatter.setModulesToUnpack(regions_->modulesToUnpack()); - } - - for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { - int fedId = *aFed; - - if(!usePilotBlade && (fedId==40) ) continue; // skip pilot blade data - - if (regions_ && !regions_->mayUnpackFED(fedId)) continue; - - if(debug) LogDebug("SiPixelRawToCluster")<< " PRODUCE DIGI FOR FED: " << fedId; - - PixelDataFormatter::Errors 
errors; - - //get event data for this fed - const FEDRawData& fedRawData = buffers->FEDData( fedId ); - - //convert data to digi and strip off errors - formatter.interpretRawData( errorsInEvent, fedId, fedRawData, *collection, errors); - - //pack errors into collection - if(includeErrors) { - typedef PixelDataFormatter::Errors::iterator IE; - for (IE is = errors.begin(); is != errors.end(); is++) { - uint32_t errordetid = is->first; - if (errordetid==dummydetid) { // errors given dummy detId must be sorted by Fed - nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); - } else { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); - errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); - // Fill detid of the detectors where there is error AND the error number is listed - // in the configurable error list in the job option cfi. - // Code needs to be here, because there can be a set of errors for each - // entry in the for loop over PixelDataFormatter::Errors - - std::vector disabledChannelsDetSet; - - for (auto const& aPixelError : errorDetSet) { - // For the time being, we extend the error handling functionality with ErrorType 25 - // In the future, we should sort out how the usage of tkerrorlist can be generalized - if (aPixelError.getType()==25) { - assert(aPixelError.getFedId()==fedId); - const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); - if (fed) { - cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); - const sipixelobjects::PixelFEDLink* link = fed->link(linkId); - if (link) { - // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it - // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme - PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; - for (unsigned int iRoc=1; iRoc<=link->numberOfROCs(); iRoc++) { - const sipixelobjects::PixelROC * roc = link->roc(iRoc); - if (roc->idInDetUnit()idInDetUnit(); - if (roc->idInDetUnit()>ch.roc_last) ch.roc_last=roc->idInDetUnit(); - } - disabledChannelsDetSet.push_back(ch); - } - } - } else { - // fill list of detIds to be turned off by tracking - if(!tkerrorlist.empty()) { - auto it_find = std::find(tkerrorlist.begin(), tkerrorlist.end(), aPixelError.getType()); - if(it_find != tkerrorlist.end()){ - tkerror_detidcollection->push_back(errordetid); - } - } - } - - // fill list of detIds with errors to be studied - if(!usererrorlist.empty()) { - auto it_find = std::find(usererrorlist.begin(), usererrorlist.end(), aPixelError.getType()); - if(it_find != usererrorlist.end()){ - usererror_detidcollection->push_back(errordetid); - } - } - - } // loop on DetSet of errors - - if (!disabledChannelsDetSet.empty()) { - disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); - } - - } // if error assigned to a real DetId - } // loop on errors in event for this FED - } // if errors to be included in the event - } // loop on FED data to be unpacked - - if(includeErrors) { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); - errorDetSet.data = nodeterrors; - } - if (errorsInEvent) LogDebug("SiPixelRawToCluster") << "Error words were stored in this event"; - - // clusterize, originally from SiPixelClusterProducer - for(const auto detset: *collection) { - const auto detId = DetId(detset.detId()); - - std::vector badChannels; // why do we need this? 
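Stepping back from the deleted legacy code for a moment: the error-packing loop above separates errors filed under the dummy detid (0xffffffff), which are only meaningful per event, from errors tied to a real DetId, which go into per-detector sets. Below is a condensed, self-contained sketch of just that separation, using simplified stand-in types; RawDataError, Errors, packErrors and perDetector are illustrative names, not the CMSSW classes, and the PixelFEDChannel handling for ErrorType 25 done by the real code is omitted.

#include <cstdint>
#include <map>
#include <vector>

// Simplified stand-ins for SiPixelRawDataError and PixelDataFormatter::Errors.
struct RawDataError {
  uint32_t word;
  int type;
  int fedId;
};
using Errors = std::map<uint32_t, std::vector<RawDataError>>;

constexpr uint32_t dummyDetId = 0xffffffff;

// Split the per-DetId error map the way produceCPU does: errors recorded
// under the dummy detid are appended to a single per-event list
// (nodeterrors), everything else to a per-detector collection.
void packErrors(const Errors& errors,
                std::vector<RawDataError>& nodeterrors,
                std::map<uint32_t, std::vector<RawDataError>>& perDetector) {
  for (const auto& [detid, errs] : errors) {
    if (detid == dummyDetId) {
      nodeterrors.insert(nodeterrors.end(), errs.begin(), errs.end());
    } else {
      auto& detSet = perDetector[detid];  // plays the role of find_or_insert
      detSet.insert(detSet.end(), errs.begin(), errs.end());
    }
  }
}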
- - // Comment: At the moment the clusterizer depends on geometry - // to access information as the pixel topology (number of columns - // and rows in a detector module). - // In the future the geometry service will be replaced with - // a ES service. - const GeomDetUnit * geoUnit = geom_->idToDetUnit( detId ); - const PixelGeomDetUnit * pixDet = dynamic_cast(geoUnit); - edmNew::DetSetVector::FastFiller spc(*outputClusters, detset.detId()); - clusterizer_.clusterizeDetUnit(detset, pixDet, ttopo_, badChannels, spc); - if ( spc.empty() ) { - spc.abort(); - } - } - outputClusters->shrink_to_fit(); - - //send digis and errors back to framework - ev.put(std::move(collection)); - if(includeErrors){ - ev.put(std::move(errorcollection)); - ev.put(std::move(tkerror_detidcollection)); - ev.put(std::move(usererror_detidcollection), "UserErrorModules"); - ev.put(std::move(disabled_channelcollection)); - } - ev.put(std::move(outputClusters)); -} - -// ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - const auto buffers = initialize(ev.event(), es); - - edm::ESHandle hgpuMap; - es.get().get(hgpuMap); - if(hgpuMap->hasQuality() != useQuality) { - throw cms::Exception("LogicError") << "UseQuality of the module (" << useQuality<< ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."; - } - // get the GPU product already here so that the async transfer can begin - const auto *gpuMap = hgpuMap->getGPUProductAsync(cudaStream); - - edm::cuda::device::unique_ptr modulesToUnpackRegional; - const unsigned char *gpuModulesToUnpack; - if (regions_) { - modulesToUnpackRegional = hgpuMap->getModToUnpRegionalAsync(*(regions_->modulesToUnpack()), cudaStream); - gpuModulesToUnpack = modulesToUnpackRegional.get(); - } - else { - gpuModulesToUnpack = hgpuMap->getModToUnpAllAsync(cudaStream); - } - - - edm::ESHandle hgains; - es.get().get(hgains); - - errors_.clear(); - - // GPU specific: Data extraction for RawToDigi GPU - unsigned int wordCounterGPU = 0; - unsigned int fedCounter = 0; - bool errorsInEvent = false; - - // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() - ErrorChecker errorcheck; - auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(cudaStream); - for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { - int fedId = *aFed; - - if (!usePilotBlade && (fedId==40) ) continue; // skip pilot blade data - if (regions_ && !regions_->mayUnpackFED(fedId)) continue; - - // for GPU - // first 150 index stores the fedId and next 150 will store the - // start index of word in that fed - assert(fedId>=1200); - fedCounter++; - - // get event data for this fed - const FEDRawData& rawData = buffers->FEDData( fedId ); - - // GPU specific - int nWords = rawData.size()/sizeof(cms_uint64_t); - if (nWords == 0) { - continue; - } - - // check CRC bit - const cms_uint64_t* trailer = reinterpret_cast(rawData.data())+(nWords-1); - if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) { - continue; - } - - // check headers - const cms_uint64_t* header = reinterpret_cast(rawData.data()); header--; - bool moreHeaders = true; - while (moreHeaders) { - header++; - bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_); - moreHeaders = headerStatus; - } - - // check trailers - bool moreTrailers = true; - trailer++; - 
while (moreTrailers) { - trailer--; - bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_); - moreTrailers = trailerStatus; - } - - const cms_uint32_t * bw = (const cms_uint32_t *)(header+1); - const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); - - assert(0 == (ew-bw)%2); - wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); - wordCounterGPU+=(ew-bw); - - } // end of for loop - - gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, hgains->getGPUProductAsync(cudaStream), - wordFedAppender, - wordCounterGPU, fedCounter, convertADCtoElectrons, - useQuality, includeErrors, enableTransfer_, debug, cudaStream); -} - -void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_.getProduct()); - - if(enableConversion_) { - convertGPUtoCPU(ev.event(), output->nDigis, gpuAlgo_.getCPUData()); - } - - ev.put(std::move(output), heterogeneous::DisableTransfer{}); -} - -void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, - unsigned int nDigis, - pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData digis_clusters_h) const { - // TODO: add the transfers here as well? - - auto collection = std::make_unique>(); - auto errorcollection = std::make_unique>(); - auto tkerror_detidcollection = std::make_unique(); - auto usererror_detidcollection = std::make_unique(); - auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); - auto outputClusters = std::make_unique(); - - edm::DetSet * detDigis=nullptr; - for (uint32_t i = 0; i < nDigis; i++) { - if (digis_clusters_h.pdigi[i]==0) continue; - detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); - if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations - break; - } - - int32_t nclus=-1; - std::vector aclusters(1024); - auto totCluseFilled=0; - - auto fillClusters = [&](uint32_t detId){ - if (nclus<0) return; // this in reality should never happen - edmNew::DetSetVector::FastFiller spc(*outputClusters, detId); - auto layer = (DetId(detId).subdetId()==1) ? ttopo_->pxbLayer(detId) : 0; - auto clusterThreshold = (layer==1) ? 2000 : 4000; - for (int32_t ic=0; ic9000) continue; // not in cluster - assert(digis_clusters_h.rawIdArr[i] > 109999); - if ( (*detDigis).detId() != digis_clusters_h.rawIdArr[i]) - { - fillClusters((*detDigis).detId()); - assert(nclus==-1); - detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); - if ( (*detDigis).empty() ) - (*detDigis).data.reserve(32); // avoid the first relocations - else { std::cout << "Problem det present twice in input! 
" << (*detDigis).detId() << std::endl; } - } - (*detDigis).data.emplace_back(digis_clusters_h.pdigi[i]); - auto const & dig = (*detDigis).data.back(); - // fill clusters - assert(digis_clusters_h.clus[i]>=0); - assert(digis_clusters_h.clus[i]<1024); - nclus = std::max(digis_clusters_h.clus[i],nclus); - auto row = dig.row(); - auto col = dig.column(); - SiPixelCluster::PixelPos pix(row,col); - aclusters[digis_clusters_h.clus[i]].add(pix, digis_clusters_h.adc[i]); - } - - // fill final clusters - fillClusters((*detDigis).detId()); - //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; - - PixelDataFormatter formatter(cabling_.get(), usePhase1); // for phase 1 & 0 - auto errors = errors_; // make a copy - PixelDataFormatter::DetErrors nodeterrors; - - auto size = digis_clusters_h.error->size(); - for (auto i = 0; i < size; i++) { - pixelgpudetails::error_obj err = (*digis_clusters_h.error)[i]; - if (err.errorType != 0) { - SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); - errors[err.rawId].push_back(error); - } - } - - // pack errors into collection - if (includeErrors) { - - typedef PixelDataFormatter::Errors::iterator IE; - for (IE is = errors.begin(); is != errors.end(); is++) { - - uint32_t errordetid = is->first; - if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed - nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); - } - else { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); - errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); - // Fill detid of the detectors where there is error AND the error number is listed - // in the configurable error list in the job option cfi. - // Code needs to be here, because there can be a set of errors for each - // entry in the for loop over PixelDataFormatter::Errors - - std::vector disabledChannelsDetSet; - - for (auto const& aPixelError : errorDetSet) { - // For the time being, we extend the error handling functionality with ErrorType 25 - // In the future, we should sort out how the usage of tkerrorlist can be generalized - if (aPixelError.getType() == 25) { - int fedId = aPixelError.getFedId(); - const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); - if (fed) { - cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); - const sipixelobjects::PixelFEDLink* link = fed->link(linkId); - if (link) { - // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it - // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme - PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; - for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { - const sipixelobjects::PixelROC * roc = link->roc(iRoc); - if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); - if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); - } - if (ch.roc_firstpush_back(errordetid); - } - } - } - - // fill list of detIds with errors to be studied - if (!usererrorlist.empty()) { - auto it_find = std::find(usererrorlist.begin(), usererrorlist.end(), aPixelError.getType()); - if (it_find != usererrorlist.end()) { - usererror_detidcollection->push_back(errordetid); - } - } - - } // loop on DetSet of errors - - if (!disabledChannelsDetSet.empty()) { - disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); - } - - } // 
if error assigned to a real DetId - } // loop on errors in event for this FED - } // if errors to be included in the event - - if (includeErrors) { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); - errorDetSet.data = nodeterrors; - } - - ev.put(std::move(collection)); - if(includeErrors){ - ev.put(std::move(errorcollection)); - ev.put(std::move(tkerror_detidcollection)); - ev.put(std::move(usererror_detidcollection), "UserErrorModules"); - ev.put(std::move(disabled_channelcollection)); - } - ev.put(std::move(outputClusters)); -} - -// define as framework plugin -DEFINE_FWK_MODULE(SiPixelRawToClusterHeterogeneous); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h deleted file mode 100644 index 3b81e4a16f017..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h -#define EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h - -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "FWCore/Utilities/interface/typedefs.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -namespace siPixelRawToClusterHeterogeneousProduct { - using CPUProduct = int; // dummy... - - struct error_obj { - uint32_t rawId; - uint32_t word; - unsigned char errorType; - unsigned char fedId; - }; - - // FIXME split in two - struct GPUProduct { - GPUProduct() = default; - GPUProduct(const GPUProduct&) = delete; - GPUProduct& operator=(const GPUProduct&) = delete; - GPUProduct(GPUProduct&&) = default; - GPUProduct& operator=(GPUProduct&&) = default; - - GPUProduct(SiPixelDigisCUDA&& digis, - SiPixelClustersCUDA&& clusters, - uint32_t ndig, uint32_t nmod, uint32_t nclus): - digis_d(std::move(digis)), clusters_d(std::move(clusters)), - nDigis(ndig), nModules(nmod), nClusters(nclus) - {} - - SiPixelDigisCUDA digis_d; - SiPixelClustersCUDA clusters_d; - - uint32_t nDigis; - uint32_t nModules; - uint32_t nClusters; - }; - - using HeterogeneousDigiCluster = HeterogeneousProductImpl, - heterogeneous::GPUCudaProduct >; -} - -#endif diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py index bb0bb85697a99..b9c6862b015bf 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py @@ -2,10 +2,16 @@ from CondTools.SiPixel.SiPixelGainCalibrationService_cfi import * from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizer_cfi import siPixelClusters as _siPixelClusters -siPixelClustersPreSplitting = _siPixelClusters.clone() +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelClustersPreSplitting = SwitchProducerCUDA( + cpu = _siPixelClusters.clone() +) from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import siPixelClustersHeterogeneous as _siPixelClustersHeterogeneous -from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * 
-from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * -gpu.toReplaceWith(siPixelClustersPreSplitting, _siPixelClustersHeterogeneous.clone()) +gpu.toModify(siPixelClustersPreSplitting, + cuda = cms.EDAlias( + siPixelDigisClustersPreSplitting = cms.VPSet( + cms.PSet(type = cms.string("SiPixelClusteredmNewDetSetVector")) + ) + ) +) diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py new file mode 100644 index 0000000000000..c80f3b16b3a43 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py @@ -0,0 +1,21 @@ +import FWCore.ParameterSet.Config as cms + +from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import siPixelClustersPreSplitting +from RecoLocalTracker.SiPixelClusterizer.siPixelRawToClusterCUDA_cfi import siPixelRawToClusterCUDA as _siPixelRawToClusterCUDA +from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA +from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * +from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * + +siPixelClustersPreSplittingTask = cms.Task(siPixelClustersPreSplitting) + +siPixelClustersCUDAPreSplitting = _siPixelRawToClusterCUDA.clone() +siPixelDigisClustersPreSplitting = _siPixelDigisClustersFromSoA.clone() +siPixelClustersPreSplittingTaskCUDA = cms.Task( + siPixelClustersCUDAPreSplitting, + siPixelDigisClustersPreSplitting, +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelClustersPreSplittingTask_gpu = siPixelClustersPreSplittingTask.copy() +_siPixelClustersPreSplittingTask_gpu.add(siPixelClustersPreSplittingTaskCUDA) +gpu.toReplaceWith(siPixelClustersPreSplittingTask, _siPixelClustersPreSplittingTask_gpu) diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index b652100f69e9f..e8726100abe0e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -135,7 +135,7 @@ const unsigned char *SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cuda::s return data.modToUnpDefault; } -edm::cuda::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const { +cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const { edm::Service cs; auto modToUnpDevice = cs->make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); auto modToUnpHost = cs->make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 947cd20d97919..80be13dedd26b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -129,18 +129,19 @@ namespace pixelgpudetails { #endif } - void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input, + void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, float const * bs, pixelCPEforGPU::ParamsOnGPU 
const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream) { cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusters_d.clusModuleStart(); + gpu_.hitsModuleStart_d = clusters_d.clusModuleStart(); gpu_.cpeParams = cpeParams; // copy it for use in clients cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; - int blocks = input.nModules; // active modules (with digis) + int blocks = digis_d.nModules(); // active modules (with digis) #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; @@ -148,12 +149,12 @@ namespace pixelgpudetails { gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, - input.digis_d.moduleInd(), - input.digis_d.xx(), input.digis_d.yy(), input.digis_d.adc(), - input.clusters_d.moduleStart(), - input.clusters_d.clusInModule(), input.clusters_d.moduleId(), - input.clusters_d.clus(), - input.nDigis, + digis_d.moduleInd(), + digis_d.xx(), digis_d.yy(), digis_d.adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), clusters_d.moduleId(), + digis_d.clus(), + digis_d.nDigis(), gpu_.hitsModuleStart_d, gpu_.charge_d, gpu_.detInd_d, @@ -170,7 +171,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // needed only if hits on CPU are required... - nhits_ = input.nClusters; + nhits_ = clusters_d.nClusters(); if(transferToCPU) { cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index dcc80308c4463..49164d24ab335 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -1,7 +1,8 @@ #ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h #define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" #include @@ -31,7 +32,8 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - void makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input, + void makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 68f53a47157d4..d8e07667f976b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -1,3 +1,6 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include 
"DataFormats/BeamSpot/interface/BeamSpot.h" @@ -13,25 +16,25 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" // TODO: we need a proper place for this header... - #include "PixelRecHits.h" // TODO : spit product from kernel +#include + class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer > { public: - using Input = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster; - using CPUProduct = siPixelRecHitsHeterogeneousProduct::CPUProduct; using GPUProduct = siPixelRecHitsHeterogeneousProduct::GPUProduct; using Output = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; @@ -62,7 +65,9 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer tBeamSpot; - edm::EDGetTokenT token_; + // The mess with inputs will be cleaned up when migrating to the new framework + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; edm::EDGetTokenT clusterToken_; std::string cpeName_; @@ -78,8 +83,8 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer(iConfig.getParameter("beamSpot"))), - token_(consumesHeterogeneous(iConfig.getParameter("heterogeneousSrc"))), - clusterToken_(consumes(iConfig.getParameter("src"))), + token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), + tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), cpeName_(iConfig.getParameter("CPE")) { enableConversion_ = iConfig.getParameter("gpuEnableConversion"); @@ -87,6 +92,7 @@ SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet& produces(); if(enableConversion_) { + clusterToken_ = consumes(iConfig.getParameter("src")); produces(); } } @@ -95,7 +101,7 @@ void SiPixelRecHitHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); - desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersPreSplitting")); + desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); desc.add("CPE", "PixelCPEFast"); @@ -118,16 +124,7 @@ void SiPixelRecHitHeterogeneous::initialize(const edm::EventSetup& es) { } void SiPixelRecHitHeterogeneous::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - initialize(iSetup); - - edm::Handle hclusters; - iEvent.getByToken(clusterToken_, hclusters); - - auto output = std::make_unique(); - run(hclusters, *output); - - output->shrink_to_fit(); - iEvent.put(std::move(output)); + throw cms::Exception("NotImplemented") << "CPU version is no longer implemented"; } void SiPixelRecHitHeterogeneous::run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output) const { @@ -174,8 +171,28 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const 
edm::HeterogeneousEvent& i throw cms::Exception("Configuration") << "too bad, not a fast CPE: GPU processing is not possible"; } - edm::Handle hinput; - iEvent.getByToken(token_, hinput); + edm::Handle> hclusters; + iEvent.getByToken(token_, hclusters); + // temporary check (until the migration) + edm::Service cs; + assert(hclusters->device() == cs->getCurrentDevice()); + CUDAScopedContext ctx{*hclusters}; + auto const& clusters = ctx.get(*hclusters); + + edm::Handle> hdigis; + iEvent.getByToken(tokenDigi_, hdigis); + auto const& digis = ctx.get(*hdigis); + + // We're processing in a stream given by base class, so need to + // synchronize explicitly (implementation is from + // CUDAScopedContext). In practice these should not be needed + // (because of synchronizations upstream), but let's play generic. + if(not hclusters->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event().id(), 0)); + } + if(not hdigis->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hdigis->event().id(), 0)); + } edm::Handle bsHandle; iEvent.getByToken( tBeamSpot, bsHandle); @@ -185,8 +202,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i bs[0]=bsh.x0(); bs[1]=bsh.y0(); bs[2]=bsh.z0(); } - - gpuAlgo_->makeHitsAsync(*hinput, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); + gpuAlgo_->makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); } diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 15224adb78cc3..58935e9a6991c 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) # Disable conversions to legacy - process.siPixelClustersPreSplitting.gpuEnableConversion = False process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False process.pixelTracks.gpuEnableConversion = False @@ -33,7 +32,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process = customizePixelTracksForProfilingDisableConversion(process) # Disable "unnecessary" transfers to CPU - process.siPixelClustersPreSplitting.gpuEnableTransfer = False process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False process.pixelVertices.gpuEnableTransfer = False diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 9f5d10ad020e9..767d140a5d5ed 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -18,6 +18,7 @@ + diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index b402daef07a05..dfa08c1fa2043 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -3,8 +3,6 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -14,7 +12,7 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; __global__ -void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const SiPixelClustersCUDA::DeviceConstView *cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) +void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { assert(slp == slp->me_d); @@ -35,7 +33,7 @@ void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i)); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + cc->clus(i); + auto cl = first + dd->clus(i); assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; @@ -162,7 +160,7 @@ namespace clusterSLOnGPU { cudaCheck(cudaMemsetAsync(slgpu.n2_d, 0, (ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream)); } - void Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) { + void Kernel::algo(SiPixelDigisCUDA const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) { zero(stream.id()); ClusterSLGPU const & sl = slgpu; @@ -177,7 +175,7 @@ namespace clusterSLOnGPU { blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; assert(sl.me_d); - simLink<<>>(dd.digis_d.view(), ndigis, dd.clusters_d.view(), hh.gpu_d, sl.me_d, n); + simLink<<>>(dd.view(), ndigis, hh.gpu_d, sl.me_d, n); cudaCheck(cudaGetLastError()); if (doDump) { diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h index 00b0e34b301c8..23976cb418e16 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h @@ -4,8 +4,8 @@ #include #include +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "trackerHitAssociationHeterogeneousProduct.h" @@ -15,7 +15,6 @@ namespace clusterSLOnGPU { using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct; - using DigisOnGPU = siPixelRawToClusterHeterogeneousProduct::GPUProduct; using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; @@ -23,7 +22,7 @@ namespace clusterSLOnGPU { public: Kernel(cuda::stream_t<>& stream, bool dump); ~Kernel() {deAlloc();} - void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream); + void algo(SiPixelDigisCUDA const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream); GPUProduct getProduct() { return GPUProduct{slgpu.me_d};} private: diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc 
b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc index a5e0f403adcad..e9e271e1e58cc 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc @@ -4,6 +4,8 @@ #include +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" @@ -22,15 +24,16 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "SimDataFormats/Track/interface/SimTrackContainer.h" #include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h" @@ -52,7 +55,6 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer> phase2OTClustersToken_; edm::EDGetTokenT trackingParticleToken_; - edm::EDGetTokenT tGpuDigis; + edm::EDGetTokenT> tGpuDigis; edm::EDGetTokenT tGpuHits; std::unique_ptr gpuAlgo; @@ -111,7 +113,7 @@ ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm:: stripClustersToken_(consumes>(cfg.getParameter("stripClusterSrc"))), phase2OTClustersToken_(consumes>(cfg.getParameter("phase2OTClusterSrc"))), trackingParticleToken_(consumes(cfg.getParameter("trackingParticleSrc"))), - tGpuDigis(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), + tGpuDigis(consumes>(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), tGpuHits(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelRecHitSrc"))), doDump(cfg.getParameter("dumpCSV")) { @@ -128,7 +130,7 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr desc.add("stripClusterSrc", edm::InputTag("siStripClusters")); desc.add("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters")); desc.add("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth")); - desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersPreSplitting")); + desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplitting")); desc.add("dumpCSV", false); @@ -184,13 +186,27 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE // gpu stuff ------------------------ - edm::Handle gd; + edm::Handle> gd; + iEvent.getByToken(tGpuDigis, gd); + // temporary check (until the migration) + edm::Service cs; + assert(gd->device() == 
cs->getCurrentDevice()); + + CUDAScopedContext ctx{*gd}; + auto const &gDigis = ctx.get(*gd); + + // We're processing in a stream given by base class, so need to + // synchronize explicitly (implementation is from + // CUDAScopedContext). In practice these should not be needed + // (because of synchronizations upstream), but let's play generic. + if(not gd->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), gd->event().id(), 0)); + } + edm::Handle gh; - iEvent.getByToken(tGpuDigis, gd); iEvent.getByToken(tGpuHits, gh); - auto const & gDigis = *gd; auto const & gHits = *gh; - auto ndigis = gDigis.nDigis; + auto ndigis = gDigis.nDigis(); auto nhits = gHits.nHits; digi2tp.clear();
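For reference, the stream-to-stream synchronization that both consumers above perform through the event owned by CUDAProductBase reduces to a plain CUDA runtime idiom: the producer records an event on its stream after queuing work, and the consumer makes its own stream wait on that event before queuing dependent work. Below is a minimal standalone sketch of this pattern using the raw CUDA runtime API rather than the cuda-api-wrappers and CMSSW services used in the code above; all names (produce, consume, ready, CUDA_CHECK) are illustrative.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                           \
  do {                                                             \
    cudaError_t err__ = (call);                                    \
    if (err__ != cudaSuccess) {                                    \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err__));  \
      std::exit(EXIT_FAILURE);                                     \
    }                                                              \
  } while (0)

__global__ void produce(int* data) { data[threadIdx.x] = threadIdx.x; }
__global__ void consume(int* data) { data[threadIdx.x] *= 2; }

int main() {
  cudaStream_t producerStream, consumerStream;
  cudaEvent_t ready;
  CUDA_CHECK(cudaStreamCreate(&producerStream));
  CUDA_CHECK(cudaStreamCreate(&consumerStream));
  // A timing-disabled event is cheaper; it serves synchronization only,
  // matching the role of the event held by CUDAProductBase.
  CUDA_CHECK(cudaEventCreateWithFlags(&ready, cudaEventDisableTiming));

  int* data = nullptr;
  CUDA_CHECK(cudaMalloc(&data, 32 * sizeof(int)));

  // Producer side: queue work, then record "product is ready" on its stream.
  produce<<<1, 32, 0, producerStream>>>(data);
  CUDA_CHECK(cudaEventRecord(ready, producerStream));

  // Consumer side: the analogue of the `if (not event().has_occurred())`
  // checks above. cudaEventQuery returns cudaSuccess once the event has
  // occurred, so the wait is queued only while the producer is still busy.
  if (cudaEventQuery(ready) != cudaSuccess) {
    CUDA_CHECK(cudaStreamWaitEvent(consumerStream, ready, 0));
  }
  consume<<<1, 32, 0, consumerStream>>>(data);

  CUDA_CHECK(cudaStreamSynchronize(consumerStream));
  CUDA_CHECK(cudaFree(data));
  CUDA_CHECK(cudaEventDestroy(ready));
  CUDA_CHECK(cudaStreamDestroy(producerStream));
  CUDA_CHECK(cudaStreamDestroy(consumerStream));
  return 0;
}

Querying the event first, as the producers above do via has_occurred(), merely avoids queuing a wait that would be a no-op; calling cudaStreamWaitEvent unconditionally would be equally correct.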