diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
index b990c1295e31a..1046b76eef0f7 100644
--- a/CUDADataFormats/Common/BuildFile.xml
+++ b/CUDADataFormats/Common/BuildFile.xml
@@ -1,6 +1,7 @@
-<iftool name="cuda">
-  <use name="HeterogeneousCore/CUDAUtilities"/>
-  <export>
+<use name="cuda-api-wrappers"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+
+<export>
     <lib name="1"/>
-  </export>
-</iftool>
+</export>
diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml
new file mode 100644
index 0000000000000..d34658faa2573
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml
@@ -0,0 +1,9 @@
+<use name="FWCore/ServiceRegistry"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+<use name="cuda-api-wrappers"/>
+<use name="rootcore"/>
+
+<export>
+    <lib name="1"/>
+</export>
+
diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
new file mode 100644
index 0000000000000..f25a8a25f0808
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -0,0 +1,76 @@
+#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelClustersCUDA {  // per-module cluster buffers plus a pointer bundle ("view") for device code
+public:
+  SiPixelClustersCUDA() = default;
+  explicit SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream);
+  ~SiPixelClustersCUDA() = default;
+  // Move-only: copying is disabled, move construction and move assignment are defaulted.
+  SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
+  SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
+  SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
+  SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;
+  // Host-side cluster count: stored by setNClusters(), read back by nClusters().
+  void setNClusters(uint32_t nClusters) {
+    nClusters_h = nClusters;
+  }
+
+  uint32_t nClusters() const { return nClusters_h; }
+  // Mutable raw pointers taken from the unique_ptr members.
+  uint32_t *moduleStart() { return moduleStart_d.get(); }
+  uint32_t *clusInModule() { return clusInModule_d.get(); }
+  uint32_t *moduleId() { return moduleId_d.get(); }
+  uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }
+  // Read-only overloads of the same accessors, selected on const objects.
+  uint32_t const *moduleStart() const { return moduleStart_d.get(); }
+  uint32_t const *clusInModule() const { return clusInModule_d.get(); }
+  uint32_t const *moduleId() const { return moduleId_d.get(); }
+  uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }
+  // Read-only accessors under explicit c_ names.
+  uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
+  uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
+  uint32_t const *c_moduleId() const { return moduleId_d.get(); }
+  uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }
+  // Bundle of the four buffer pointers; its element readers are compiled only under __CUDACC__.
+  class DeviceConstView {
+  public:
+    DeviceConstView() = default;
+
+#ifdef __CUDACC__
+    __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
+    __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
+    __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
+    __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
+#endif
+
+    friend SiPixelClustersCUDA;  // the owning class fills the private pointers below
+
+  private:
+    uint32_t const *moduleStart_;
+    uint32_t const *clusInModule_;
+    uint32_t const *moduleId_;
+    uint32_t const *clusModuleStart_;
+  };
+  // Pointer to the DeviceConstView owned by view_d (returned non-const even from a const object).
+  DeviceConstView *view() const { return view_d.get(); }
+
+private:
+  cudautils::device::unique_ptr<uint32_t[]> moduleStart_d;   // index of the first pixel of each module
+  cudautils::device::unique_ptr<uint32_t[]> clusInModule_d;  // number of clusters found in each module
+  cudautils::device::unique_ptr<uint32_t[]> moduleId_d;      // module id of each module
+
+  // originally from rechits
+  cudautils::device::unique_ptr<uint32_t[]> clusModuleStart_d;
+
+  cudautils::device::unique_ptr<DeviceConstView> view_d;    // "me" pointer
+  // Zero-initialised so nClusters() is well defined before setNClusters() is called.
+  uint32_t nClusters_h = 0;
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
new file mode 100644
index 0000000000000..d88a1b0a6370b
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
@@ -0,0 +1,23 @@
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cudaService;
+  // Two of the buffers hold maxClusters+1 elements, the other two hold maxClusters.
+  moduleStart_d     = cudaService->make_device_unique<uint32_t[]>(maxClusters+1, stream);
+  clusInModule_d    = cudaService->make_device_unique<uint32_t[]>(maxClusters, stream);
+  moduleId_d        = cudaService->make_device_unique<uint32_t[]>(maxClusters, stream);
+  clusModuleStart_d = cudaService->make_device_unique<uint32_t[]>(maxClusters+1, stream);
+  // Gather the four raw buffer addresses in a host-side DeviceConstView.
+  auto hostView = cudaService->make_host_unique<DeviceConstView>(stream);
+  hostView->moduleStart_ = moduleStart_d.get();
+  hostView->clusInModule_ = clusInModule_d.get();
+  hostView->moduleId_ = moduleId_d.get();
+  hostView->clusModuleStart_ = clusModuleStart_d.get();
+  // Allocate the view held by view_d and copy the host-side view into it via copyAsync.
+  view_d = cudaService->make_device_unique<DeviceConstView>(stream);
+  cudautils::copyAsync(view_d, hostView, stream);
+}
diff --git a/CUDADataFormats/SiPixelCluster/src/classes.h b/CUDADataFormats/SiPixelCluster/src/classes.h
new file mode 100644
index 0000000000000..08d46244adc7d
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_SiPixelCluster_classes_h
+#define CUDADataFormats_SiPixelCluster_classes_h
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/SiPixelCluster/src/classes_def.xml b/CUDADataFormats/SiPixelCluster/src/classes_def.xml
new file mode 100644
index 0000000000000..ba0706ac4b8aa
--- /dev/null
+++ b/CUDADataFormats/SiPixelCluster/src/classes_def.xml
@@ -0,0 +1,4 @@
+<lcgdict>
+  <class name="CUDAProduct<SiPixelClustersCUDA>" persistent="false"/>
+  <class name="edm::Wrapper<CUDAProduct<SiPixelClustersCUDA>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml
new file mode 100644
index 0000000000000..29ec13098819c
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml
@@ -0,0 +1,9 @@
+<use name="DataFormats/SiPixelRawData"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+<use name="cuda-api-wrappers"/>
+<use name="rootcore"/>
+
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
new file mode 100644
index 0000000000000..e9c8c0f644722
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
@@ -0,0 +1,40 @@
+#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h
+#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h
+
+#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelDigiErrorsCUDA {  // pixel error vector (header + element storage) plus the formatter errors
+public:
+  SiPixelDigiErrorsCUDA() = default;
+  explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream);
+  ~SiPixelDigiErrorsCUDA() = default;
+  // Copying is disabled; move construction and move assignment are defaulted.
+  SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete;
+  SiPixelDigiErrorsCUDA& operator=(const SiPixelDigiErrorsCUDA&) = delete;
+  SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default;
+  SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default;
+  // Returns the PixelFormatterErrors that was handed to the constructor.
+  const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; }
+  // Raw pointer to the SimpleVector owned by error_d (mutable, const and c_ variants).
+  GPU::SimpleVector<PixelErrorCompact> *error() { return error_d.get(); }
+  GPU::SimpleVector<PixelErrorCompact> const *error() const { return error_d.get(); }
+  GPU::SimpleVector<PixelErrorCompact> const *c_error() const { return error_d.get(); }
+  // A SimpleVector together with the host buffer that owns the elements it points at.
+  using HostDataError = std::pair<GPU::SimpleVector<PixelErrorCompact>, cudautils::host::unique_ptr<PixelErrorCompact[]>>;
+  HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const;
+  // Copies the SimpleVector held by error_d into error_h through cudautils::copyAsync.
+  void copyErrorToHostAsync(cuda::stream_t<>& stream);
+
+private:
+  cudautils::device::unique_ptr<PixelErrorCompact[]> data_d;  // element storage the SimpleVector is built on
+  cudautils::device::unique_ptr<GPU::SimpleVector<PixelErrorCompact>> error_d;  // vector header exposed by error()
+  cudautils::host::unique_ptr<GPU::SimpleVector<PixelErrorCompact>> error_h;  // host copy of the vector header
+  PixelFormatterErrors formatterErrors_h;  // moved in by the constructor
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
new file mode 100644
index 0000000000000..6a52545483eb8
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
@@ -0,0 +1,99 @@
+#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelDigisCUDA {  // per-pixel digi buffers plus a pointer bundle ("view") for device code
+public:
+  SiPixelDigisCUDA() = default;
+  explicit SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream);
+  ~SiPixelDigisCUDA() = default;
+  // Copying is disabled; move construction and move assignment are defaulted.
+  SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete;
+  SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete;
+  SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default;
+  SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default;
+  // Stores the host-side module and digi counts returned by nModules() / nDigis().
+  void setNModulesDigis(uint32_t nModules, uint32_t nDigis) {
+    nModules_h = nModules;
+    nDigis_h = nDigis;
+  }
+
+  uint32_t nModules() const { return nModules_h; }
+  uint32_t nDigis() const { return nDigis_h; }
+  // Mutable raw pointers taken from the unique_ptr members.
+  uint16_t * xx() { return xx_d.get(); }
+  uint16_t * yy() { return yy_d.get(); }
+  uint16_t * adc() { return adc_d.get(); }
+  uint16_t * moduleInd() { return moduleInd_d.get(); }
+  int32_t  * clus() { return clus_d.get(); }
+  uint32_t * pdigi() { return pdigi_d.get(); }
+  uint32_t * rawIdArr() { return rawIdArr_d.get(); }
+  // Read-only overloads of the same accessors, selected on const objects.
+  uint16_t const *xx() const { return xx_d.get(); }
+  uint16_t const *yy() const { return yy_d.get(); }
+  uint16_t const *adc() const { return adc_d.get(); }
+  uint16_t const *moduleInd() const { return moduleInd_d.get(); }
+  int32_t  const *clus() const { return clus_d.get(); } 
+  uint32_t const *pdigi() const { return pdigi_d.get(); }
+  uint32_t const *rawIdArr() const { return rawIdArr_d.get(); }
+  // Read-only accessors under explicit c_ names.
+  uint16_t const *c_xx() const { return xx_d.get(); }
+  uint16_t const *c_yy() const { return yy_d.get(); }
+  uint16_t const *c_adc() const { return adc_d.get(); }
+  uint16_t const *c_moduleInd() const { return moduleInd_d.get(); }
+  int32_t  const *c_clus() const { return clus_d.get(); }
+  uint32_t const *c_pdigi() const { return pdigi_d.get(); }
+  uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); }
+  // Each returns a new host buffer of nDigis() elements filled through cudautils::copyAsync on stream.
+  cudautils::host::unique_ptr<uint16_t[]> adcToHostAsync(cuda::stream_t<>& stream) const;
+  cudautils::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const;
+  cudautils::host::unique_ptr<uint32_t[]> pdigiToHostAsync(cuda::stream_t<>& stream) const;
+  cudautils::host::unique_ptr<uint32_t[]> rawIdArrToHostAsync(cuda::stream_t<>& stream) const;
+
+  class DeviceConstView {  // bundle of five buffer pointers; pdigi and rawIdArr are not part of it
+  public:
+    DeviceConstView() = default;
+
+#ifdef __CUDACC__
+    __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_+i); }
+    __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); }
+    __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); }
+    __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); }
+    __device__ __forceinline__ int32_t  clus(int i) const { return __ldg(clus_+i); }
+#endif
+
+    friend class SiPixelDigisCUDA;  // the owning class fills the private pointers below
+
+  private:
+    uint16_t const *xx_;
+    uint16_t const *yy_;
+    uint16_t const *adc_;
+    uint16_t const *moduleInd_;
+    int32_t  const *clus_;
+  };
+  // Pointer to the DeviceConstView owned by view_d.
+  const DeviceConstView *view() const { return view_d.get(); }
+
+private:
+  // These are consumed by downstream device code
+  cudautils::device::unique_ptr<uint16_t[]> xx_d;        // local coordinates of each pixel
+  cudautils::device::unique_ptr<uint16_t[]> yy_d;        //
+  cudautils::device::unique_ptr<uint16_t[]> adc_d;       // ADC of each pixel
+  cudautils::device::unique_ptr<uint16_t[]> moduleInd_d; // module id of each pixel
+  cudautils::device::unique_ptr<int32_t[]>  clus_d;      // cluster id of each pixel
+  cudautils::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
+
+  // These are for CPU output; should we (eventually) place them to a
+  // separate product?
+  cudautils::device::unique_ptr<uint32_t[]> pdigi_d;
+  cudautils::device::unique_ptr<uint32_t[]> rawIdArr_d;
+
+  uint32_t nModules_h = 0;  // set through setNModulesDigis()
+  uint32_t nDigis_h = 0;    // set through setNModulesDigis(); element count used by the *ToHostAsync() copies
+};
+
+#endif
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
new file mode 100644
index 0000000000000..92aab1ec9d578
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
@@ -0,0 +1,44 @@
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
+
+SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream):
+  formatterErrors_h(std::move(errors))
+{
+  edm::Service<CUDAService> cudaService;
+  // Allocate the vector header and the maxFedWords elements it will refer to.
+  error_d = cudaService->make_device_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
+  data_d = cudaService->make_device_unique<PixelErrorCompact[]>(maxFedWords, stream);
+  // Zero-fill the element storage.
+  cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream);
+  // Build the header on the host so that it points at data_d with capacity maxFedWords.
+  error_h = cudaService->make_host_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
+  GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
+  assert(0 == error_h->size());
+  assert(static_cast<int>(maxFedWords) == error_h->capacity());
+  // Copy the freshly built header from error_h into error_d.
+  cudautils::copyAsync(error_d, error_h, stream);
+}
+
+void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) {
+  cudautils::copyAsync(error_h, error_d, stream);  // refresh error_h from error_d; only the SimpleVector header, not its elements
+}
+
+SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const {
+  edm::Service<CUDAService> cudaService;
+  // size() and capacity() are read from error_h on the host, so the result reflects
+  // whatever error_h holds when this is called. The buffer gets capacity() elements
+  // so that a copy of the returned SimpleVector still has the space it advertises.
+  auto hostBuffer = cudaService->make_host_unique<PixelErrorCompact[]>(error_h->capacity(), stream);
+
+  // only the size() elements in use are actually transferred
+  if(error_h->size() > 0) {
+    cudautils::copyAsync(hostBuffer, data_d, error_h->size(), stream);
+  }
+  auto hostVector = *error_h;
+  hostVector.set_data(hostBuffer.get());
+  return HostDataError(std::move(hostVector), std::move(hostBuffer));
+}
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
new file mode 100644
index 0000000000000..ef13ed9612dbf
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
@@ -0,0 +1,56 @@
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cudaService;
+  // Every buffer holds maxFedWords elements; these five are the ones exposed through the view.
+  xx_d              = cudaService->make_device_unique<uint16_t[]>(maxFedWords, stream);
+  yy_d              = cudaService->make_device_unique<uint16_t[]>(maxFedWords, stream);
+  adc_d             = cudaService->make_device_unique<uint16_t[]>(maxFedWords, stream);
+  moduleInd_d       = cudaService->make_device_unique<uint16_t[]>(maxFedWords, stream);
+  clus_d            = cudaService->make_device_unique< int32_t[]>(maxFedWords, stream);
+  // These two are not referenced by the view.
+  pdigi_d           = cudaService->make_device_unique<uint32_t[]>(maxFedWords, stream);
+  rawIdArr_d        = cudaService->make_device_unique<uint32_t[]>(maxFedWords, stream);
+  // Gather the five raw buffer addresses in a host-side DeviceConstView.
+  auto hostView = cudaService->make_host_unique<DeviceConstView>(stream);
+  hostView->xx_ = xx_d.get();
+  hostView->yy_ = yy_d.get();
+  hostView->adc_ = adc_d.get();
+  hostView->moduleInd_ = moduleInd_d.get();
+  hostView->clus_ = clus_d.get();
+  // Allocate the view held by view_d and copy the host-side view into it via copyAsync.
+  view_d = cudaService->make_device_unique<DeviceConstView>(stream);
+  cudautils::copyAsync(view_d, hostView, stream);
+}
+
+cudautils::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const {
+  edm::Service<CUDAService> cudaService;
+  auto hostAdc = cudaService->make_host_unique<uint16_t[]>(nDigis(), stream);  // room for nDigis() values
+  cudautils::copyAsync(hostAdc, adc_d, nDigis(), stream);  // source is the adc_d buffer
+  return hostAdc;
+}
+
+cudautils::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const {
+  edm::Service<CUDAService> cudaService;
+  auto hostClus = cudaService->make_host_unique<int32_t[]>(nDigis(), stream);  // room for nDigis() values
+  cudautils::copyAsync(hostClus, clus_d, nDigis(), stream);  // source is the clus_d buffer
+  return hostClus;
+}
+
+cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const {
+  edm::Service<CUDAService> cudaService;
+  auto hostPdigi = cudaService->make_host_unique<uint32_t[]>(nDigis(), stream);  // room for nDigis() values
+  cudautils::copyAsync(hostPdigi, pdigi_d, nDigis(), stream);  // source is the pdigi_d buffer
+  return hostPdigi;
+}
+
+cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const {
+  edm::Service<CUDAService> cudaService;
+  auto hostRawIds = cudaService->make_host_unique<uint32_t[]>(nDigis(), stream);  // room for nDigis() values
+  cudautils::copyAsync(hostRawIds, rawIdArr_d, nDigis(), stream);  // source is the rawIdArr_d buffer
+  return hostRawIds;
+}
diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h
new file mode 100644
index 0000000000000..41b135640b883
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/src/classes.h
@@ -0,0 +1,9 @@
+#ifndef CUDADataFormats_SiPixelDigi_classes_h
+#define CUDADataFormats_SiPixelDigi_classes_h
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml
new file mode 100644
index 0000000000000..9d6816ed3b14c
--- /dev/null
+++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml
@@ -0,0 +1,7 @@
+<lcgdict>
+  <class name="CUDAProduct<SiPixelDigisCUDA>" persistent="false"/>
+  <class name="edm::Wrapper<CUDAProduct<SiPixelDigisCUDA>>" persistent="false"/>
+
+  <class name="CUDAProduct<SiPixelDigiErrorsCUDA>" persistent="false"/>
+  <class name="edm::Wrapper<CUDAProduct<SiPixelDigiErrorsCUDA>>" persistent="false"/>
+</lcgdict>
diff --git a/CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h b/CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h
new file mode 100644
index 0000000000000..afb682e5d451f
--- /dev/null
+++ b/CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h
@@ -0,0 +1,14 @@
+#ifndef CalibTracker_Records_SiPixelGainCalibrationForHLTGPURcd_h
+#define CalibTracker_Records_SiPixelGainCalibrationForHLTGPURcd_h
+
+#include "FWCore/Framework/interface/EventSetupRecordImplementation.h"
+#include "FWCore/Framework/interface/DependentRecordImplementation.h"
+
+#include "CondFormats/DataRecord/interface/SiPixelGainCalibrationForHLTRcd.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+
+#include "boost/mpl/vector.hpp"
+
+class SiPixelGainCalibrationForHLTGPURcd : public edm::eventsetup::DependentRecordImplementation<SiPixelGainCalibrationForHLTGPURcd, boost::mpl::vector<SiPixelGainCalibrationForHLTRcd, TrackerDigiGeometryRecord> > {};  // dependent record; its dependency list names the HLT gain-calibration and tracker digi-geometry records
+
+#endif
diff --git a/CalibTracker/Records/src/SiPixelGainCalibrationForHLTGPURcd.cc b/CalibTracker/Records/src/SiPixelGainCalibrationForHLTGPURcd.cc
new file mode 100644
index 0000000000000..e6020eca80b1f
--- /dev/null
+++ b/CalibTracker/Records/src/SiPixelGainCalibrationForHLTGPURcd.cc
@@ -0,0 +1,5 @@
+#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+
+EVENTSETUP_RECORD_REG(SiPixelGainCalibrationForHLTGPURcd);
diff --git a/CalibTracker/SiPixelESProducers/BuildFile.xml b/CalibTracker/SiPixelESProducers/BuildFile.xml
index 6efeef5ca0d1c..69d258da21ed1 100644
--- a/CalibTracker/SiPixelESProducers/BuildFile.xml
+++ b/CalibTracker/SiPixelESProducers/BuildFile.xml
@@ -1,10 +1,15 @@
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
-<use name="FWCore/MessageLogger"/>
-<use name="CondFormats/DataRecord"/>
-<use name="CondFormats/SiPixelObjects"/>
-<use name="DataFormats/Common"/>
-<use name="DataFormats/SiPixelDigi"/>
+<use   name="FWCore/Framework"/>
+<use   name="FWCore/ParameterSet"/>
+<use   name="FWCore/MessageLogger"/>
+<use   name="CondFormats/DataRecord"/>
+<use   name="CondFormats/SiPixelObjects"/>
+<use   name="DataFormats/Common"/>
+<use   name="DataFormats/SiPixelDigi"/>
+<use   name="CalibTracker/Records"/>
+<use   name="MagneticField/VolumeBasedEngine"/>
+<use   name="HeterogeneousCore/CUDACore"/>
+<use   name="boost"/>
+<use   name="cuda-api-wrappers"/>
 <export>
-  <lib name="1"/>
+  <lib   name="1"/>
 </export>
diff --git a/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h b/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h
new file mode 100644
index 0000000000000..96989c8a2c3b2
--- /dev/null
+++ b/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h
@@ -0,0 +1,32 @@
+#ifndef CalibTracker_SiPixelESProducers_SiPixelGainCalibrationForHLTGPU_H
+#define CalibTracker_SiPixelESProducers_SiPixelGainCalibrationForHLTGPU_H
+
+#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"
+
+#include <cuda/api_wrappers.h>
+
+class SiPixelGainCalibrationForHLT;
+class SiPixelGainForHLTonGPU;
+struct SiPixelGainForHLTonGPU_DecodingStructure;
+class TrackerGeometry;
+
+class SiPixelGainCalibrationForHLTGPU {  // wraps a SiPixelGainCalibrationForHLT and provides a GPU-side copy of it
+public:
+  explicit SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains, const TrackerGeometry& geom);
+  ~SiPixelGainCalibrationForHLTGPU();
+  // NOTE(review): owns raw CUDA allocations yet declares no copy operations — confirm instances are never copied.
+  const SiPixelGainForHLTonGPU *getGPUProductAsync(cuda::stream_t<>& cudaStream) const;
+  // getGPUProductAsync() returns the gainForHLTonGPU pointer of the GPUData selected through gpuData_.
+private:
+  const SiPixelGainCalibrationForHLT *gains_ = nullptr;  // address of the constructor argument; that object must outlive this one
+  SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr;   // allocated with cudaMallocHost, released with cudaFreeHost
+  struct GPUData {
+    ~GPUData();  // calls cudaFree on both pointers
+    SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr;
+    SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr;
+  };
+  CUDAESProduct<GPUData> gpuData_;
+};
+
+#endif
diff --git a/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml b/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml
index 5380c9d7d346b..b33657e273036 100644
--- a/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml
+++ b/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml
@@ -1,13 +1,13 @@
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
-<use name="CondFormats/DataRecord"/>
-<use name="CondFormats/SiPixelObjects"/>
-<use name="CondFormats/SiStripObjects"/>
-<use name="Geometry/Records"/>
-<use name="Geometry/TrackerGeometryBuilder"/>
-<use name="CalibTracker/Records"/>
-<use name="CalibTracker/SiPixelESProducers"/>
-<use name="MagneticField/Engine"/>
-<library file="*.cc" name="CalibTrackerSiPixelESProducersPlugins">
-  <flags EDM_PLUGIN="1"/>
+<use   name="FWCore/Framework"/>
+<use   name="FWCore/ParameterSet"/>
+<use   name="CondFormats/DataRecord"/>
+<use   name="CondFormats/SiPixelObjects"/>
+<use   name="CondFormats/SiStripObjects"/>
+<use   name="Geometry/Records"/>
+<use   name="Geometry/TrackerGeometryBuilder"/>
+<use   name="CalibTracker/SiPixelESProducers"/>
+<use   name="HeterogeneousCore/CUDACore"/>
+<use   name="cuda-api-wrappers"/>
+<library   file="*.cc" name="CalibTrackerSiPixelESProducersPlugins">
+  <flags   EDM_PLUGIN="1"/>
 </library>
diff --git a/CalibTracker/SiPixelESProducers/plugins/SiPixelGainCalibrationForHLTGPUESProducer.cc b/CalibTracker/SiPixelESProducers/plugins/SiPixelGainCalibrationForHLTGPUESProducer.cc
new file mode 100644
index 0000000000000..186bb2d72c3f3
--- /dev/null
+++ b/CalibTracker/SiPixelESProducers/plugins/SiPixelGainCalibrationForHLTGPUESProducer.cc
@@ -0,0 +1,47 @@
+#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h"
+#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"
+#include "CondFormats/DataRecord/interface/SiPixelGainCalibrationForHLTRcd.h"
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+
+#include <memory>
+
+class SiPixelGainCalibrationForHLTGPUESProducer: public edm::ESProducer {  // ESProducer building SiPixelGainCalibrationForHLTGPU
+public:
+  explicit SiPixelGainCalibrationForHLTGPUESProducer(const edm::ParameterSet& iConfig);
+  std::unique_ptr<SiPixelGainCalibrationForHLTGPU> produce(const SiPixelGainCalibrationForHLTGPURcd& iRecord);
+  // Adds an empty parameter description; the producer reads no configuration parameters.
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+private:
+};
+
+SiPixelGainCalibrationForHLTGPUESProducer::SiPixelGainCalibrationForHLTGPUESProducer(const edm::ParameterSet& iConfig) {
+  setWhatProduced(this);  // iConfig is not read; presumably this registers produce() with the framework — confirm
+}
+
+void SiPixelGainCalibrationForHLTGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;  // left empty: no parameters are declared
+  descriptions.add("siPixelGainCalibrationForHLTGPU", desc);  // added under this label
+}
+
+std::unique_ptr<SiPixelGainCalibrationForHLTGPU> SiPixelGainCalibrationForHLTGPUESProducer::produce(const SiPixelGainCalibrationForHLTGPURcd& iRecord) {
+  edm::ESHandle<SiPixelGainCalibrationForHLT> gainsHandle;
+  edm::ESHandle<TrackerGeometry> geometryHandle;
+  // fetch both inputs through the records reachable from iRecord
+  iRecord.getRecord<SiPixelGainCalibrationForHLTRcd>().get(gainsHandle);
+  iRecord.getRecord<TrackerDigiGeometryRecord>().get(geometryHandle);
+
+  return std::make_unique<SiPixelGainCalibrationForHLTGPU>(*gainsHandle, *geometryHandle);
+}
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+
+DEFINE_FWK_EVENTSETUP_MODULE(SiPixelGainCalibrationForHLTGPUESProducer);
diff --git a/CalibTracker/SiPixelESProducers/src/ES_SiPixelGainCalibrationForHLTGPU.cc b/CalibTracker/SiPixelESProducers/src/ES_SiPixelGainCalibrationForHLTGPU.cc
new file mode 100644
index 0000000000000..80932fb468f71
--- /dev/null
+++ b/CalibTracker/SiPixelESProducers/src/ES_SiPixelGainCalibrationForHLTGPU.cc
@@ -0,0 +1,4 @@
+#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+
+TYPELOOKUP_DATA_REG(SiPixelGainCalibrationForHLTGPU);
diff --git a/CalibTracker/SiPixelESProducers/src/SiPixelGainCalibrationForHLTGPU.cc b/CalibTracker/SiPixelESProducers/src/SiPixelGainCalibrationForHLTGPU.cc
new file mode 100644
index 0000000000000..3aef3f44c8f67
--- /dev/null
+++ b/CalibTracker/SiPixelESProducers/src/SiPixelGainCalibrationForHLTGPU.cc
@@ -0,0 +1,98 @@
+#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "Geometry/CommonDetUnit/interface/GeomDetType.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include <cuda.h>
+
+SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains, const TrackerGeometry& geom):
+  gains_(&gains)
+{
+  // bizarre logic (looking for the first strip det), don't ask
+  auto const & dus = geom.detUnits();
+  unsigned m_detectors = dus.size();
+  for(unsigned int i=1;i<7;++i) {
+    if(geom.offsetDU(GeomDetEnumerators::tkDetEnum[i]) != dus.size() &&
+        dus[geom.offsetDU(GeomDetEnumerators::tkDetEnum[i])]->type().isTrackerStrip()) {
+      if(geom.offsetDU(GeomDetEnumerators::tkDetEnum[i]) < m_detectors) m_detectors = geom.offsetDU(GeomDetEnumerators::tkDetEnum[i]);
+    }
+  }
+  // m_detectors is now the smallest offsetDU() among the entries that passed the strip test, or dus.size() if none did
+  /*
+  std::cout << "caching calibs for " << m_detectors << " pixel detectors of size " << gains.data().size() << std::endl;
+  std::cout << "sizes " << sizeof(char) << ' ' << sizeof(uint8_t) << ' ' << sizeof(SiPixelGainForHLTonGPU::DecodingStructure) << std::endl;
+  */
+  // host-side struct allocated with cudaMallocHost; getGPUProductAsync() later copies it to the device
+  cudaCheck(cudaMallocHost((void**) & gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU)));
+  //gainForHLTonHost_->v_pedestals = gainDataOnGPU_; // how to do this?
+
+  // do not read back from the (possibly write-combined) memory buffer
+  auto minPed  = gains.getPedLow();
+  auto maxPed  = gains.getPedHigh();
+  auto minGain = gains.getGainLow();
+  auto maxGain = gains.getGainHigh();
+  auto nBinsToUseForEncoding = 253;
+
+  // we will simplify later (not everything is needed....)
+  gainForHLTonHost_->minPed_ = minPed;
+  gainForHLTonHost_->maxPed_ = maxPed;
+  gainForHLTonHost_->minGain_= minGain;
+  gainForHLTonHost_->maxGain_= maxGain;
+
+  gainForHLTonHost_->numberOfRowsAveragedOver_ = 80;
+  gainForHLTonHost_->nBinsToUseForEncoding_    = nBinsToUseForEncoding;
+  gainForHLTonHost_->deadFlag_                 = 255;
+  gainForHLTonHost_->noisyFlag_                = 254;
+  // precisions come from the local copies above, not from fields of the host struct
+  gainForHLTonHost_->pedPrecision  = static_cast<float>(maxPed - minPed) / nBinsToUseForEncoding;
+  gainForHLTonHost_->gainPrecision = static_cast<float>(maxGain - minGain) / nBinsToUseForEncoding;
+
+  /*
+  std::cout << "precisions g " << gainForHLTonHost_->pedPrecision << ' ' << gainForHLTonHost_->gainPrecision << std::endl;
+  */
+
+  // fill the index map
+  auto const & ind = gains.getIndexes();
+  /*
+  std::cout << ind.size() << " " << m_detectors << std::endl;
+  */
+  // for each of the first m_detectors det units: look up its index entry by raw id, check it, store range and ncols
+  for (auto i=0U; i<m_detectors; ++i) {
+    auto p = std::lower_bound(ind.begin(),ind.end(),dus[i]->geographicalId().rawId(),SiPixelGainCalibrationForHLT::StrictWeakOrdering());
+    assert (p!=ind.end() && p->detid==dus[i]->geographicalId());
+    assert(p->iend<=gains.data().size());
+    assert(p->iend>=p->ibegin);
+    assert(0==p->ibegin%2);
+    assert(0==p->iend%2);
+    assert(p->ibegin!=p->iend);
+    assert(p->ncols>0);
+    gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(p->ibegin,p->iend), p->ncols);
+    // if (ind[i].detid!=dus[i]->geographicalId()) std::cout << ind[i].detid<<"!="<<dus[i]->geographicalId() << std::endl;
+    // gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(ind[i].ibegin,ind[i].iend), ind[i].ncols);
+  }
+  // NOTE(review): the capacity of rangeAndCols is not visible here — confirm it can hold m_detectors entries
+}
+
+SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() {
+  cudaCheck(cudaFreeHost(gainForHLTonHost_));  // releases the buffer obtained with cudaMallocHost in the constructor
+}
+
+SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() {
+  cudaCheck(cudaFree(gainForHLTonGPU));  // both pointers default to nullptr and are set by cudaMalloc in getGPUProductAsync()
+  cudaCheck(cudaFree(gainDataOnGPU));
+}
+
+const SiPixelGainForHLTonGPU *SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cuda::stream_t<>& cudaStream) const {
+  const auto& deviceData = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& gpu, cuda::stream_t<>& fillStream) {
+      cudaCheck(cudaMalloc((void**) & gpu.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU)));
+      cudaCheck(cudaMalloc((void**) & gpu.gainDataOnGPU, gains_->data().size())); // TODO: this could be changed to cuda::memory::device::unique_ptr<>
+      // the source buffer gains_->data().data() is shared with non-GPU code, so it cannot live in aligned, write-combined memory
+      cudaCheck(cudaMemcpyAsync(gpu.gainDataOnGPU, gains_->data().data(), gains_->data().size(), cudaMemcpyDefault, fillStream.id()));
+      // copy the host struct, then overwrite its v_pedestals member with the address of the gain data allocated above
+      cudaCheck(cudaMemcpyAsync(gpu.gainForHLTonGPU, gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, fillStream.id()));
+      cudaCheck(cudaMemcpyAsync(&(gpu.gainForHLTonGPU->v_pedestals), &(gpu.gainDataOnGPU), sizeof(SiPixelGainForHLTonGPU_DecodingStructure*), cudaMemcpyDefault, fillStream.id()));
+    });
+  return deviceData.gainForHLTonGPU;
+}
diff --git a/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCRandom_cff.py b/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCRandom_cff.py
index 8185c8cfbb089..9161b2152ade7 100644
--- a/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCRandom_cff.py
+++ b/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCRandom_cff.py
@@ -19,8 +19,8 @@
 
 from Calibration.LumiAlCaRecoProducers.alcaPCCProducer_cfi import alcaPCCProducer
 alcaPCCProducerRandom = alcaPCCProducer.clone()
-alcaPCCProducerRandom.pixelClusterLabel = cms.InputTag("siPixelClustersForLumiR")
-alcaPCCProducerRandom.trigstring        = cms.untracked.string("alcaPCCRandom")
+alcaPCCProducerRandom.AlcaPCCProducerParameters.pixelClusterLabel = cms.InputTag("siPixelClustersForLumiR")
+alcaPCCProducerRandom.AlcaPCCProducerParameters.trigstring        = cms.untracked.string("alcaPCCRandom")
 
 # Sequence #
 seqALCARECOAlCaPCCRandom = cms.Sequence(ALCARECORandomHLT + siPixelDigisForLumiR + siPixelClustersForLumiR + alcaPCCProducerRandom)
diff --git a/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCZeroBias_cff.py b/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCZeroBias_cff.py
index 0ef9e074cc817..6a66256ae72ca 100644
--- a/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCZeroBias_cff.py
+++ b/Calibration/LumiAlCaRecoProducers/python/ALCARECOAlCaPCCZeroBias_cff.py
@@ -19,8 +19,8 @@
 
 from Calibration.LumiAlCaRecoProducers.alcaPCCProducer_cfi import alcaPCCProducer
 alcaPCCProducerZeroBias = alcaPCCProducer.clone()
-alcaPCCProducerZeroBias.pixelClusterLabel = cms.InputTag("siPixelClustersForLumiZB")
-alcaPCCProducerZeroBias.trigstring        = cms.untracked.string("alcaPCCZeroBias")
+alcaPCCProducerZeroBias.AlcaPCCProducerParameters.pixelClusterLabel = cms.InputTag("siPixelClustersForLumiZB")
+alcaPCCProducerZeroBias.AlcaPCCProducerParameters.trigstring        = cms.untracked.string("alcaPCCZeroBias")
 
 # Sequence #
 seqALCARECOAlCaPCCZeroBias = cms.Sequence(ALCARECOZeroBiasHLT + siPixelDigisForLumiZB + siPixelClustersForLumiZB + alcaPCCProducerZeroBias)
diff --git a/CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h b/CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h
new file mode 100644
index 0000000000000..931ee7e65f295
--- /dev/null
+++ b/CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h
@@ -0,0 +1,73 @@
+#ifndef CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
+#define CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
+
+#include <cstdint>
+#include <cstdio>
+#include <tuple>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+struct SiPixelGainForHLTonGPU_DecodingStructure{  // one payload entry: two encoded bytes
+  uint8_t gain;  // encoded gain; turned into a float by SiPixelGainForHLTonGPU::decodeGain
+  uint8_t ped;  // encoded pedestal; turned into a float by decodePed and compared against deadFlag_ / noisyFlag_
+};
+
+
+// copy of SiPixelGainCalibrationForHLT
+class SiPixelGainForHLTonGPU {
+  // Holds the encoded pedestal/gain payload pointer and the per-module lookup table, plus the helpers that decode them.
+ public:
+  // Payload element type: two encoded bytes (gain, ped) per entry.
+  using DecodingStructure = SiPixelGainForHLTonGPU_DecodingStructure;
+  // Pair of payload offsets (first, second) delimiting one module's data; getPedAndGain asserts offset < second.
+  using Range = std::pair<uint32_t,uint32_t>;
+  // Look up the calibration entry for (moduleInd, col, row) and return it decoded as (pedestal, gain).
+  // isDeadColumn / isNoisyColumn are outputs: set to whether the encoded pedestal equals deadFlag_ / noisyFlag_.
+  inline __host__ __device__
+  std::pair<float,float> getPedAndGain(uint32_t moduleInd, int col, int row, bool& isDeadColumn, bool& isNoisyColumn ) const {
+    // NOTE(review): moduleInd indexes rangeAndCols[2000] without a bounds check, and nCols and
+    // numberOfRowsAveragedOver_ are used as divisors unchecked -- callers presumably guarantee valid values, confirm.
+    auto range = rangeAndCols[moduleInd].first;
+    auto nCols = rangeAndCols[moduleInd].second;
+
+    // determine which averaged data block we are in (there should be 1 or 2 of these, depending on whether the plaquette is 1 by X or 2 by X)
+    unsigned int lengthOfColumnData  = (range.second-range.first)/nCols;
+    unsigned int lengthOfAveragedDataInEachColumn = 2;  // we always only have two values per column averaged block
+    unsigned int numberOfDataBlocksToSkip = row / numberOfRowsAveragedOver_;
+    // Offset of the selected (gain, ped) pair: module start + column stride + averaged-row-block stride.
+    // It is halved when indexing below, since each DecodingStructure entry holds two bytes -- offset unit is presumably bytes, confirm.
+    auto offset = range.first + col*lengthOfColumnData + lengthOfAveragedDataInEachColumn*numberOfDataBlocksToSkip;
+    // Sanity checks; NOTE(review): 3088384 is an unexplained bound, presumably the maximum payload size -- confirm.
+    assert(offset<range.second);
+    assert(offset<3088384);
+    assert(0==offset%2);
+
+    DecodingStructure const * __restrict__ lp = v_pedestals;
+    auto s = lp[offset/2];
+    // The "& 0xFF" masks below do not change the value: ped and gain are uint8_t fields.
+    isDeadColumn = (s.ped & 0xFF) == deadFlag_;
+    isNoisyColumn = (s.ped & 0xFF) == noisyFlag_;
+
+    return std::make_pair(decodePed(s.ped & 0xFF),decodeGain(s.gain & 0xFF));
+
+  }
+
+  // Linear decoding of an encoded byte: value * precision + minimum.
+  // NOTE(review): not marked __host__ __device__; device-side use presumably relies on nvcc's relaxed constexpr support -- confirm build flags.
+  constexpr float decodeGain(unsigned int gain) const {return gain*gainPrecision + minGain_;}
+  constexpr float decodePed (unsigned int ped) const { return ped*pedPrecision + minPed_;}
+  // Encoded payload; SiPixelGainCalibrationForHLTGPU::getGPUProductAsync patches this pointer in the device copy to reference the device buffer.
+  DecodingStructure * v_pedestals;
+  std::pair<Range, int> rangeAndCols[2000];  // per module: (payload Range, number of columns), as read by getPedAndGain
+  // Minimum values are the additive terms of the linear decoding; maxPed_ / maxGain_ are not read in this header.
+  float  minPed_, maxPed_, minGain_, maxGain_;
+
+  float pedPrecision, gainPrecision;
+
+  unsigned int numberOfRowsAveragedOver_; // this is 80!!!!
+  unsigned int nBinsToUseForEncoding_;
+  unsigned int deadFlag_;
+  unsigned int noisyFlag_;
+};
+
+#endif // CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h
diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py
index d0af52de00a47..ed10c78a40c9b 100644
--- a/Configuration/StandardSequences/python/RawToDigi_cff.py
+++ b/Configuration/StandardSequences/python/RawToDigi_cff.py
@@ -3,13 +3,14 @@
 # This object is used to selectively make changes for different running
 # scenarios. In this case it makes changes for Run 2.
 
-from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import *
+from EventFilter.SiPixelRawToDigi.siPixelDigis_cff import *
 
 from EventFilter.SiStripRawToDigi.SiStripDigis_cfi import *
 
 from SimCalorimetry.EcalTrigPrimProducers.ecalTriggerPrimitiveDigis_cff import *
 
-from EventFilter.EcalRawToDigi.ecalDigis_cff import *
+import EventFilter.EcalRawToDigi.EcalUnpackerData_cfi
+ecalDigis = EventFilter.EcalRawToDigi.EcalUnpackerData_cfi.ecalEBunpacker.clone()
 
 import EventFilter.ESRawToDigi.esRawToDigi_cfi
 ecalPreshowerDigis = EventFilter.ESRawToDigi.esRawToDigi_cfi.esRawToDigi.clone()
@@ -23,8 +24,11 @@
 import EventFilter.DTRawToDigi.dtunpacker_cfi
 muonDTDigis = EventFilter.DTRawToDigi.dtunpacker_cfi.muonDTDigis.clone()
 
-import EventFilter.RPCRawToDigi.RPCRawToDigi_cfi 
-muonRPCDigis = EventFilter.RPCRawToDigi.RPCRawToDigi_cfi.muonRPCDigis.clone()
+import EventFilter.RPCRawToDigi.rpcUnpacker_cfi
+muonRPCDigis = EventFilter.RPCRawToDigi.rpcUnpacker_cfi.rpcunpacker.clone()
+
+import EventFilter.RPCRawToDigi.rpcDigiMerger_cfi
+muonRPCNewDigis = EventFilter.RPCRawToDigi.rpcDigiMerger_cfi.rpcDigiMerger.clone()
 
 import EventFilter.GEMRawToDigi.muonGEMDigis_cfi
 muonGEMDigis = EventFilter.GEMRawToDigi.muonGEMDigis_cfi.muonGEMDigis.clone()
@@ -45,9 +49,9 @@
 from EventFilter.CTPPSRawToDigi.ctppsRawToDigi_cff import *
 
 RawToDigiTask = cms.Task(L1TRawToDigiTask,
-                         siPixelDigis,
+                         siPixelDigisTask,
                          siStripDigis,
-                         ecalDigisTask,
+                         ecalDigis,
                          ecalPreshowerDigis,
                          hcalDigis,
                          muonCSCDigis,
@@ -60,20 +64,15 @@
                          )
 RawToDigi = cms.Sequence(RawToDigiTask)
 
-RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigis, siStripDigis])
+RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigisTask, siStripDigis])
 RawToDigi_noTk = cms.Sequence(RawToDigiTask_noTk)
 
-RawToDigiTask_pixelOnly = cms.Task(siPixelDigis)
+RawToDigiTask_pixelOnly = cms.Task(siPixelDigisTask)
 RawToDigi_pixelOnly = cms.Sequence(RawToDigiTask_pixelOnly)
 
-RawToDigiTask_ecalOnly = cms.Task(ecalDigisTask, ecalPreshowerDigis, scalersRawToDigi)
-RawToDigi_ecalOnly = cms.Sequence(RawToDigiTask_ecalOnly)
-
-RawToDigiTask_hcalOnly = cms.Task(hcalDigis)
-RawToDigi_hcalOnly = cms.Sequence(RawToDigiTask_hcalOnly)
-
 scalersRawToDigi.scalersInputTag = 'rawDataCollector'
-siPixelDigis.InputLabel = 'rawDataCollector'
+siPixelDigis.cpu.InputLabel = 'rawDataCollector'
+# ecalDigis.DoRegional is False by default anyway, so it is not set here
 ecalDigis.InputLabel = 'rawDataCollector'
 ecalPreshowerDigis.sourceTag = 'rawDataCollector'
 hcalDigis.InputLabel = 'rawDataCollector'
@@ -117,6 +116,19 @@
 from Configuration.Eras.Modifier_phase2_hgcal_cff import phase2_hgcal
 phase2_hgcal.toReplaceWith(RawToDigiTask,_hgcal_RawToDigiTask)
 
+# RPC New Readout Validation
+from Configuration.Eras.Modifier_stage2L1Trigger_2017_cff import stage2L1Trigger_2017
+_rpc_NewReadoutVal_RawToDigiTask = RawToDigiTask.copy()
+_rpc_NewReadoutVal_RawToDigiTask_noTk = RawToDigiTask_noTk.copy()
+_rpc_NewReadoutVal_RawToDigiTask.add(muonRPCNewDigis)
+_rpc_NewReadoutVal_RawToDigiTask_noTk.add(muonRPCNewDigis)
+stage2L1Trigger_2017.toReplaceWith(RawToDigiTask, _rpc_NewReadoutVal_RawToDigiTask)
+stage2L1Trigger_2017.toReplaceWith(RawToDigiTask_noTk, _rpc_NewReadoutVal_RawToDigiTask)
+
+from Configuration.Eras.Modifier_fastSim_cff import fastSim
+fastSim.toReplaceWith(RawToDigiTask, RawToDigiTask.copyAndExclude([muonRPCNewDigis]))
+fastSim.toReplaceWith(RawToDigiTask_noTk, RawToDigiTask_noTk.copyAndExclude([muonRPCNewDigis]))
+
 _hfnose_RawToDigiTask = RawToDigiTask.copy()
 _hfnose_RawToDigiTask.add(hfnoseDigis)
 
diff --git a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py
index 4f6ad7bd592ba..5bc3b2bf8af63 100644
--- a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py
@@ -1,15 +1,9 @@
 from __future__ import print_function
 import FWCore.ParameterSet.Config as cms
+from Configuration.StandardSequences.Eras import eras
 
-# Define here the BeamSpotOnline record name,
-# it will be used both in BeamMonitor setup and in payload creation/upload
-BSOnlineRecordName = 'BeamSpotOnlineLegacyObjectsRcd'
-
-#from Configuration.Eras.Era_Run2_2018_cff import Run2_2018
-#process = cms.Process("BeamMonitor", Run2_2018) FIXME
-import sys
-from Configuration.Eras.Era_Run2_2018_pp_on_AA_cff import Run2_2018_pp_on_AA
-process = cms.Process("BeamMonitor", Run2_2018_pp_on_AA)
+#process = cms.Process("BeamMonitor", eras.Run2_2018) FIXME
+process = cms.Process("BeamMonitor", eras.Run2_2018_pp_on_AA)
 
 #
 process.MessageLogger = cms.Service("MessageLogger",
@@ -22,29 +16,13 @@
 
 # switch
 live = True # FIXME
-unitTest = False
-
-if 'unitTest=True' in sys.argv:
-    live=False
-    unitTest=True
-
-# Switch to veto the upload of the BeamSpot conditions to the DB
-# when False it performs the upload
-noDB = True
-if 'noDB=False' in sys.argv:
-    noDB=False
 
 #---------------
 # Input sources
-if unitTest:
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-elif live:
+if (live):
     process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
 else:
     process.load("DQM.Integration.config.fileinputsource_cfi")
-    from DQM.Integration.config.fileinputsource_cfi import options
 
 #--------------------------
 # HLT Filter
@@ -57,9 +35,6 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder = 'BeamMonitor'
 process.dqmSaver.tag           = 'BeamMonitor'
-process.dqmSaver.runNumber     = options.runNumber
-process.dqmSaverPB.tag         = 'BeamMonitor'
-process.dqmSaverPB.runNumber   = options.runNumber
 
 process.dqmEnvPixelLess = process.dqmEnv.clone()
 process.dqmEnvPixelLess.subSystemFolder = 'BeamMonitor_PixelLess'
@@ -73,7 +48,7 @@
     from Configuration.AlCa.GlobalTag import GlobalTag as gtCustomise
     process.GlobalTag = gtCustomise(process.GlobalTag, 'auto:run2_data', '')
     # you may need to set manually the GT in the line below
-    #process.GlobalTag.globaltag = '100X_upgrade2018_realistic_v10'
+    process.GlobalTag.globaltag = '100X_upgrade2018_realistic_v10'
 
 #----------------------------
 # BeamMonitor
@@ -231,7 +206,7 @@
 
 #
 process.dqmcommon = cms.Sequence(process.dqmEnv
-                               * process.dqmSaver*process.dqmSaverPB)
+                               * process.dqmSaver)
 
 #
 process.monitor = cms.Sequence(process.dqmBeamMonitor
@@ -249,8 +224,7 @@
 process.dqmBeamSpotProblemMonitor.pixelTracks  = 'pixelTracks'
 
 #
-from DQMServices.Core.DQMQualityTester import DQMQualityTester
-process.qTester = DQMQualityTester(
+process.qTester = cms.EDAnalyzer("QualityTester",
     qtList = cms.untracked.FileInPath('DQM/BeamMonitor/test/BeamSpotAvailableTest.xml'),
     prescaleFactor = cms.untracked.int32(1),                               
     qtestOnEndLumi = cms.untracked.bool(True),
@@ -297,7 +271,6 @@
 process.load("RecoVertex.BeamSpotProducer.BeamSpot_cfi")
 
 process.dqmBeamMonitor.OnlineMode = True
-process.dqmBeamMonitor.recordName = BSOnlineRecordName
 
 process.dqmBeamMonitor.resetEveryNLumi   = 5 # was 10 for HI
 process.dqmBeamMonitor.resetPVEveryNLumi = 5 # was 10 for HI
@@ -350,40 +323,6 @@
 
 process.dqmBeamMonitor.hltResults = cms.InputTag("TriggerResults","","HLT")
 
-#---------
-# Upload BeamSpotOnlineObject (LegacyRcd) to CondDB
-process.OnlineDBOutputService = cms.Service("OnlineDBOutputService",
-
-    DBParameters = cms.PSet(
-                            messageLevel = cms.untracked.int32(0),
-                            authenticationPath = cms.untracked.string('.')
-                           ),
-
-    # Upload to CondDB
-    connect = cms.string('oracle://cms_orcoff_prep/CMS_CONDITIONS'),
-    preLoadConnectionString = cms.untracked.string('frontier://FrontierPrep/CMS_CONDITIONS'),
-
-    runNumber = cms.untracked.uint64(options.runNumber),
-    lastLumiFile = cms.untracked.string(''),
-    writeTransactionDelay = cms.untracked.uint32(options.transDelay),
-    latency = cms.untracked.uint32(2),
-    autoCommit = cms.untracked.bool(True),
-    saveLogsOnDB = cms.untracked.bool(True),
-    jobName = cms.untracked.string("BeamSpotOnlineLegacyTest"), # name of the DB log record
-    toPut = cms.VPSet(cms.PSet(
-        record = cms.string(BSOnlineRecordName),
-        tag = cms.string('BSOnlineLegacy_tag'),
-        timetype = cms.untracked.string('Lumi'),
-        onlyAppendUpdatePolicy = cms.untracked.bool(True)
-    ))
-)
-
-# If not live or noDB: produce a (local) SQLITE file
-if not live or noDB:
-    process.OnlineDBOutputService.connect = cms.string('sqlite_file:BeamSpotOnlineLegacy.db')
-    process.OnlineDBOutputService.preLoadConnectionString = cms.untracked.string('sqlite_file:BeamSpotOnlineLegacy.db')
-    process.OnlineDBOutputService.saveLogsOnDB = cms.untracked.bool(False)
-
 #---------
 # Final path
 if (not process.runType.getRunType() == process.runType.hi_run):
diff --git a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py
index 75f0545a5c5ba..7b4115f74ad84 100644
--- a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py
@@ -1,27 +1,17 @@
 from __future__ import print_function
 import FWCore.ParameterSet.Config as cms
+from Configuration.StandardSequences.Eras import eras
 
-import sys
-from Configuration.Eras.Era_Run2_2018_cff import Run2_2018
-process = cms.Process("BeamPixel", Run2_2018)
-
-unitTest = False
-if 'unitTest=True' in sys.argv:
-    unitTest = True
+process = cms.Process("BeamPixel", eras.Run2_2018)
 
 
 #----------------------------
 # Common for PP and HI running
 #----------------------------
-if unitTest == True:
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-else:
-    process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
 # Use this to run locally (for testing purposes)
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
+# Otherwise use this
+process.load("DQM.Integration.config.inputsource_cfi")
 
 
 #----------------------------
@@ -37,9 +27,7 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder = "BeamPixel"
 process.dqmSaver.tag = "BeamPixel"
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'BeamPixel'
-process.dqmSaverPB.runNumber = options.runNumber
+
 
 #----------------------------
 # Conditions
@@ -63,7 +51,7 @@
 #----------------------------
 # Define Sequences
 #----------------------------
-process.dqmModules  = cms.Sequence(process.dqmEnv + process.dqmSaver + process.dqmSaverPB)
+process.dqmModules  = cms.Sequence(process.dqmEnv + process.dqmSaver)
 process.physTrigger = cms.Sequence(process.hltTriggerTypeFilter)
 
 
diff --git a/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py
index 77b1e6b88b699..e14e1dde331c7 100644
--- a/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py
@@ -1,6 +1,5 @@
 from __future__ import print_function
 import FWCore.ParameterSet.Config as cms
-import sys
 
 process = cms.Process("CSCDQMLIVE")
 
@@ -33,22 +32,11 @@
 #----------------------------
 # Event Source
 #-----------------------------
-
-unitTest=False
-if 'unitTest=True' in sys.argv:
-  unitTest=True
-
-if unitTest:
-  process.load("DQM.Integration.config.unittestinputsource_cfi")
-  from DQM.Integration.config.unittestinputsource_cfi import options
-else:
-  # for live online DQM in P5
-  process.load("DQM.Integration.config.inputsource_cfi")
-  from DQM.Integration.config.inputsource_cfi import options
+# for live online DQM in P5
+process.load("DQM.Integration.config.inputsource_cfi")
 
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 # DQM Environment
@@ -61,10 +49,8 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder    = "CSC"
 process.dqmSaver.tag = "CSC"
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = "CSC"
-process.dqmSaverPB.runNumber = options.runNumber
 
+process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/csc_reference.root'
 
 #process.DQM.collectorHost = 'pccmsdqm02.cern.ch'
 #process.DQM.collectorHost = 'localhost'
@@ -172,8 +158,8 @@
 # Sequences
 #--------------------------
 
-#process.p = cms.Path(process.dqmCSCClient+process.dqmEnv+process.dqmSaver+process.dqmSaverPB)
-process.p = cms.Path(process.dqmCSCClient * process.muonCSCDigis * process.csc2DRecHits * process.cscSegments * process.cscMonitor + process.dqmEnv + process.dqmSaver + process.dqmSaverPB)
+#process.p = cms.Path(process.dqmCSCClient+process.dqmEnv+process.dqmSaver)
+process.p = cms.Path(process.dqmCSCClient * process.muonCSCDigis * process.csc2DRecHits * process.cscSegments * process.cscMonitor + process.dqmEnv + process.dqmSaver)
 
 
 process.castorDigis.InputLabel = cms.InputTag("rawDataCollector")
diff --git a/DQM/Integration/python/clients/fed_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/fed_dqm_sourceclient-live_cfg.py
index 6feff0263b749..db5d48aa3f106 100644
--- a/DQM/Integration/python/clients/fed_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/fed_dqm_sourceclient-live_cfg.py
@@ -1,13 +1,8 @@
 import FWCore.ParameterSet.Config as cms
-import sys
 
 # Process initialization
 process = cms.Process('FED')
 
-unitTest = False
-if 'unitTest=True' in sys.argv:
-    unitTest=True
-
 # Logging:
 process.MessageLogger = cms.Service(
     'MessageLogger',
@@ -23,18 +18,10 @@
 # Global tag:
 process.load('DQM.Integration.config.FrontierCondition_GT_cfi')
 # Input:
-if unitTest:
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-else:
-    process.load('DQM.Integration.config.inputsource_cfi')
-    from DQM.Integration.config.inputsource_cfi import options
+process.load('DQM.Integration.config.inputsource_cfi')
 # Output:
 process.dqmEnv.subSystemFolder = 'FED'
 process.dqmSaver.tag = 'FED'
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'FED'
-process.dqmSaverPB.runNumber = options.runNumber
 
 # Subsystem sequences
 
@@ -75,10 +62,11 @@
 process.load('EventFilter.HcalRawToDigi.HcalRawToDigi_cfi')
 # DT sequence:
 process.load('DQM.DTMonitorModule.dtDataIntegrityTask_EvF_cff')
-process.dtDataIntegrityTask.processingMode = 'SM'
+process.DTDataIntegrityTask.processingMode = 'SM'
 path = 'DT/%s/' % folder_name
-process.dtDataIntegrityTask.fedIntegrityFolder = path
-process.dtDataIntegrityTask.dtFEDlabel     = 'dtunpacker'
+process.DTDataIntegrityTask.fedIntegrityFolder = path
+process.DTDataIntegrityTask.checkUros = True
+process.DTDataIntegrityTask.dtFEDlabel     = 'dtunpacker'
 # RPC sequence:
 process.load('EventFilter.RPCRawToDigi.rpcUnpacker_cfi')
 process.load('DQM.RPCMonitorClient.RPCFEDIntegrity_cfi')
@@ -136,7 +124,7 @@
 			                      + process.hcalDigis
                                   + process.cscDQMEvF
  			                      + process.dtunpacker
-                                  + process.dtDataIntegrityTask
+                                  + process.DTDataIntegrityTask
 			                      + process.rpcunpacker
                                   + process.rpcFEDIntegrity
 
@@ -147,7 +135,6 @@
 process.DQMmodulesPath = cms.Path(
                                     process.dqmEnv
                                   + process.dqmSaver
-                                  + process.dqmSaverPB
                                  )
 
 process.schedule = cms.Schedule(
diff --git a/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py
index 301a79f2b2865..b50793e7a5e78 100644
--- a/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py
@@ -16,11 +16,9 @@
 #
 # for live online DQM in P5
 process.load("DQM.Integration.config.inputsource_cfi")
-from DQM.Integration.config.inputsource_cfi import options
 #
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 # DQM Environment
@@ -28,12 +26,10 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder = 'L1T'
 process.dqmSaver.tag = 'L1T'
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'L1T'
-process.dqmSaverPB.runNumber = options.runNumber
 
 #
 # references needed
+process.DQMStore.referenceFileName = "/dqmdata/dqm/reference/l1t_reference.root"
 
 # Condition for P5 cluster
 process.load("DQM.Integration.config.FrontierCondition_GT_cfi")
@@ -101,8 +97,7 @@
 #
 process.dqmEndPath = cms.EndPath(
                                  process.dqmEnv *
-                                 process.dqmSaver *
-                                 process.dqmSaverPB
+                                 process.dqmSaver
                                  )
 
 #
diff --git a/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py
index 9701c71d14a3c..5ddb7506f1e6e 100644
--- a/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py
@@ -16,11 +16,9 @@
 #
 # for live online DQM in P5
 process.load("DQM.Integration.config.inputsource_cfi")
-from DQM.Integration.config.inputsource_cfi import options
 #
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 # DQM Environment
@@ -32,11 +30,9 @@
 # for local test
 process.dqmEnv.subSystemFolder = 'L1TEMU'
 process.dqmSaver.tag = 'L1TEMU'
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'L1TEMU'
-process.dqmSaverPB.runNumber = options.runNumber
 #
 # no references needed
+# replace DQMStore.referenceFileName = "L1TEMU_reference.root"
 
 #
 # Condition for P5 cluster
@@ -93,7 +89,7 @@
 process.l1EmulatorMonitorClientPath = cms.Path(process.l1EmulatorMonitorClient)
 
 #
-process.l1EmulatorMonitorEndPath = cms.EndPath(process.dqmEnv*process.dqmSaver*process.dqmSaverPB)
+process.l1EmulatorMonitorEndPath = cms.EndPath(process.dqmEnv*process.dqmSaver)
 
 #
 process.valCscTriggerPrimitiveDigis.gangedME1a = cms.untracked.bool(False)
diff --git a/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py
index b8ae0bcf233b4..20371cf54497f 100644
--- a/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py
@@ -16,11 +16,9 @@
 #
 # for live online DQM in P5
 process.load("DQM.Integration.config.inputsource_cfi")
-from DQM.Integration.config.inputsource_cfi import options
 #
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 # DQM Environment
@@ -29,12 +27,10 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder = 'L1TStage1'
 process.dqmSaver.tag = 'L1TStage1'
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'L1TStage1'
-process.dqmSaverPB.runNumber = options.runNumber
 
 #
 # references needed
+process.DQMStore.referenceFileName = "/dqmdata/dqm/reference/l1t_reference.root"
 
 # Condition for P5 cluster
 process.load("DQM.Integration.config.FrontierCondition_GT_cfi")
@@ -106,8 +102,7 @@
 #
 process.dqmEndPath = cms.EndPath(
                                  process.dqmEnv *
-                                 process.dqmSaver *
-                                 process.dqmSaverPB
+                                 process.dqmSaver
                                  )
 
 #
diff --git a/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py
index 6200618c0fe44..c7b24f6137478 100644
--- a/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py
@@ -16,11 +16,9 @@
 #
 # for live online DQM in P5
 process.load("DQM.Integration.config.inputsource_cfi")
-from DQM.Integration.config.inputsource_cfi import options
 #
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 # DQM Environment
@@ -30,12 +28,10 @@
 # for local test
 process.dqmEnv.subSystemFolder = 'L1TEMUStage1'
 process.dqmSaver.tag = 'L1TEMUStage1'
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'L1TEMUStage1'
-process.dqmSaverPB.runNumber = options.runNumber
 
 #
 # no references needed
+# replace DQMStore.referenceFileName = "L1TEMU_reference.root"
 
 #
 # Condition for P5 cluster
@@ -93,7 +89,7 @@
 process.l1EmulatorMonitorClientPath = cms.Path(process.l1EmulatorMonitorClient)
 
 #
-process.l1EmulatorMonitorEndPath = cms.EndPath(process.dqmEnv*process.dqmSaver*process.dqmSaverPB)
+process.l1EmulatorMonitorEndPath = cms.EndPath(process.dqmEnv*process.dqmSaver)
 
 #
 
diff --git a/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py
index c5788228cac1f..a6362a54ffc17 100644
--- a/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py
@@ -1,27 +1,16 @@
 import FWCore.ParameterSet.Config as cms
 
-import sys
-from Configuration.Eras.Era_Run3_cff import Run3
-process = cms.Process("L1TStage2DQM", Run3)
-
-unitTest = False
-if 'unitTest=True' in sys.argv:
-    unitTest=True
+from Configuration.StandardSequences.Eras import eras
+process = cms.Process("L1TStage2DQM", eras.Run2_2018)
 
 #--------------------------------------------------
 # Event Source and Condition
 
-if unitTest:
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-else:
-    # Live Online DQM in P5
-    process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
+# Live Online DQM in P5
+process.load("DQM.Integration.config.inputsource_cfi")
 
 # # Testing in lxplus
 # process.load("DQM.Integration.config.fileinputsource_cfi")
-# from DQM.Integration.config.fileinputsource_cfi import options
 # process.load("FWCore.MessageLogger.MessageLogger_cfi")
 # process.MessageLogger.cerr.FwkReport.reportEvery = 1
 
@@ -42,11 +31,9 @@
 
 process.dqmEnv.subSystemFolder = "L1T"
 process.dqmSaver.tag = "L1T"
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = "L1T"
-process.dqmSaverPB.runNumber = options.runNumber
+process.DQMStore.referenceFileName = "/dqmdata/dqm/reference/l1t_reference.root"
 
-process.dqmEndPath = cms.EndPath(process.dqmEnv * process.dqmSaver * process.dqmSaverPB)
+process.dqmEndPath = cms.EndPath(process.dqmEnv * process.dqmSaver)
 
 #--------------------------------------------------
 # Standard Unpacking Path
@@ -115,6 +102,7 @@
 
 # Cosmic run
 if (process.runType.getRunType() == process.runType.cosmic_run):
+    process.DQMStore.referenceFileName = "/dqmdata/dqm/reference/l1t_reference_cosmic.root"
     # Remove Quality Tests for L1T Muon Subsystems since they are not optimized yet for cosmics
     process.l1tStage2MonitorClient.remove(process.l1TStage2uGMTQualityTests)
     process.l1tStage2MonitorClient.remove(process.l1TStage2EMTFQualityTests)
@@ -125,6 +113,7 @@
 
 # Heavy-Ion run
 if (process.runType.getRunType() == process.runType.hi_run):
+    process.DQMStore.referenceFileName = "/dqmdata/dqm/reference/l1t_reference_hi.root"
     process.onlineMetaDataDigis.onlineMetaDataInputLabel = cms.InputTag("rawDataRepacker")
     process.onlineMetaDataRawToDigi.onlineMetaDataInputLabel = cms.InputTag("rawDataRepacker")
     process.castorDigis.InputLabel = cms.InputTag("rawDataRepacker")
diff --git a/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py
index 91f1f564366d0..614dbb3cfd59d 100644
--- a/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py
@@ -1,27 +1,16 @@
 import FWCore.ParameterSet.Config as cms
 
-import sys
-from Configuration.Eras.Era_Run3_cff import Run3
-process = cms.Process("L1TStage2EmulatorDQM", Run3)
-
-unitTest = False
-if 'unitTest=True' in sys.argv:
-    unitTest=True
+from Configuration.StandardSequences.Eras import eras
+process = cms.Process("L1TStage2EmulatorDQM", eras.Run2_2018)
 
 #--------------------------------------------------
 # Event Source and Condition
 
-if unitTest:
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-else:
-    # Live Online DQM in P5
-    process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
+# Live Online DQM in P5
+process.load("DQM.Integration.config.inputsource_cfi")
 
 # Testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 # Required to load Global Tag
 process.load("DQM.Integration.config.FrontierCondition_GT_cfi")
@@ -38,14 +27,11 @@
 
 process.dqmEnv.subSystemFolder = "L1TEMU"
 process.dqmSaver.tag = "L1TEMU"
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = "L1TEMU"
-process.dqmSaverPB.runNumber = options.runNumber
+process.DQMStore.referenceFileName = "/dqmdata/dqm/reference/l1temu_reference.root"
 
 process.dqmEndPath = cms.EndPath(
     process.dqmEnv *
-    process.dqmSaver *
-    process.dqmSaverPB
+    process.dqmSaver
 )
 
 #--------------------------------------------------
diff --git a/DQM/Integration/python/clients/lumi_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/lumi_dqm_sourceclient-live_cfg.py
index 47acfcdc471f2..c38b7dd5fe9c0 100644
--- a/DQM/Integration/python/clients/lumi_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/lumi_dqm_sourceclient-live_cfg.py
@@ -8,7 +8,6 @@
 # Event Source
 #----------------------------
 process.load("DQM.Integration.config.inputsource_cfi")
-from DQM.Integration.config.inputsource_cfi import options
 #process.DQMEventStreamHttpReader.consumerName = 'DQM Luminosity Consumer'
 #process.DQMEventStreamHttpReader.SelectHLTOutput = cms.untracked.string('hltOutputALCALUMIPIXELS')
 
@@ -18,9 +17,6 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder    = "Info/Lumi"
 process.dqmSaver.tag = "Lumi"
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = "Lumi"
-process.dqmSaverPB.runNumber = options.runNumber
 
 #---------------------------------------------
 # Global Tag
@@ -67,8 +63,7 @@
 process.dqmmodules = cms.Sequence(process.dqmEnv
                                   + process.expressLumiProducer
                                   + process.dqmLumiMonitor    
-                                  + process.dqmSaver
-                                  + process.dqmSaverPB)
+                                  + process.dqmSaver)
 #----------------------------
 # Proton-Proton Running Stuff
 #----------------------------
diff --git a/DQM/Integration/python/clients/physics_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/physics_dqm_sourceclient-live_cfg.py
index 76b6ae553949c..d7a35b425baab 100644
--- a/DQM/Integration/python/clients/physics_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/physics_dqm_sourceclient-live_cfg.py
@@ -11,11 +11,9 @@
 
 # for live online DQM in P5
 process.load("DQM.Integration.config.inputsource_cfi")
-from DQM.Integration.config.inputsource_cfi import options
 
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 # DQM Environment
@@ -24,9 +22,6 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder = 'Physics'
 process.dqmSaver.tag = 'Physics'
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'Physics'
-process.dqmSaverPB.runNumber = options.runNumber
 
 # 0=random, 1=physics, 2=calibration, 3=technical
 process.hltTriggerTypeFilter = cms.EDFilter("HLTTriggerTypeFilter",
@@ -52,8 +47,7 @@
 #    process.dump *
     process.qcdLowPtDQM *
     process.dqmEnv *
-    process.dqmSaver *
-    process.dqmSaverPB
+    process.dqmSaver
 )
 
 process.siPixelDigis.InputLabel = cms.InputTag("rawDataCollector")
diff --git a/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py
index 5c45b5f46ca9c..ebaa305eb8aae 100644
--- a/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py
@@ -1,17 +1,11 @@
 from __future__ import print_function
 import FWCore.ParameterSet.Config as cms
 
-import sys
-from Configuration.Eras.Era_Run3_cff import Run3
-process = cms.Process("PIXELDQMLIVE", Run3)
+from Configuration.StandardSequences.Eras import eras
 
-live=True
-unitTest = False
-
-if 'unitTest=True' in sys.argv:
-    live=False
-    unitTest=True
+process = cms.Process("PIXELDQMLIVE", eras.Run2_2018_pp_on_AA)
 
+live=True
 #set to false for lxplus offline testing
 #live=False
 offlineTesting=not live
@@ -32,18 +26,12 @@
 #-----------------------------
 # for live online DQM in P5
 
-if (unitTest):
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-
-elif (live):
+if (live):
     process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
 
 # for testing in lxplus
 elif(offlineTesting):
     process.load("DQM.Integration.config.fileinputsource_cfi")
-    from DQM.Integration.config.fileinputsource_cfi import options
 
 #-----------------------------
 # DQM Environment
@@ -59,16 +47,20 @@
 
 process.dqmEnv.subSystemFolder = TAG
 process.dqmSaver.tag = TAG
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = TAG
-process.dqmSaverPB.runNumber = options.runNumber
 
 
+process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/pixel_reference_pp.root'
+#if (process.runType.getRunType() == process.runType.hi_run):
+#    process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/pixel_reference_hi.root'
+
+if (process.runType.getRunType() == process.runType.cosmic_run):
+    process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/pixel_reference_cosmic.root'
+
 #-----------------------------
 # Magnetic Field
 #-----------------------------
 
-process.load('Configuration.StandardSequences.MagneticField_cff')
+process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff')
 
 #-------------------------------------------------
 # GEOMETRY
@@ -117,6 +109,7 @@
     process.siPixelDigis.InputLabel   = cms.InputTag("rawDataCollector")
     process.siStripDigis.InputLabel   = cms.InputTag("rawDataCollector")
 
+
 ## Collision Reconstruction
 process.load("Configuration.StandardSequences.RawToDigi_Data_cff")
 
@@ -165,7 +158,7 @@
 # Scheduling
 #--------------------------
 
-process.DQMmodules = cms.Sequence(process.dqmEnv* process.dqmSaver*process.dqmSaverPB)
+process.DQMmodules = cms.Sequence(process.dqmEnv* process.dqmSaver)
 
 process.RecoForDQM_LocalReco = cms.Sequence(process.siPixelDigis*process.siStripDigis*process.gtDigis*process.trackerlocalreco)
 
@@ -185,7 +178,6 @@
                          ##### TRIGGER SELECTION #####
                          process.hltHighLevel*
                          process.scalersRawToDigi*
-                         process.tcdsDigis*
                          process.APVPhases*
                          process.consecutiveHEs*
                          process.hltTriggerTypeFilter*
@@ -231,7 +223,6 @@
     process.p = cms.Path(
       process.hltHighLevel #trigger selection
      *process.scalersRawToDigi
-     *process.tcdsDigis
      *process.APVPhases
      *process.consecutiveHEs
      *process.Reco
diff --git a/DQM/Integration/python/clients/pixellumi_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/pixellumi_dqm_sourceclient-live_cfg.py
index bd15690ede85a..cdc9302a927d0 100644
--- a/DQM/Integration/python/clients/pixellumi_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/pixellumi_dqm_sourceclient-live_cfg.py
@@ -1,13 +1,8 @@
 from __future__ import print_function
 import FWCore.ParameterSet.Config as cms
-import sys
 
 process = cms.Process("PixelLumiDQM")
 
-unitTest=False
-if 'unitTest=True' in sys.argv:
-    unitTest=True
-
 process.MessageLogger = cms.Service("MessageLogger",
     debugModules = cms.untracked.vstring('siPixelDigis', 
 					 'sipixelEDAClient'),
@@ -18,18 +13,11 @@
 #----------------------------
 # Event Source
 #-----------------------------
-
-if unitTest:
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-else:
-    # for live online DQM in P5
-    process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
+# for live online DQM in P5
+process.load("DQM.Integration.config.inputsource_cfi")
 
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 ##
 #----------------------------
@@ -43,15 +31,13 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder = "PixelLumi"
 process.dqmSaver.tag = "PixelLumi"
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = "PixelLumi"
-process.dqmSaverPB.runNumber = options.runNumber
 
-if not unitTest:
-    process.source.SelectEvents = cms.untracked.vstring("HLT_ZeroBias*","HLT_L1AlwaysTrue*", "HLT_PAZeroBias*", "HLT_PAL1AlwaysTrue*")
+process.source.SelectEvents = cms.untracked.vstring("HLT_ZeroBias*","HLT_L1AlwaysTrue*", "HLT_PAZeroBias*", "HLT_PAL1AlwaysTrue*")
+#process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/pixel_reference_pp.root'
 #if (process.runType.getRunType() == process.runType.hi_run):
+#    process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/pixel_reference_hi.root'
 
-if (process.runType.getRunType() == process.runType.cosmic_run and not unitTest):
+if (process.runType.getRunType() == process.runType.cosmic_run):
     process.source.SelectEvents = cms.untracked.vstring('HLT*SingleMu*')
 
 #----------------------------
@@ -100,8 +86,7 @@
     process.load('Configuration.StandardSequences.RawToDigi_Repacked_cff')
     process.siPixelDigis.InputLabel   = cms.InputTag("rawDataRepacker")
 
-    if not unitTest:
-        process.source.SelectEvents = cms.untracked.vstring('HLT_HIL1MinimumBiasHF2AND*')
+    process.source.SelectEvents = cms.untracked.vstring('HLT_HIL1MinimumBiasHF2AND*')
 
 
 #    process.DQMEventStreamHttpReader.SelectEvents = cms.untracked.PSet(
@@ -134,8 +119,7 @@
 process.Reco = cms.Sequence(process.siPixelDigis*process.siPixelClusters)
 process.DQMmodules = cms.Sequence(process.dqmEnv*
   process.pixel_lumi_dqm*
-  process.dqmSaver*
-  process.dqmSaverPB)
+  process.dqmSaver)
 
 process.p = cms.Path(process.Reco*process.DQMmodules)
 
diff --git a/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py
index 893443f74f1a2..95a91979e9375 100644
--- a/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py
@@ -1,27 +1,15 @@
 import FWCore.ParameterSet.Config as cms
-import sys
 
 process = cms.Process("DQM")
 
-unitTest = False
-if 'unitTest=True' in sys.argv:
-    unitTest=True
-
 #----------------------------
 #### Event Source
 #----------------------------
-
-if unitTest:
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-else:
-    # for live online DQM in P5
-    process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
+# for live online DQM in P5
+process.load("DQM.Integration.config.inputsource_cfi")
 
 # for testing in lxplus
 #process.load("DQM.Integration.config.fileinputsource_cfi")
-#from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 #### DQM Environment
@@ -29,9 +17,6 @@
 process.load("DQM.Integration.config.environment_cfi")
 process.dqmEnv.subSystemFolder = 'Scal'
 process.dqmSaver.tag = 'Scal'
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = 'Scal'
-process.dqmSaverPB.runNumber = options.runNumber
 #-----------------------------
 process.load("DQMServices.Components.DQMScalInfo_cfi")
 
@@ -53,7 +38,7 @@
 import EventFilter.L1GlobalTriggerRawToDigi.l1GtEvmUnpack_cfi
 gtEvmDigis = EventFilter.L1GlobalTriggerRawToDigi.l1GtEvmUnpack_cfi.l1GtEvmUnpack.clone()
 
-if (process.runType.getRunType() == process.runType.pp_run and not unitTest):
+if (process.runType.getRunType() == process.runType.pp_run):
     process.source.SelectEvents = cms.untracked.vstring('HLT_ZeroBias*')
 
 process.physicsBitSelector = cms.EDFilter("PhysDecl",
@@ -74,7 +59,7 @@
 process.dump = cms.EDAnalyzer('EventContentAnalyzer')
 
 # DQM Modules
-process.dqmmodules = cms.Sequence(process.dqmEnv + process.dqmSaver + process.dqmSaverPB)
+process.dqmmodules = cms.Sequence(process.dqmEnv + process.dqmSaver)
 process.evfDQMmodulesPath = cms.Path(
                               process.l1GtUnpack*
 			      process.gtDigis*
diff --git a/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py
index 37e219bd456c2..a9f8ef0e87d00 100644
--- a/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py
+++ b/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py
@@ -1,9 +1,9 @@
 from __future__ import print_function
 import FWCore.ParameterSet.Config as cms
 
-import sys
-from Configuration.Eras.Era_Run3_cff import Run3
-process = cms.Process("SiStrpDQMLive", Run3)
+from Configuration.StandardSequences.Eras import eras
+
+process = cms.Process("SiStrpDQMLive", eras.Run2_2018_pp_on_AA)
 
 process.MessageLogger = cms.Service("MessageLogger",
     debugModules = cms.untracked.vstring('siStripDigis',
@@ -15,12 +15,6 @@
 )
 
 live=True
-unitTest=False
-
-if 'unitTest=True' in sys.argv:
-    live=False
-    unitTest=True
-
 # uncomment for running on lxplus
 #live=False
 offlineTesting=not live
@@ -30,16 +24,11 @@
 # Event Source
 #-----------------------------
 # for live online DQM in P5
-if (unitTest):
-    process.load("DQM.Integration.config.unittestinputsource_cfi")
-    from DQM.Integration.config.unittestinputsource_cfi import options
-elif (live):
+if (live):
     process.load("DQM.Integration.config.inputsource_cfi")
-    from DQM.Integration.config.inputsource_cfi import options
 # for testing in lxplus
 elif(offlineTesting):
     process.load("DQM.Integration.config.fileinputsource_cfi")
-    from DQM.Integration.config.fileinputsource_cfi import options
 
 #----------------------------
 # DQM Live Environment
@@ -55,9 +44,6 @@
 process.dqmEnv.subSystemFolder    = "SiStrip"
 process.dqmSaver.tag = "SiStrip"
 process.dqmSaver.backupLumiCount = 30
-process.dqmSaver.runNumber = options.runNumber
-process.dqmSaverPB.tag = "SiStrip"
-process.dqmSaverPB.runNumber = options.runNumber
 
 from DQMServices.Core.DQMEDAnalyzer import DQMEDAnalyzer
 process.dqmEnvTr = DQMEDAnalyzer('DQMEventInfo',
@@ -70,7 +56,7 @@
 #-----------------------------
 # Magnetic Field
 #-----------------------------
-process.load('Configuration.StandardSequences.MagneticField_cff')
+process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff')
 
 #-------------------------------------------------
 # GEOMETRY
@@ -144,8 +130,7 @@
 #--------------------------
 # Quality Test
 #--------------------------
-from DQMServices.Core.DQMQualityTester import DQMQualityTester
-process.stripQTester = DQMQualityTester(
+process.stripQTester = cms.EDAnalyzer("QualityTester",
     qtList = cms.untracked.FileInPath('DQM/SiStripMonitorClient/data/sistrip_qualitytest_config.xml'),
     prescaleFactor = cms.untracked.int32(3),
     getQualityTestsFromFile = cms.untracked.bool(True),
@@ -153,7 +138,7 @@
     qtestOnEndRun = cms.untracked.bool(True)
 )
 
-process.trackingQTester = DQMQualityTester(
+process.trackingQTester = cms.EDAnalyzer("QualityTester",
     qtList = cms.untracked.FileInPath('DQM/TrackingMonitorClient/data/tracking_qualitytest_config.xml'),
     prescaleFactor = cms.untracked.int32(3),
     getQualityTestsFromFile = cms.untracked.bool(True),
@@ -214,7 +199,7 @@
 # Scheduling
 #--------------------------
 process.SiStripSources_LocalReco = cms.Sequence(process.siStripFEDMonitor*process.SiStripMonitorDigi*process.SiStripMonitorClusterReal)
-process.DQMCommon                = cms.Sequence(process.stripQTester*process.trackingQTester*process.dqmEnv*process.dqmEnvTr*process.dqmSaver*process.dqmSaverPB)
+process.DQMCommon                = cms.Sequence(process.stripQTester*process.trackingQTester*process.dqmEnv*process.dqmEnvTr*process.dqmSaver)
 if (process.runType.getRunType() == process.runType.hi_run):
     process.RecoForDQM_LocalReco     = cms.Sequence(process.siPixelDigis*process.siStripDigis*process.trackerlocalreco)
 else :
@@ -234,6 +219,7 @@
     # event selection for cosmic data
     if ((process.runType.getRunType() == process.runType.cosmic_run) and live): process.source.SelectEvents = cms.untracked.vstring('HLT*SingleMu*','HLT_L1*')
     # Reference run for cosmic
+    process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/sistrip_reference_cosmic.root'
     # Source config for cosmic data
     process.SiStripSources_TrkReco_cosmic = cms.Sequence(process.SiStripMonitorTrack_ckf*process.TrackMon_ckf)
     # Client config for cosmic data
@@ -272,8 +258,6 @@
     process.trackingQTester.qtestOnEndRun           = cms.untracked.bool(True)
 
     process.p = cms.Path(process.scalersRawToDigi*
-                         process.tcdsDigis*
-                         process.onlineMetaDataDigis*
                          process.APVPhases*
                          process.consecutiveHEs*
                          process.hltTriggerTypeFilter*
@@ -304,6 +288,7 @@
             'HLT_PAAK*'
             )
 
+    process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/sistrip_reference_pp.root'
     # Source and Client config for pp collisions
 
     process.SiStripMonitorDigi.UseDCSFiltering = cms.bool(False)
@@ -377,8 +362,6 @@
 
     process.p = cms.Path(
         process.scalersRawToDigi*
-        process.tcdsDigis*
-        process.onlineMetaDataDigis*
         process.APVPhases*
         process.consecutiveHEs*
         process.hltTriggerTypeFilter*
@@ -414,6 +397,7 @@
 
  #        process.DQMEventStreamerReader.SelectEvents = cms.untracked.PSet(SelectEvents = cms.vstring('HLT_600Tower*','HLT_L1*','HLT_Jet*','HLT_HT*','HLT_MinBias_*','HLT_Physics*', 'HLT_ZeroBias*'))
 #
+    process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/sistrip_reference_pp.root'
 
     process.SiStripMonitorDigi.UseDCSFiltering = cms.bool(False)
     process.SiStripMonitorClusterReal.UseDCSFiltering = cms.bool(False)
@@ -473,8 +457,6 @@
     process.RecoForDQM_TrkReco       = cms.Sequence(process.offlineBeamSpot*process.MeasurementTrackerEvent*process.siPixelClusterShapeCache*process.recopixelvertexing*process.iterTracking_FirstStep)
 
     process.p = cms.Path(process.scalersRawToDigi*
-                         process.tcdsDigis*
-                         process.onlineMetaDataDigis*
                          process.APVPhases*
                          process.consecutiveHEs*
                          process.hltTriggerTypeFilter*
@@ -535,6 +517,7 @@
             'HLT_HIPhysics*'
             )
 
+    process.DQMStore.referenceFileName = '/dqmdata/dqm/reference/sistrip_reference_pp.root'
 
 
     process.SiStripMonitorDigi.UseDCSFiltering = cms.bool(False)
@@ -629,8 +612,6 @@
 
     process.p = cms.Path(
         process.scalersRawToDigi*
-        process.tcdsDigis*
-        process.onlineMetaDataDigis*
         process.APVPhases*
         process.consecutiveHEs*
         process.hltTriggerTypeFilter*
diff --git a/DataFormats/SiPixelCluster/interface/SiPixelCluster.h b/DataFormats/SiPixelCluster/interface/SiPixelCluster.h
index ab4ae1add2132..22f9cb1020814 100644
--- a/DataFormats/SiPixelCluster/interface/SiPixelCluster.h
+++ b/DataFormats/SiPixelCluster/interface/SiPixelCluster.h
@@ -8,7 +8,7 @@
 //!  Class to contain and store all the topological information of pixel clusters:
 //!  charge, global size, size and the barycenter in x and y
 //!  local directions. It builds a vector of SiPixel (which is
-//!  an inner class) and a container of channels.
+//!  an inner class) and a container of channels. 
 //!
 //!  March 2007: Edge methods moved to RectangularPixelTopology class (V.Chiochia)
 //!  Feb 2008: Modify the Pixel class from float to shorts
@@ -26,153 +26,160 @@ class PixelDigi;
 
 class SiPixelCluster {
 public:
+  
   class Pixel {
   public:
-    constexpr Pixel() : x(0), y(0), adc(0) {}  // for root
-    constexpr Pixel(int pix_x, int pix_y, int pix_adc) : x(pix_x), y(pix_y), adc(pix_adc) {}
-    uint16_t x;
+    constexpr Pixel() : x(0), y(0), adc(0){} // zero-initialising default constructor, for root
+    constexpr Pixel(int pix_x, int pix_y, int pix_adc) :  // the int arguments are converted to the uint16_t members
+      x(pix_x), y(pix_y), adc(pix_adc) {}
+    uint16_t  x;  // SiPixelCluster::pixel() fills this with minPixelRow() + the stored row offset
     uint16_t y;
     uint16_t adc;
   };
-
+  
   //--- Integer shift in x and y directions.
   class Shift {
   public:
-    constexpr Shift(int dx, int dy) : dx_(dx), dy_(dy) {}
+    constexpr Shift( int dx, int dy) : dx_(dx), dy_(dy) {}  // store the two integer displacements as given
     constexpr Shift() : dx_(0), dy_(0) {}
-    constexpr int dx() const { return dx_; }
-    constexpr int dy() const { return dy_; }
-
+    constexpr int dx() const { return dx_;}  // displacement that PixelPos::operator+ adds to the row
+    constexpr int dy() const { return dy_;}  // displacement that PixelPos::operator+ adds to the column
   private:
     int dx_;
     int dy_;
   };
-
+  
   //--- Position of a SiPixel
   class PixelPos {
   public:
     constexpr PixelPos() : row_(0), col_(0) {}
-    constexpr PixelPos(int row, int col) : row_(row), col_(col) {}
-    constexpr int row() const { return row_; }
-    constexpr int col() const { return col_; }
-    constexpr PixelPos operator+(const Shift& shift) const { return PixelPos(row() + shift.dx(), col() + shift.dy()); }
-
+    constexpr PixelPos(int row, int col) : row_(row) , col_(col) {}  // position given as (row, col)
+    constexpr int row() const { return row_;}  // stored row index
+    constexpr int col() const { return col_;}  // stored column index
+    constexpr PixelPos operator+( const Shift& shift) const {  // returns a new position; *this is left unchanged
+      return PixelPos( row() + shift.dx(), col() + shift.dy());  // dx moves the row, dy moves the column
+    }
   private:
     int row_;
     int col_;
   };
-
-  typedef std::vector<PixelDigi>::const_iterator PixelDigiIter;
-  typedef std::pair<PixelDigiIter, PixelDigiIter> PixelDigiRange;
-
-  static constexpr unsigned int MAXSPAN = 255;
-  static constexpr unsigned int MAXPOS = 2047;
-
+  
+  typedef std::vector<PixelDigi>::const_iterator   PixelDigiIter;
+  typedef std::pair<PixelDigiIter,PixelDigiIter>   PixelDigiRange;
+  
+  
+  static constexpr unsigned int MAXSPAN=255;
+  static constexpr unsigned int MAXPOS=2047;
+  
   /** Construct from a range of digis that form a cluster and from 
    *  a DetID. The range is assumed to be non-empty.
    */
-
+  
   SiPixelCluster() {}
-
-  SiPixelCluster(unsigned int isize,
-                 uint16_t const* adcs,
-                 uint16_t const* xpos,
-                 uint16_t const* ypos,
-                 uint16_t const xmin,
-                 uint16_t const ymin)
-      : thePixelOffset(2 * isize), thePixelADC(adcs, adcs + isize) {
+  
+  SiPixelCluster(unsigned int isize, uint16_t const * adcs,
+		 uint16_t const * xpos,  uint16_t const * ypos, 
+		 uint16_t const  xmin,  uint16_t const  ymin) :   
+    thePixelOffset(2*isize), thePixelADC(adcs,adcs+isize)  {
     uint16_t maxCol = 0;
     uint16_t maxRow = 0;
-    for (unsigned int i = 0; i != isize; ++i) {
-      uint16_t xoffset = xpos[i] - xmin;
-      uint16_t yoffset = ypos[i] - ymin;
-      thePixelOffset[i * 2] = std::min(uint16_t(MAXSPAN), xoffset);
-      thePixelOffset[i * 2 + 1] = std::min(uint16_t(MAXSPAN), yoffset);
-      if (xoffset > maxRow)
-        maxRow = xoffset;
-      if (yoffset > maxCol)
-        maxCol = yoffset;
+    for (unsigned int i=0; i!=isize; ++i) {
+      uint16_t xoffset = xpos[i]-xmin;
+      uint16_t yoffset = ypos[i]-ymin;
+      thePixelOffset[i*2] = std::min(uint16_t(MAXSPAN),xoffset);
+      thePixelOffset[i*2+1] = std::min(uint16_t(MAXSPAN),yoffset);
+      if (xoffset > maxRow) maxRow = xoffset; 
+      if (yoffset > maxCol) maxCol = yoffset; 
     }
-    packRow(xmin, maxRow);
-    packCol(ymin, maxCol);
+    packRow(xmin,maxRow);
+    packCol(ymin,maxCol);
   }
-
+  
+  
   // obsolete (only for regression tests)
-  SiPixelCluster(const PixelPos& pix, int adc);
-  void add(const PixelPos& pix, int adc);
-
-  // Analog linear average position (barycenter)
+  SiPixelCluster( const PixelPos& pix, int adc);
+  void add( const PixelPos& pix, int adc);
+  
+  // Analog linear average position (barycenter) 
   float x() const {
     float qm = 0.0;
     int isize = thePixelADC.size();
-    for (int i = 0; i < isize; ++i)
-      qm += float(thePixelADC[i]) * (thePixelOffset[i * 2] + minPixelRow() + 0.5f);
-    return qm / charge();
+    for (int i=0; i<isize; ++i)  // one term per pixel stored in the cluster
+      qm += float(thePixelADC[i]) * (thePixelOffset[i*2] + minPixelRow() + 0.5f);  // ADC weight times (row offset from slot 2*i + minimum row + 0.5)
+    return qm/charge();  // ADC-weighted mean; charge() is the integer sum of thePixelADC, so an empty cluster divides 0 by 0
   }
-
+  
   float y() const {
     float qm = 0.0;
     int isize = thePixelADC.size();
-    for (int i = 0; i < isize; ++i)
-      qm += float(thePixelADC[i]) * (thePixelOffset[i * 2 + 1] + minPixelCol() + 0.5f);
-    return qm / charge();
+    for (int i=0; i<isize; ++i)  // one term per pixel stored in the cluster
+      qm += float(thePixelADC[i]) * (thePixelOffset[i*2+1]  + minPixelCol() + 0.5f);  // ADC weight times (column offset from slot 2*i+1 + minimum column + 0.5)
+    return qm/charge();  // ADC-weighted mean; charge() is the integer sum of thePixelADC, so an empty cluster divides 0 by 0
   }
-
+  
   // Return number of pixels.
-  int size() const { return thePixelADC.size(); }
-
+  int size() const { return thePixelADC.size();}
+  
   // Return cluster dimension in the x direction.
-  int sizeX() const { return rowSpan() + 1; }
-
+  int sizeX() const { return rowSpan() +1;}
+  
   // Return cluster dimension in the y direction.
-  int sizeY() const { return colSpan() + 1; }
-
+  int sizeY() const { return colSpan() +1;}
+  
+  
   inline int charge() const {
     int qm = 0;
     int isize = thePixelADC.size();
-    for (int i = 0; i < isize; ++i)
+    for (int i=0; i<isize; ++i) 
       qm += thePixelADC[i];
     return qm;
-  }  // Return total cluster charge.
-
-  inline int minPixelRow() const { return theMinPixelRow; }             // The min x index.
-  inline int maxPixelRow() const { return minPixelRow() + rowSpan(); }  // The max x index.
-  inline int minPixelCol() const { return theMinPixelCol; }             // The min y index.
-  inline int maxPixelCol() const { return minPixelCol() + colSpan(); }  // The max y index.
-
-  const std::vector<uint8_t>& pixelOffset() const { return thePixelOffset; }
-  const std::vector<uint16_t>& pixelADC() const { return thePixelADC; }
-
+  } // Return total cluster charge.
+  
+  inline int minPixelRow() const { return theMinPixelRow;} // The min x index.
+  inline int maxPixelRow() const { return minPixelRow() + rowSpan();} // The max x index.
+  inline int minPixelCol() const { return theMinPixelCol;} // The min y index.
+  inline int maxPixelCol() const { return minPixelCol() + colSpan();} // The max y index.
+  
+  
+  const std::vector<uint8_t> & pixelOffset() const { return thePixelOffset;}
+  const std::vector<uint16_t> & pixelADC() const { return thePixelADC;}
+  
   // obsolete, use single pixel access below
   const std::vector<Pixel> pixels() const {
     std::vector<Pixel> oldPixVector;
     int isize = thePixelADC.size();
-    oldPixVector.reserve(isize);
-    for (int i = 0; i < isize; ++i) {
+    oldPixVector.reserve(isize); 
+    for(int i=0; i<isize; ++i) {
       oldPixVector.push_back(pixel(i));
     }
     return oldPixVector;
   }
-
+  
   // infinite faster than above...
   Pixel pixel(int i) const {
-    return Pixel(minPixelRow() + thePixelOffset[i * 2], minPixelCol() + thePixelOffset[i * 2 + 1], thePixelADC[i]);
+    return Pixel(minPixelRow() + thePixelOffset[i*2],
+		 minPixelCol() + thePixelOffset[i*2+1],
+		 thePixelADC[i]
+		 );
   }
-
+  
 private:
-  static int overflow_(uint16_t span) { return span == uint16_t(MAXSPAN); }
+  
+  static int overflow_(uint16_t span) { return span==uint16_t(MAXSPAN);}
 
 public:
-  int colSpan() const { return thePixelColSpan; }
-
+  
+  int colSpan() const {return thePixelColSpan; }
+  
   int rowSpan() const { return thePixelRowSpan; }
-
+  
+  
   bool overflowCol() const { return overflow_(thePixelColSpan); }
-
+  
   bool overflowRow() const { return overflow_(thePixelRowSpan); }
-
-  bool overflow() const { return overflowCol() || overflowRow(); }
-
+  
+  bool overflow() const { return  overflowCol() || overflowRow(); }
+  
   void packCol(uint16_t ymin, uint16_t yspan) {
     theMinPixelCol = ymin;
     thePixelColSpan = std::min(yspan, uint16_t(MAXSPAN));
@@ -181,40 +188,46 @@ class SiPixelCluster {
     theMinPixelRow = xmin;
     thePixelRowSpan = std::min(xspan, uint16_t(MAXSPAN));
   }
-
-  // ggiurgiu@fnal.gov, 01/05/12
+  
+  // ggiurgiu@fnal.gov, 01/05/12 
   // Getters and setters for the newly added data members (err_x and err_y). See below.
-  void setSplitClusterErrorX(float errx) { err_x = errx; }
-  void setSplitClusterErrorY(float erry) { err_y = erry; }
-  float getSplitClusterErrorX() const { return err_x; }
-  float getSplitClusterErrorY() const { return err_y; }
-
+   void setSplitClusterErrorX( float errx ) { err_x = errx; }
+   void setSplitClusterErrorY( float erry ) { err_y = erry; }
+   float getSplitClusterErrorX() const { return err_x; }
+   float getSplitClusterErrorY() const { return err_y; }
+  
+  
 private:
-  std::vector<uint8_t> thePixelOffset;
+  
+  std::vector<uint8_t>  thePixelOffset;
   std::vector<uint16_t> thePixelADC;
-
-  uint16_t theMinPixelRow = MAXPOS;  // Minimum pixel index in the x direction (low edge).
-  uint16_t theMinPixelCol = MAXPOS;  // Minimum pixel index in the y direction (left edge).
-  uint8_t thePixelRowSpan = 0;       // Span pixel index in the x direction (low edge).
-  uint8_t thePixelColSpan = 0;       // Span pixel index in the y direction (left edge).
-
-  float err_x = -99999.9f;
-  float err_y = -99999.9f;
+  
+  
+  uint16_t theMinPixelRow=MAXPOS; // Minimum pixel index in the x direction (low edge).
+  uint16_t theMinPixelCol=MAXPOS; // Minimum pixel index in the y direction (left edge).
+  uint8_t thePixelRowSpan=0; // Span pixel index in the x direction (low edge).
+  uint8_t thePixelColSpan=0; // Span pixel index in the y direction (left edge).
+  
+   float err_x=-99999.9f;
+   float err_y=-99999.9f;
+  
 };
 
+
 // Comparison operators  (needed by DetSetVector)
-inline bool operator<(const SiPixelCluster& one, const SiPixelCluster& other) {
-  if (one.minPixelRow() < other.minPixelRow()) {
+inline bool operator<( const SiPixelCluster& one, const SiPixelCluster& other) {  // orders clusters by minPixelRow(), then by minPixelCol()
+  if ( one.minPixelRow() < other.minPixelRow() ) {  // smaller minimum row sorts first
     return true;
-  } else if (one.minPixelRow() > other.minPixelRow()) {
+  } else if ( one.minPixelRow() > other.minPixelRow() ) {  // larger minimum row sorts after, whatever the columns are
     return false;
-  } else if (one.minPixelCol() < other.minPixelCol()) {
+  } else if ( one.minPixelCol() < other.minPixelCol() ) {  // equal rows: the minimum column decides
     return true;
   } else {
     return false;
   }
 }
 
+
 #include "DataFormats/Common/interface/DetSetVector.h"
 #include "DataFormats/Common/interface/DetSetVectorNew.h"
 #include "DataFormats/Common/interface/Ref.h"
@@ -227,4 +240,4 @@ typedef edm::RefProd<SiPixelClusterCollection> SiPixelClusterRefProd;
 
 typedef edmNew::DetSetVector<SiPixelCluster> SiPixelClusterCollectionNew;
 typedef edm::Ref<SiPixelClusterCollectionNew, SiPixelCluster> SiPixelClusterRefNew;
-#endif
+#endif 
diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h
new file mode 100644
index 0000000000000..df249a3790cd2
--- /dev/null
+++ b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h
@@ -0,0 +1,32 @@
+#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
+#define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
+
+#include <cstdint>
+#include <vector>
+
+class SiPixelDigisSoA {  // four parallel arrays (pdigi, rawIdArr, adc, clus), each held in its own std::vector
+public:
+  SiPixelDigisSoA() = default;  // all four vectors start empty
+  explicit SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus);  // defined in src/SiPixelDigisSoA.cc: copies nDigis elements from each array
+  ~SiPixelDigisSoA() = default;
+
+  auto size() const { return pdigi_.size(); }  // entry count, read from pdigi_ only
+
+  uint32_t pdigi(size_t i) const { return pdigi_[i]; }  // per-element accessors below use operator[], so i is not range-checked
+  uint32_t rawIdArr(size_t i) const { return rawIdArr_[i]; }  // NOTE(review): presumably the detector raw id of entry i - confirm
+  uint16_t adc(size_t i) const { return adc_[i]; }  // ADC value of entry i
+  int32_t clus(size_t i) const { return clus_[i]; }  // NOTE(review): presumably a cluster index for entry i - confirm
+  
+  const std::vector<uint32_t>& pdigiVector() const { return pdigi_; }  // whole-array accessors: const references, no copy
+  const std::vector<uint32_t>& rawIdArrVector() const { return rawIdArr_; }
+  const std::vector<uint16_t>& adcVector() const { return adc_; }
+  const std::vector<int32_t>& clusVector() const { return clus_; }
+  
+private:
+  std::vector<uint32_t> pdigi_;  // the sized constructor fills all four vectors with the same number of elements
+  std::vector<uint32_t> rawIdArr_;
+  std::vector<uint16_t> adc_;
+  std::vector<int32_t> clus_;
+};
+
+#endif
diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc
new file mode 100644
index 0000000000000..ebc8ba2055f78
--- /dev/null
+++ b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc
@@ -0,0 +1,12 @@
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h"
+
+#include <cassert>
+
+SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus):
+  pdigi_(pdigi, pdigi+nDigis),  // every column is copied from [ptr, ptr+nDigis) into a vector owned by this object
+  rawIdArr_(rawIdArr, rawIdArr+nDigis),
+  adc_(adc, adc+nDigis),
+  clus_(clus, clus+nDigis)
+{
+  assert(pdigi_.size() == nDigis);  // sanity check on the number of copied elements
+}
diff --git a/DataFormats/SiPixelDigi/src/classes.h b/DataFormats/SiPixelDigi/src/classes.h
index 2f36b72ca7df8..256ca41ad1867 100644
--- a/DataFormats/SiPixelDigi/src/classes.h
+++ b/DataFormats/SiPixelDigi/src/classes.h
@@ -5,9 +5,13 @@
 #include "DataFormats/SiPixelDigi/interface/PixelDigiCollection.h"
 #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigi.h"
 #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigiError.h"
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h"
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 #include "DataFormats/Common/interface/DetSetVector.h"
 #include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "boost/cstdint.hpp"
 #include <vector>
 
-#endif  // SIPIXELDIGI_CLASSES_H
+
+#endif // SIPIXELDIGI_CLASSES_H
diff --git a/DataFormats/SiPixelDigi/src/classes_def.xml b/DataFormats/SiPixelDigi/src/classes_def.xml
index de7779a5c00ea..8cabbd3f3f06e 100755
--- a/DataFormats/SiPixelDigi/src/classes_def.xml
+++ b/DataFormats/SiPixelDigi/src/classes_def.xml
@@ -49,4 +49,10 @@
 
    <class name="edmNew::DetSetVector<SiPixelCalibDigiError>"/>
    <class name="edm::Wrapper<edmNew::DetSetVector<SiPixelCalibDigiError> >"/>
+
+   <class name="SiPixelDigisSoA" persistent="false"/>
+   <class name="edm::Wrapper<SiPixelDigisSoA>" persistent="false"/>
+
+   <class name="SiPixelDigiErrorsSoA" persistent="false"/>
+   <class name="edm::Wrapper<SiPixelDigiErrorsSoA>" persistent="false"/>
 </lcgdict>
diff --git a/DataFormats/SiPixelRawData/src/classes.h b/DataFormats/SiPixelRawData/src/classes.h
index 73768cc373013..ab6b5d5f11363 100644
--- a/DataFormats/SiPixelRawData/src/classes.h
+++ b/DataFormats/SiPixelRawData/src/classes.h
@@ -6,4 +6,6 @@
 #include "DataFormats/Common/interface/DetSetVector.h"
 #include <vector>
 
-#endif  // SIPIXELRAWDATA_CLASSES_H
+
+#endif // SIPIXELRAWDATA_CLASSES_H
+
diff --git a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml
index f92aa68373927..4d2b5ebf45542 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml
@@ -1,4 +1,7 @@
+<use name="CUDADataFormats/SiPixelDigi"/>
 <use name="EventFilter/SiPixelRawToDigi"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="cuda-api-wrappers"/>
 <library file="*.cc" name="EventFilterSiPixelRawToDigiPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc
new file mode 100644
index 0000000000000..9e998b92fc403
--- /dev/null
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc
@@ -0,0 +1,183 @@
+#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h"
+#include "DataFormats/Common/interface/DetSetVector.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "DataFormats/DetId/interface/DetIdCollection.h"
+#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h"
+#include "DataFormats/SiPixelDigi/interface/PixelDigi.h"
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h"
+#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h"
+#include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/ESWatcher.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#include <memory>
+
+class SiPixelDigiErrorsFromSoA: public edm::stream::EDProducer<> {  // Converts a SiPixelDigiErrorsSoA product into the legacy error collections.
+public:
+  explicit SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig);
+  ~SiPixelDigiErrorsFromSoA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  // Input: the error SoA read from the event.
+  edm::EDGetTokenT<SiPixelDigiErrorsSoA> digiErrorSoAGetToken_;
+  // Outputs: per-DetId raw data errors, two DetId lists (tracking / user) and the disabled FED channels.
+  edm::EDPutTokenT<edm::DetSetVector<SiPixelRawDataError>> errorPutToken_;
+  edm::EDPutTokenT<DetIdCollection> tkErrorPutToken_;
+  edm::EDPutTokenT<DetIdCollection> userErrorPutToken_;
+  edm::EDPutTokenT<edmNew::DetSetVector<PixelFEDChannel>> disabledChannelPutToken_;
+  // Cabling tree, refreshed in produce() whenever the watcher reports a changed SiPixelFedCablingMapRcd.
+  edm::ESWatcher<SiPixelFedCablingMapRcd> cablingWatcher_;
+  std::unique_ptr<SiPixelFedCablingTree> cabling_;
+  const std::string cablingMapLabel_;
+  // Error types (from "ErrorList" / "UserErrorList") whose DetIds are copied to the tracking / user lists.
+  const std::vector<int> tkerrorlist_;
+  const std::vector<int> usererrorlist_;
+  // Forwarded to the PixelDataFormatter constructor in produce().
+  const bool usePhase1_;
+};
+
+SiPixelDigiErrorsFromSoA::SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig):
+  digiErrorSoAGetToken_{consumes<SiPixelDigiErrorsSoA>(iConfig.getParameter<edm::InputTag>("digiErrorSoASrc"))},  // input SoA
+  errorPutToken_{produces<edm::DetSetVector<SiPixelRawDataError>>()},
+  tkErrorPutToken_{produces<DetIdCollection>()},  // default (empty) instance name
+  userErrorPutToken_{produces<DetIdCollection>("UserErrorModules")},  // same type, told apart by the instance name
+  disabledChannelPutToken_{produces<edmNew::DetSetVector<PixelFEDChannel>>()},
+  cablingMapLabel_(iConfig.getParameter<std::string>("CablingMapLabel")),
+  tkerrorlist_(iConfig.getParameter<std::vector<int>>("ErrorList")),
+  usererrorlist_(iConfig.getParameter<std::vector<int>>("UserErrorList")),
+  usePhase1_(iConfig.getParameter<bool> ("UsePhase1"))
+{}  // all state comes from the configuration; nothing else to do
+
+void SiPixelDigiErrorsFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {  // declares the parameters read by the constructor, with defaults
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("digiErrorSoASrc", edm::InputTag("siPixelDigiErrorsSoA"));
+  desc.add<std::string>("CablingMapLabel","")->setComment("CablingMap label");
+  desc.add<bool>("UsePhase1",false)->setComment("##  Use phase1");
+  desc.add<std::vector<int> >("ErrorList", std::vector<int>{29})->setComment("## ErrorList: list of error codes used by tracking to invalidate modules");
+  desc.add<std::vector<int> >("UserErrorList", std::vector<int>{40})->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation");
+  descriptions.addWithDefaultLabel(desc);  // registered under the module's default label
+}
+
+void SiPixelDigiErrorsFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  // pack errors into collection
+  // NOTE(review): std::find and assert are used below, but <algorithm> and <cassert> are not included directly by this file; presumably they arrive transitively - confirm.
+  // initialize cabling map or update if necessary
+  if (cablingWatcher_.check(iSetup)) {
+    // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel)
+    edm::ESTransientHandle<SiPixelFedCablingMap> cablingMap;
+    iSetup.get<SiPixelFedCablingMapRcd>().get(cablingMapLabel_, cablingMap);
+    cabling_ = cablingMap->cablingTree();
+    LogDebug("map version:")<< cabling_->version();
+  }
+
+  const auto& digiErrors = iEvent.get(digiErrorSoAGetToken_);
+
+  // output collections: filled below and moved into the event at the end
+  edm::DetSetVector<SiPixelRawDataError> errorcollection{};
+  DetIdCollection tkerror_detidcollection{};
+  DetIdCollection usererror_detidcollection{};
+  edmNew::DetSetVector<PixelFEDChannel> disabled_channelcollection{};
+
+  PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0
+  const PixelDataFormatter::Errors *formatterErrors = digiErrors.formatterErrors();
+  assert(formatterErrors != nullptr);
+  auto errors = *formatterErrors; // make a copy
+  PixelDataFormatter::DetErrors nodeterrors;
+  // append every SoA entry with a non-zero errorType to the copied per-DetId error map
+  auto size = digiErrors.size();
+  for (auto i = 0U; i < size; i++) {
+    PixelErrorCompact err = digiErrors.error(i);
+    if (err.errorType != 0) {
+      SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200);  // NOTE(review): 1200 looks like the id of the first pixel FED - confirm
+      errors[err.rawId].push_back(error);
+    }
+  }
+
+  constexpr uint32_t dummydetid = 0xffffffff;
+  typedef PixelDataFormatter::Errors::iterator IE;
+  for (IE is = errors.begin(); is != errors.end(); is++) {
+
+    uint32_t errordetid = is->first;
+    if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed
+      nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() );
+    }
+    else {
+      edm::DetSet<SiPixelRawDataError>& errorDetSet = errorcollection.find_or_insert(errordetid);
+      errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end());
+      // Fill detid of the detectors where there is error AND the error number is listed
+      // in the configurable error list in the job option cfi.
+      // Code needs to be here, because there can be a set of errors for each
+      // entry in the for loop over PixelDataFormatter::Errors
+
+      std::vector<PixelFEDChannel> disabledChannelsDetSet;
+
+      for (auto const& aPixelError : errorDetSet) {
+        // For the time being, we extend the error handling functionality with ErrorType 25
+        // In the future, we should sort out how the usage of tkerrorlist can be generalized
+        if (aPixelError.getType() == 25) {
+          int fedId = aPixelError.getFedId();
+          const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId);
+          if (fed) {
+            cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32());
+            const sipixelobjects::PixelFEDLink* link = fed->link(linkId);
+            if (link) {
+              // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it
+              // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme
+              PixelFEDChannel ch = {fed->id(), linkId, 25, 0};  // NOTE(review): presumably {fed, link, roc_first=25, roc_last=0}, an empty range widened by the loop below - confirm field order
+              for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) {
+                const sipixelobjects::PixelROC * roc = link->roc(iRoc);
+                if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit();
+                if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit();
+              }
+              if (ch.roc_first<ch.roc_last) disabledChannelsDetSet.push_back(ch);
+            }
+          }
+        }
+        else {
+          // fill list of detIds to be turned off by tracking
+          if (!tkerrorlist_.empty()) {
+            auto it_find = std::find(tkerrorlist_.begin(), tkerrorlist_.end(), aPixelError.getType());
+            if (it_find != tkerrorlist_.end()) {
+              tkerror_detidcollection.push_back(errordetid);
+            }
+          }
+        }
+        
+        // fill list of detIds with errors to be studied
+        if (!usererrorlist_.empty()) {
+          auto it_find = std::find(usererrorlist_.begin(), usererrorlist_.end(), aPixelError.getType());
+          if (it_find != usererrorlist_.end()) {
+            usererror_detidcollection.push_back(errordetid);
+          }
+        }
+
+      } // loop on DetSet of errors
+
+      if (!disabledChannelsDetSet.empty()) {
+        disabled_channelcollection.insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size());
+      }
+
+    } // if error assigned to a real DetId
+  } // loop on errors in event for this FED
+  // the errors without a real DetId are stored together under the dummy DetId
+  edm::DetSet<SiPixelRawDataError>& errorDetSet = errorcollection.find_or_insert(dummydetid);
+  errorDetSet.data = nodeterrors;
+
+  iEvent.emplace(errorPutToken_, std::move(errorcollection));
+  iEvent.emplace(tkErrorPutToken_, std::move(tkerror_detidcollection));
+  iEvent.emplace(userErrorPutToken_, std::move(usererror_detidcollection));
+  iEvent.emplace(disabledChannelPutToken_, std::move(disabled_channelcollection));
+}
+
+DEFINE_FWK_MODULE(SiPixelDigiErrorsFromSoA);
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
new file mode 100644
index 0000000000000..d47542528ed86
--- /dev/null
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
@@ -0,0 +1,75 @@
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+class SiPixelDigiErrorsSoAFromCUDA: public edm::stream::EDProducer<edm::ExternalWork> {  // Turns a CUDAProduct<SiPixelDigiErrorsCUDA> into a host SiPixelDigiErrorsSoA.
+public:
+  explicit SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig);
+  ~SiPixelDigiErrorsSoAFromCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+
+  edm::EDGetTokenT<CUDAProduct<SiPixelDigiErrorsCUDA>> digiErrorGetToken_;
+  edm::EDPutTokenT<SiPixelDigiErrorsSoA> digiErrorPutToken_;
+  // Per-event state: set in acquire(), consumed and cleared in produce().
+  cudautils::host::unique_ptr<PixelErrorCompact[]> data_;
+  GPU::SimpleVector<PixelErrorCompact> error_;
+  const PixelFormatterErrors *formatterErrors_ = nullptr;  // points into the input product; not owned
+};
+
+SiPixelDigiErrorsSoAFromCUDA::SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig):
+  digiErrorGetToken_(consumes<CUDAProduct<SiPixelDigiErrorsCUDA>>(iConfig.getParameter<edm::InputTag>("src"))),  // input taken from the "src" parameter
+  digiErrorPutToken_(produces<SiPixelDigiErrorsSoA>())  // single output product
+{}
+
+void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {  // declares the single "src" parameter
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("src", edm::InputTag("siPixelClustersCUDA"));  // default input label
+  descriptions.addWithDefaultLabel(desc);
+}
+
+void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  // Do the transfer in a CUDA stream parallel to the computation CUDA stream
+  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_);
+  // Start the device-to-host copy and keep the returned (vector, buffer) pair in members for produce().
+  auto tmp = gpuDigiErrors.dataErrorToHostAsync(ctx.stream());
+  error_ = std::move(tmp.first);
+  data_ = std::move(tmp.second);
+  formatterErrors_ = &(gpuDigiErrors.formatterErrors());  // address of data held by the input product; NOTE(review): assumes the product outlives produce() - confirm
+}
+
+void SiPixelDigiErrorsSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  // The following line copies the data from the pinned host memory to
+  // regular host memory. In principle that feels unnecessary (why not
+  // just use the pinned host memory?). There are a few arguments for
+  // doing it, though:
+  // - The pinned host memory can now be released back to the (caching) allocator
+  //   * if we wanted to keep the pinned memory, we would also need to
+  //     keep the CUDA stream around for as long, or allow pinned
+  //     host memory to be allocated without a CUDA stream
+  // - What if a CPU algorithm were to produce the same SoA? We can't
+  //   use cudaMallocHost without a GPU...
+  iEvent.emplace(digiErrorPutToken_, error_.size(), error_.data(), formatterErrors_);
+  // Drop the per-event state so that nothing from this event is carried over to the next one.
+  error_ = GPU::make_SimpleVector<PixelErrorCompact>(0, nullptr);
+  data_.reset();
+  formatterErrors_ = nullptr;
+}
+
+// define as framework plugin
+DEFINE_FWK_MODULE(SiPixelDigiErrorsSoAFromCUDA);
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
new file mode 100644
index 0000000000000..068701f0bcf07
--- /dev/null
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
@@ -0,0 +1,81 @@
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+
+class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer<edm::ExternalWork> {  // Turns a CUDAProduct<SiPixelDigisCUDA> into a host SiPixelDigisSoA.
+public:
+  explicit SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig);
+  ~SiPixelDigisSoAFromCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+
+  edm::EDGetTokenT<CUDAProduct<SiPixelDigisCUDA>> digiGetToken_;
+  edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
+  // Per-event host buffers: filled in acquire(), copied into the output and released in produce().
+  cudautils::host::unique_ptr<uint32_t[]> pdigi_;
+  cudautils::host::unique_ptr<uint32_t[]> rawIdArr_;
+  cudautils::host::unique_ptr<uint16_t[]> adc_;
+  cudautils::host::unique_ptr< int32_t[]> clus_;
+  // Number of digis in the buffers above; initialized so it never holds an indeterminate value before the first acquire().
+  int nDigis_ = 0;
+};
+
+SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig):
+  digiGetToken_(consumes<CUDAProduct<SiPixelDigisCUDA>>(iConfig.getParameter<edm::InputTag>("src"))),  // input taken from the "src" parameter
+  digiPutToken_(produces<SiPixelDigisSoA>())  // single output product
+{}
+
+void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {  // declares the single "src" parameter
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("src", edm::InputTag("siPixelClustersCUDA"));  // default input label
+  descriptions.addWithDefaultLabel(desc);
+}
+
+void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  // Do the transfer in a CUDA stream parallel to the computation CUDA stream
+  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  const auto& gpuDigis = ctx.get(iEvent, digiGetToken_);
+  // Record the digi count and request one host buffer per column on the context's stream; produce() consumes them.
+  nDigis_ = gpuDigis.nDigis();
+  pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream());
+  rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream());
+  adc_ = gpuDigis.adcToHostAsync(ctx.stream());
+  clus_ = gpuDigis.clusToHostAsync(ctx.stream());
+}
+
+void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  // The following line copies the data from the pinned host memory to
+  // regular host memory. In principle that feels unnecessary (why not
+  // just use the pinned host memory?). There are a few arguments for
+  // doing it, though:
+  // - The pinned host memory can now be released back to the (caching) allocator
+  //   * if we wanted to keep the pinned memory, we would also need to
+  //     keep the CUDA stream around for as long, or allow pinned
+  //     host memory to be allocated without a CUDA stream
+  // - What if a CPU algorithm were to produce the same SoA? We can't
+  //   use cudaMallocHost without a GPU...
+  iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get());
+  // Release the host buffers now that their contents have been copied into the output product.
+  pdigi_.reset();
+  rawIdArr_.reset();
+  adc_.reset();
+  clus_.reset();
+}
+
+// define as framework plugin
+DEFINE_FWK_MODULE(SiPixelDigisSoAFromCUDA);
diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py
index 12ff657cefd8e..50c8f0fcabd3c 100644
--- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py
+++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py
@@ -1,7 +1,24 @@
 import FWCore.ParameterSet.Config as cms
-import EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi
+from EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi import siPixelRawToDigi as _siPixelRawToDigi
 
-siPixelDigis = EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi.siPixelRawToDigi.clone()
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+siPixelDigis = SwitchProducerCUDA(
+    cpu = _siPixelRawToDigi.clone()
+)
 
 from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
-phase1Pixel.toModify(siPixelDigis, UsePhase1=True)
+phase1Pixel.toModify(siPixelDigis.cpu, UsePhase1=True)
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+gpu.toModify(siPixelDigis,
+    cuda = cms.EDAlias(
+        siPixelDigiErrors = cms.VPSet(
+            cms.PSet(type = cms.string("DetIdedmEDCollection")),
+            cms.PSet(type = cms.string("SiPixelRawDataErroredmDetSetVector")),
+            cms.PSet(type = cms.string("PixelFEDChanneledmNewDetSetVector"))
+        ),
+        siPixelDigisClustersPreSplitting = cms.VPSet(
+            cms.PSet(type = cms.string("PixelDigiedmDetSetVector"))
+        )
+    )
+)
diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py
new file mode 100644
index 0000000000000..31ba8596bddc6
--- /dev/null
+++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py
@@ -0,0 +1,30 @@
+import FWCore.ParameterSet.Config as cms
+
+from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import siPixelDigis
+from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA
+from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA
+from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsFromSoA_cfi import siPixelDigiErrorsFromSoA as _siPixelDigiErrorsFromSoA
+
+siPixelDigisTask = cms.Task(siPixelDigis)  # default task: only the siPixelDigis module
+# Copy-to-host modules; both read the product labelled siPixelClustersCUDAPreSplitting.
+siPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone(
+    src = "siPixelClustersCUDAPreSplitting"
+)
+siPixelDigiErrorsSoA = _siPixelDigiErrorsSoAFromCUDA.clone(
+    src = "siPixelClustersCUDAPreSplitting"
+)
+siPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone()  # keeps the default input tag of the cfi
+# With the phase1Pixel era active, the error converter is switched to UsePhase1.
+from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
+phase1Pixel.toModify(siPixelDigiErrors, UsePhase1=True)
+# The three CUDA-related modules, grouped in their own task.
+siPixelDigisTaskCUDA = cms.Task(
+    siPixelDigisSoA,
+    siPixelDigiErrorsSoA,
+    siPixelDigiErrors
+)
+# With the gpu process modifier active, siPixelDigisTask is replaced by a copy that also contains the CUDA task.
+from Configuration.ProcessModifiers.gpu_cff import gpu
+_siPixelDigisTask_gpu = siPixelDigisTask.copy()
+_siPixelDigisTask_gpu.add(siPixelDigisTaskCUDA)
+gpu.toReplaceWith(siPixelDigisTask, _siPixelDigisTask_gpu)
diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
index cefdbe4b3296a..05e6b01e96c24 100644
--- a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
+++ b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -2,68 +2,150 @@
 #define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
 
 #include <cstdint>
+#include <array>
 
 namespace phase1PixelTopology {
 
-  constexpr uint16_t numRowsInRoc = 80;
-  constexpr uint16_t numColsInRoc = 52;
-  constexpr uint16_t lastRowInRoc = numRowsInRoc - 1;
-  constexpr uint16_t lastColInRoc = numColsInRoc - 1;
+  constexpr uint16_t numRowsInRoc     = 80;
+  constexpr uint16_t numColsInRoc     = 52;
+  constexpr uint16_t lastRowInRoc     = numRowsInRoc - 1;
+  constexpr uint16_t lastColInRoc     = numColsInRoc - 1;
 
-  constexpr uint16_t numRowsInModule = 2 * numRowsInRoc;
-  constexpr uint16_t numColsInModule = 8 * numColsInRoc;
-  constexpr uint16_t lastRowInModule = numRowsInModule - 1;
-  constexpr uint16_t lastColInModule = numColsInModule - 1;
+  constexpr uint16_t numRowsInModule  = 2 * numRowsInRoc;
+  constexpr uint16_t numColsInModule  = 8 * numColsInRoc;
+  constexpr uint16_t lastRowInModule  = numRowsInModule - 1;
+  constexpr uint16_t lastColInModule  = numColsInModule - 1;
 
   constexpr int16_t xOffset = -81;
-  constexpr int16_t yOffset = -54 * 4;
+  constexpr int16_t yOffset = -54*4;
 
-  constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule);
+  constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule)* uint32_t(numColsInModule);
+
+  constexpr uint32_t numberOfModules = 1856;
+  constexpr uint32_t numberOfLayers = 10;
+  constexpr uint32_t layerStart[numberOfLayers + 1] = {  // layerStart[i] is the first module index of layer i; the extra last entry closes the last layer
+       0,    96, 320,  672,         // barrel
+    1184, 1296, 1408,               // positive endcap
+    1520, 1632, 1744,               // negative endcap
+    numberOfModules
+  };
+  constexpr char const * layerName[numberOfLayers] = {  // one label per layer, in the same order as layerStart
+    "BL1", "BL2", "BL3", "BL4",     // barrel
+    "E+1", "E+2", "E+3",            // positive endcap
+    "E-1", "E-2", "E-3"             // negative endcap
+  };
+
+
+  template<class Function, std::size_t... Indices>
+  constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
+  -> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
+  {
+    return {{ f(Indices)... }};
+  }
+
+  template<int N, class Function>
+  constexpr auto map_to_array(Function f)
+  -> std::array<typename std::result_of<Function(std::size_t)>::type, N>
+  {
+    return map_to_array_helper(f, std::make_index_sequence<N>{});
+  }
+
+
+  constexpr uint32_t findMaxModuleStride() {
+    // Largest power of two that divides every layer boundary layerStart[1..numberOfLayers].
+    uint32_t stride = 1;
+    while (true) {
+      const uint32_t candidate = 2 * stride;
+      for (uint32_t l = 1; l <= numberOfLayers; ++l) {
+        // the first boundary that is not a multiple of the candidate ends the search
+        if (layerStart[l] % candidate != 0) return stride;
+      }
+      stride = candidate;
+    }
+  }
+
+  constexpr uint32_t maxModuleStride = findMaxModuleStride();
+
+  constexpr uint8_t findLayer(uint32_t detId) {  // index of the layer whose [layerStart[i], layerStart[i+1]) range contains detId
+    for  (uint8_t i=0; i<numberOfLayers; ++i) if (detId<layerStart[i+1]) return i;  // bounded by numberOfLayers so that layerStart[i+1] never leaves the array
+    return 11;  // no layer matched (detId >= numberOfModules); sentinel value unchanged
+  }
+
+  constexpr uint8_t findLayerFromCompact(uint32_t detId) {  // same lookup as findLayer, for an index given in units of maxModuleStride
+    detId*=maxModuleStride;
+    for  (uint8_t i=0; i<numberOfLayers; ++i) if (detId<layerStart[i+1]) return i;  // bounded by numberOfLayers so that layerStart[i+1] never leaves the array
+    return 11;  // no layer matched; sentinel value unchanged
+  }
+
+  constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride;
+  constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);
+
+  constexpr bool validateLayerIndex() {
+    // Compile-time check: every module index lies within the bounds of the layer reported by the compact table.
+    for (auto i=0U; i<numberOfModules; ++i)  {
+      auto l = layer[i/maxModuleStride];
+      // reject an out-of-range layer before it is used to index layerStart
+      if (l>=numberOfLayers) return false;
+      if (i<layerStart[l] || i>=layerStart[l+1]) return false;
+    }
+    return true;
+  }
+
+  static_assert(validateLayerIndex(), "layer from detIndex algo is buggy");
 
   // this is for the ROC n<512 (upgrade 1024)
-  constexpr inline uint16_t divu52(uint16_t n) {
-    n = n >> 2;
-    uint16_t q = (n >> 1) + (n >> 4);
-    q = q + (q >> 4) + (q >> 5);
-    q = q >> 3;
-    uint16_t r = n - q * 13;
+  constexpr inline
+  uint16_t divu52(uint16_t n) {
+    n = n>>2;
+    uint16_t q = (n>>1) + (n>>4);
+    q = q + (q>>4) + (q>>5); q = q >> 3;
+    uint16_t r = n - q*13;
     return q + ((r + 3) >> 4);
   }
 
-  constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); }
-  constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); }
+  constexpr inline
+  bool isEdgeX(uint16_t px) { return (px==0) | (px==lastRowInModule); }
+
+  constexpr inline
+  bool isEdgeY(uint16_t py) { return (py==0) | (py==lastColInModule); }
 
-  constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; }
-  constexpr inline uint16_t toRocY(uint16_t py) {
+  constexpr inline
+  uint16_t toRocX(uint16_t px) { return (px<numRowsInRoc) ? px : px-numRowsInRoc; }
+
+  constexpr inline
+  uint16_t toRocY(uint16_t py) {
     auto roc = divu52(py);
-    return py - 52 * roc;
+    return py - 52*roc;
   }
 
-  constexpr inline bool isBigPixX(uint16_t px) { return (px == 79) | (px == 80); }
+  constexpr inline
+  bool isBigPixX(uint16_t px) {
+    return (px==79) | (px==80);
+  }
 
-  constexpr inline bool isBigPixY(uint16_t py) {
-    auto ly = toRocY(py);
-    return (ly == 0) | (ly == lastColInRoc);
+  constexpr inline
+  bool isBigPixY(uint16_t py) {
+    auto ly=toRocY(py);
+    return (ly==0) | (ly==lastColInRoc);
   }
 
-  constexpr inline uint16_t localX(uint16_t px) {
+  constexpr inline
+  uint16_t localX(uint16_t px) {
     auto shift = 0;
-    if (px > lastRowInRoc)
-      shift += 1;
-    if (px > numRowsInRoc)
-      shift += 1;
-    return px + shift;
+    if (px>lastRowInRoc) shift+=1;
+    if (px>numRowsInRoc) shift+=1;
+    return px+shift;
   }
 
-  constexpr inline uint16_t localY(uint16_t py) {
+  constexpr inline
+  uint16_t localY(uint16_t py) {
     auto roc = divu52(py);
-    auto shift = 2 * roc;
-    auto yInRoc = py - 52 * roc;
-    if (yInRoc > 0)
-      shift += 1;
-    return py + shift;
+    auto shift = 2*roc;
+    auto yInRoc = py - 52*roc;
+    if (yInRoc>0) shift+=1;
+    return py+shift;
   }
 
-}  // namespace phase1PixelTopology
+}
 
-#endif  // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
+#endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
diff --git a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp
index 9a00efbff9a9a..5c37dad30d73e 100644
--- a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp
+++ b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp
@@ -8,138 +8,149 @@ namespace {
 
   // original code from CMSSW_4_4
 
-  std::tuple<int, bool> localXori(int mpx) {
-    const float m_pitchx = 1.f;
-    int binoffx = int(mpx);         // truncate to int
-    float local_pitchx = m_pitchx;  // defaultpitch
-
-    if (binoffx > 80) {  // ROC 1 - handles x on edge cluster
-      binoffx = binoffx + 2;
-    } else if (binoffx == 80) {  // ROC 1
-      binoffx = binoffx + 1;
+  std::tuple<int,bool> localXori(int mpx) {
+    const float m_pitchx=1.f;
+    int binoffx = int(mpx);             // truncate to int
+    float local_pitchx = m_pitchx;      // defaultpitch
+
+    if (binoffx>80) {                   // ROC 1 - handles x on edge cluster
+      binoffx=binoffx+2;
+    } else if (binoffx==80) {           // ROC 1
+      binoffx=binoffx+1;
       local_pitchx = 2 * m_pitchx;
 
-    } else if (binoffx == 79) {  // ROC 0
-      binoffx = binoffx + 0;
+    } else if (binoffx==79) {           // ROC 0
+      binoffx=binoffx+0;
       local_pitchx = 2 * m_pitchx;
-    } else if (binoffx >= 0) {  // ROC 0
-      binoffx = binoffx + 0;
+    } else if (binoffx>=0) {            // ROC 0
+      binoffx=binoffx+0;
 
-    } else {  // too small
-      assert("binoffx too small" == 0);
+    } else {                            // too small
+      assert("binoffx too small"==0);
     }
 
-    return std::make_tuple(binoffx, local_pitchx > m_pitchx);
+    return std::make_tuple(binoffx,local_pitchx>m_pitchx);
   }
 
-  std::tuple<int, bool> localYori(int mpy) {
-    const float m_pitchy = 1.f;
-    int binoffy = int(mpy);         // truncate to int
-    float local_pitchy = m_pitchy;  // defaultpitch
+  std::tuple<int,bool> localYori(int mpy) {
+    const float m_pitchy=1.f;
+    int binoffy = int(mpy);             // truncate to int
+    float local_pitchy = m_pitchy;      // defaultpitch
 
-    if (binoffy > 416) {  // ROC 8, not real ROC
-      binoffy = binoffy + 17;
-    } else if (binoffy == 416) {  // ROC 8
-      binoffy = binoffy + 16;
+    if (binoffy>416) {                  // ROC 8, not real ROC
+      binoffy=binoffy+17;
+    } else if (binoffy==416) {          // ROC 8
+      binoffy=binoffy+16;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 415) {  // ROC 7, last big pixel
-      binoffy = binoffy + 15;
+    } else if (binoffy==415) {          // ROC 7, last big pixel
+      binoffy=binoffy+15;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 364) {  // ROC 7
-      binoffy = binoffy + 15;
-    } else if (binoffy == 364) {  // ROC 7
-      binoffy = binoffy + 14;
+    } else if (binoffy>364) {           // ROC 7
+      binoffy=binoffy+15;
+    } else if (binoffy==364) {          // ROC 7
+      binoffy=binoffy+14;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 363) {  // ROC 6
-      binoffy = binoffy + 13;
+    } else if (binoffy==363) {          // ROC 6
+      binoffy=binoffy+13;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 312) {  // ROC 6
-      binoffy = binoffy + 13;
-    } else if (binoffy == 312) {  // ROC 6
-      binoffy = binoffy + 12;
+    } else if (binoffy>312) {           // ROC 6
+      binoffy=binoffy+13;
+    } else if (binoffy==312) {          // ROC 6
+      binoffy=binoffy+12;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 311) {  // ROC 5
-      binoffy = binoffy + 11;
+    } else if (binoffy==311) {          // ROC 5
+      binoffy=binoffy+11;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 260) {  // ROC 5
-      binoffy = binoffy + 11;
-    } else if (binoffy == 260) {  // ROC 5
-      binoffy = binoffy + 10;
+    } else if (binoffy>260) {           // ROC 5
+      binoffy=binoffy+11;
+    } else if (binoffy==260) {          // ROC 5
+      binoffy=binoffy+10;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 259) {  // ROC 4
-      binoffy = binoffy + 9;
+    } else if (binoffy==259) {          // ROC 4
+      binoffy=binoffy+9;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 208) {  // ROC 4
-      binoffy = binoffy + 9;
-    } else if (binoffy == 208) {  // ROC 4
-      binoffy = binoffy + 8;
+    } else if (binoffy>208) {           // ROC 4
+      binoffy=binoffy+9;
+    } else if (binoffy==208) {          // ROC 4
+      binoffy=binoffy+8;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 207) {  // ROC 3
-      binoffy = binoffy + 7;
+    } else if (binoffy==207)  {         // ROC 3
+      binoffy=binoffy+7;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 156) {  // ROC 3
-      binoffy = binoffy + 7;
-    } else if (binoffy == 156) {  // ROC 3
-      binoffy = binoffy + 6;
+    } else if (binoffy>156) {           // ROC 3
+      binoffy=binoffy+7;
+    } else if (binoffy==156) {          // ROC 3
+      binoffy=binoffy+6;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 155) {  // ROC 2
-      binoffy = binoffy + 5;
+    } else if (binoffy==155) {          // ROC 2
+      binoffy=binoffy+5;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 104) {  // ROC 2
-      binoffy = binoffy + 5;
-    } else if (binoffy == 104) {  // ROC 2
-      binoffy = binoffy + 4;
+    } else if (binoffy>104) {           // ROC 2
+      binoffy=binoffy+5;
+    } else if (binoffy==104) {          // ROC 2
+      binoffy=binoffy+4;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 103) {  // ROC 1
-      binoffy = binoffy + 3;
+    } else if (binoffy==103) {          // ROC 1
+      binoffy=binoffy+3;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 52) {  // ROC 1
-      binoffy = binoffy + 3;
-    } else if (binoffy == 52) {  // ROC 1
-      binoffy = binoffy + 2;
+    } else if (binoffy>52) {            // ROC 1
+      binoffy=binoffy+3;
+    } else if (binoffy==52) {           // ROC 1
+      binoffy=binoffy+2;
       local_pitchy = 2 * m_pitchy;
 
-    } else if (binoffy == 51) {  // ROC 0
-      binoffy = binoffy + 1;
+    } else if (binoffy==51) {           // ROC 0
+      binoffy=binoffy+1;
       local_pitchy = 2 * m_pitchy;
-    } else if (binoffy > 0) {  // ROC 0
-      binoffy = binoffy + 1;
-    } else if (binoffy == 0) {  // ROC 0
-      binoffy = binoffy + 0;
+    } else if (binoffy>0) {             // ROC 0
+      binoffy=binoffy+1;
+    } else if (binoffy==0) {            // ROC 0
+      binoffy=binoffy+0;
       local_pitchy = 2 * m_pitchy;
     } else {
-      assert("binoffy too small" == 0);
+      assert("binoffy too small"==0);
     }
 
-    return std::make_tuple(binoffy, local_pitchy > m_pitchy);
+    return std::make_tuple(binoffy,local_pitchy>m_pitchy);
   }
 
-}  // namespace
+}
 
 int main() {
-  for (uint16_t ix = 0; ix < 80 * 2; ++ix) {
+
+  for (uint16_t ix=0; ix<80*2; ++ix) {
     auto ori = localXori(ix);
     auto xl = phase1PixelTopology::localX(ix);
     auto bp = phase1PixelTopology::isBigPixX(ix);
-    if (std::get<0>(ori) != xl)
-      std::cout << "Error " << std::get<0>(ori) << "!=" << xl << std::endl;
-    assert(std::get<1>(ori) == bp);
+    if (std::get<0>(ori)!=xl) std::cout << "Error " << std::get<0>(ori) << "!=" << xl << std::endl;
+    assert(std::get<1>(ori)==bp);
   }
 
-  for (uint16_t iy = 0; iy < 52 * 8; ++iy) {
+  for (uint16_t iy=0; iy<52*8; ++iy) {
     auto ori = localYori(iy);
     auto yl = phase1PixelTopology::localY(iy);
     auto bp = phase1PixelTopology::isBigPixY(iy);
-    if (std::get<0>(ori) != yl)
-      std::cout << "Error " << std::get<0>(ori) << "!=" << yl << std::endl;
-    assert(std::get<1>(ori) == bp);
+    if (std::get<0>(ori)!=yl) std::cout << "Error " << std::get<0>(ori) << "!=" << yl << std::endl;
+    assert(std::get<1>(ori)==bp);
+  }
+
+  for (auto i = 0U; i < phase1PixelTopology::numberOfLayers; ++i) {
+    std::cout << "layer " << i << ", \"" << phase1PixelTopology::layerName[i] << "\", [" << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i+1] << ")" << std::endl;
+  }
+
+  for (auto i = 0U; i < phase1PixelTopology::numberOfModules; ++i) {
+    int layer = phase1PixelTopology::layer[i / phase1PixelTopology::maxModuleStride];
+    //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl;
+    assert(layer < 10);
+    assert(i >= phase1PixelTopology::layerStart[layer]);
+    assert(i < phase1PixelTopology::layerStart[layer+1]);
   }
 
   return 0;
diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
index 3cae176059b3b..a486a83d178f4 100644
--- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
+++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
@@ -9,11 +9,11 @@
 from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import *
 from RecoLocalTracker.SiStripZeroSuppression.SiStripZeroSuppression_cfi import *
 from RecoLocalTracker.SiStripClusterizer.SiStripClusterizer_cfi import *
-from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import *
+from RecoLocalTracker.SiPixelClusterizer.siPixelClustersPreSplitting_cff import *
 from RecoLocalTracker.SiPixelRecHits.SiPixelRecHits_cfi import *
 from RecoLocalTracker.SubCollectionProducers.clustersummaryproducer_cfi import *
 
-pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplitting,siPixelRecHitsPreSplitting)
+pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplitting)
 striptrackerlocalrecoTask = cms.Task(siStripZeroSuppression,siStripClusters,siStripMatchedRecHits)
 trackerlocalrecoTask = cms.Task(pixeltrackerlocalrecoTask,striptrackerlocalrecoTask,clusterSummaryProducer)
 
@@ -21,9 +21,12 @@
 striptrackerlocalreco = cms.Sequence(striptrackerlocalrecoTask)
 trackerlocalreco = cms.Sequence(trackerlocalrecoTask)
 
+from Configuration.ProcessModifiers.gpu_cff import gpu
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import siPixelRecHitHeterogeneous as _siPixelRecHitHeterogeneous
+gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneous)
+
 from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import *
 from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import *
-from RecoLocalTracker.SiPhase2VectorHitBuilder.siPhase2RecHitMatcher_cfi import *
 
 _pixeltrackerlocalrecoTask_phase2 = pixeltrackerlocalrecoTask.copy()
 _pixeltrackerlocalrecoTask_phase2.add(siPhase2Clusters)
diff --git a/RecoLocalTracker/SiPixelClusterizer/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/BuildFile.xml
new file mode 100644
index 0000000000000..74e76ab6ff3e2
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/BuildFile.xml
@@ -0,0 +1,9 @@
+<use   name="FWCore/Utilities"/>
+<use   name="CalibTracker/SiPixelESProducers"/>
+<use   name="HeterogeneousCore/CUDACore"/>
+<use   name="cuda"/>
+<use   name="cuda-api-wrappers"/>
+
+<export>
+    <lib name="1"/>
+</export>
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
index c7b16a6ef4ee2..40a489f763397 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/BuildFile.xml
@@ -1,8 +1,20 @@
-<use name="DataFormats/Common"/>
-<use name="FWCore/ParameterSet"/>
-<use name="DataFormats/SiPixelCluster"/>
-<use name="boost_serialization"/>
-<use name="CalibTracker/SiPixelESProducers"/>
-<library file="*.cc" name="RecoLocalTrackerSiPixelClusterizerPlugins">
-  <flags EDM_PLUGIN="1"/>
+<use   name="DataFormats/Common"/>
+<use   name="FWCore/ParameterSet"/>
+<use   name="DataFormats/SiPixelDetId"/>
+<use   name="DataFormats/SiPixelCluster"/>
+<use   name="boost_serialization"/>
+<use   name="RecoLocalTracker/SiPixelClusterizer"/>
+<use   name="RecoTracker/Record"/>
+<use   name="CalibTracker/SiPixelESProducers"/>
+<use   name="EventFilter/SiPixelRawToDigi"/>
+<use   name="CUDADataFormats/SiPixelDigi"/>
+<use   name="CUDADataFormats/SiPixelCluster"/>
+<use   name="HeterogeneousCore/Producer"/>
+<use   name="HeterogeneousCore/Product"/>
+<use   name="HeterogeneousCore/CUDACore"/>
+<use   name="cuda"/>
+<use   name="cuda-api-wrappers"/>
+<use   name="cub"/>
+<library   file="*.cc *.cu" name="RecoLocalTrackerSiPixelClusterizerPlugins">
+  <flags   EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterProducer.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterProducer.cc
index 15a6536ca644b..95d1e7475e33e 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterProducer.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterProducer.cc
@@ -17,7 +17,7 @@
 
 // Geometry
 #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
-#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h"
 
 // Data Formats
 #include "DataFormats/Common/interface/DetSetVector.h"
@@ -43,32 +43,36 @@
 // MessageLogger
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 
-//---------------------------------------------------------------------------
-//!  Constructor: set the ParameterSet and defer all thinking to setupClusterizer().
-//---------------------------------------------------------------------------
-SiPixelClusterProducer::SiPixelClusterProducer(edm::ParameterSet const& conf)
-    : tPutPixelClusters(produces<SiPixelClusterCollectionNew>()),
-      clusterMode_(conf.getParameter<std::string>("ClusterMode")),
-      maxTotalClusters_(conf.getParameter<int32_t>("maxNumberOfClusters")) {
-  if (clusterMode_ == "PixelThresholdReclusterizer")
-    tPixelClusters = consumes<SiPixelClusterCollectionNew>(conf.getParameter<edm::InputTag>("src"));
-  else
-    tPixelDigi = consumes<edm::DetSetVector<PixelDigi>>(conf.getParameter<edm::InputTag>("src"));
-
-  const auto& payloadType = conf.getParameter<std::string>("payloadType");
-  if (payloadType == "HLT")
-    theSiPixelGainCalibration_ = std::make_unique<SiPixelGainCalibrationForHLTService>(conf);
-  else if (payloadType == "Offline")
-    theSiPixelGainCalibration_ = std::make_unique<SiPixelGainCalibrationOfflineService>(conf);
-  else if (payloadType == "Full")
-    theSiPixelGainCalibration_ = std::make_unique<SiPixelGainCalibrationService>(conf);
-
-  //--- Make the algorithm(s) according to what the user specified
-  //--- in the ParameterSet.
-  setupClusterizer(conf);
-}
 
-// Destructor
+  //---------------------------------------------------------------------------
+  //!  Constructor: set the ParameterSet and defer all thinking to setupClusterizer().
+  //---------------------------------------------------------------------------
+  SiPixelClusterProducer::SiPixelClusterProducer(edm::ParameterSet const& conf) 
+    : 
+    tPutPixelClusters(produces<SiPixelClusterCollectionNew>()),
+    clusterMode_( conf.getParameter<std::string>("ClusterMode") ),
+    maxTotalClusters_( conf.getParameter<int32_t>( "maxNumberOfClusters" ) )
+  {
+    if ( clusterMode_ == "PixelThresholdReclusterizer" )
+      tPixelClusters = consumes<SiPixelClusterCollectionNew>( conf.getParameter<edm::InputTag>("src") );
+    else
+      tPixelDigi = consumes<edm::DetSetVector<PixelDigi>>( conf.getParameter<edm::InputTag>("src") );
+
+    const auto& payloadType = conf.getParameter<std::string>( "payloadType" );
+    if (payloadType == "HLT")
+        theSiPixelGainCalibration_ = std::make_unique<SiPixelGainCalibrationForHLTService>(conf);
+    else if (payloadType == "Offline")
+        theSiPixelGainCalibration_ = std::make_unique<SiPixelGainCalibrationOfflineService>(conf);
+    else if (payloadType == "Full")
+        theSiPixelGainCalibration_ = std::make_unique<SiPixelGainCalibrationService>(conf);
+
+    //--- Make the algorithm(s) according to what the user specified
+    //--- in the ParameterSet.
+    setupClusterizer(conf);
+
+  }
+
+  // Destructor
 SiPixelClusterProducer::~SiPixelClusterProducer() = default;
 
 void SiPixelClusterProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -77,129 +81,138 @@ void SiPixelClusterProducer::fillDescriptions(edm::ConfigurationDescriptions& de
   desc.add<edm::InputTag>("src", edm::InputTag("siPixelDigis"));
   desc.add<std::string>("ClusterMode", "PixelThresholdClusterizer");
   desc.add<int>("maxNumberOfClusters", -1)->setComment("-1 means no limit");
-  desc.add<std::string>("payloadType", "Offline")
-      ->setComment("Options: HLT - column granularity, Offline - gain:col/ped:pix");
+  desc.add<std::string>("payloadType", "Offline")->setComment("Options: HLT - column granularity, Offline - gain:col/ped:pix");
 
   PixelThresholdClusterizer::fillPSetDescription(desc);
-  SiPixelGainCalibrationServiceBase::fillPSetDescription(desc);  // no-op, but in principle the structures are there...
+  SiPixelGainCalibrationServiceBase::fillPSetDescription(desc); // no-op, but in principle the structures are there...
 
   descriptions.add("SiPixelClusterizerDefault", desc);
 }
 
-//---------------------------------------------------------------------------
-//! The "Event" entrypoint: gets called by framework for every event
-//---------------------------------------------------------------------------
-void SiPixelClusterProducer::produce(edm::Event& e, const edm::EventSetup& es) {
-  //Setup gain calibration service
-  theSiPixelGainCalibration_->setESObjects(es);
-
-  // Step A.1: get input data
-  edm::Handle<SiPixelClusterCollectionNew> inputClusters;
-  edm::Handle<edm::DetSetVector<PixelDigi>> inputDigi;
-  if (clusterMode_ == "PixelThresholdReclusterizer")
-    e.getByToken(tPixelClusters, inputClusters);
-  else
-    e.getByToken(tPixelDigi, inputDigi);
-
-  // Step A.2: get event setup
-  edm::ESHandle<TrackerGeometry> geom;
-  es.get<TrackerDigiGeometryRecord>().get(geom);
-
-  edm::ESHandle<TrackerTopology> trackerTopologyHandle;
-  es.get<TrackerTopologyRcd>().get(trackerTopologyHandle);
-  tTopo_ = trackerTopologyHandle.product();
-
-  // Step B: create the final output collection
-  auto output = std::make_unique<SiPixelClusterCollectionNew>();
-  //FIXME: put a reserve() here
-
-  // Step C: Iterate over DetIds and invoke the pixel clusterizer algorithm
-  // on each DetUnit
-  if (clusterMode_ == "PixelThresholdReclusterizer")
-    run(*inputClusters, geom, *output);
-  else
-    run(*inputDigi, geom, *output);
-
-  // Step D: write output to file
-  output->shrink_to_fit();
-  e.put(tPutPixelClusters, std::move(output));
-}
+  
+  //---------------------------------------------------------------------------
+  //! The "Event" entrypoint: gets called by framework for every event
+  //---------------------------------------------------------------------------
+  void SiPixelClusterProducer::produce(edm::Event& e, const edm::EventSetup& es)
+  {
+
+    //Setup gain calibration service
+    theSiPixelGainCalibration_->setESObjects( es );
+
+    // Step A.1: get input data
+    edm::Handle< SiPixelClusterCollectionNew >   inputClusters;
+    edm::Handle< edm::DetSetVector<PixelDigi> >  inputDigi;
+    if ( clusterMode_ == "PixelThresholdReclusterizer" )
+      e.getByToken(tPixelClusters, inputClusters);
+    else
+      e.getByToken(tPixelDigi, inputDigi);
+
+    // Step A.2: get event setup
+    edm::ESHandle<TrackerGeometry> geom;
+    es.get<TrackerDigiGeometryRecord>().get( geom );
+
+    edm::ESHandle<TrackerTopology> trackerTopologyHandle;
+    es.get<TrackerTopologyRcd>().get(trackerTopologyHandle);
+    tTopo_ = trackerTopologyHandle.product();
+
+    // Step B: create the final output collection
+    auto output = std::make_unique< SiPixelClusterCollectionNew>();
+    //FIXME: put a reserve() here
+
+    // Step C: Iterate over DetIds and invoke the pixel clusterizer algorithm
+    // on each DetUnit
+    if ( clusterMode_ == "PixelThresholdReclusterizer" )
+      run(*inputClusters, geom, *output );
+    else
+      run(*inputDigi, geom, *output );
+
+    // Step D: write output to file
+    output->shrink_to_fit();
+    e.put(tPutPixelClusters, std::move(output));
 
-//---------------------------------------------------------------------------
-//!  Set up the specific algorithm we are going to use.
-//!  TO DO: in the future, we should allow for a different algorithm for
-//!  each detector subset (e.g. barrel vs forward, per layer, etc).
-//---------------------------------------------------------------------------
-void SiPixelClusterProducer::setupClusterizer(const edm::ParameterSet& conf) {
-  if (clusterMode_ == "PixelThresholdReclusterizer" || clusterMode_ == "PixelThresholdClusterizer") {
-    clusterizer_ = std::make_unique<PixelThresholdClusterizer>(conf);
-    clusterizer_->setSiPixelGainCalibrationService(theSiPixelGainCalibration_.get());
-  } else {
-    throw cms::Exception("Configuration") << "[SiPixelClusterProducer]:"
-                                          << " choice " << clusterMode_ << " is invalid.\n"
-                                          << "Possible choices:\n"
-                                          << "    PixelThresholdClusterizer";
   }
-}
 
-//---------------------------------------------------------------------------
-//!  Iterate over DetUnits, and invoke the PixelClusterizer on each.
-//---------------------------------------------------------------------------
-template <typename T>
-void SiPixelClusterProducer::run(const T& input,
-                                 const edm::ESHandle<TrackerGeometry>& geom,
-                                 edmNew::DetSetVector<SiPixelCluster>& output) {
-  int numberOfDetUnits = 0;
-  int numberOfClusters = 0;
-
-  // Iterate on detector units
-  typename T::const_iterator DSViter = input.begin();
-  for (; DSViter != input.end(); DSViter++) {
-    ++numberOfDetUnits;
-
-    //  LogDebug takes very long time, get rid off.
-    //LogDebug("SiStripClusterizer") << "[SiPixelClusterProducer::run] DetID" << DSViter->id;
-
-    std::vector<short> badChannels;
-    DetId detIdObject(DSViter->detId());
-
-    // Comment: At the moment the clusterizer depends on geometry
-    // to access information as the pixel topology (number of columns
-    // and rows in a detector module).
-    // In the future the geometry service will be replaced with
-    // a ES service.
-    const GeomDetUnit* geoUnit = geom->idToDetUnit(detIdObject);
-    const PixelGeomDetUnit* pixDet = dynamic_cast<const PixelGeomDetUnit*>(geoUnit);
-    if (!pixDet) {
-      // Fatal error!  TO DO: throw an exception!
-      assert(0);
+  //---------------------------------------------------------------------------
+  //!  Set up the specific algorithm we are going to use.  
+  //!  TO DO: in the future, we should allow for a different algorithm for 
+  //!  each detector subset (e.g. barrel vs forward, per layer, etc).
+  //---------------------------------------------------------------------------
+  void SiPixelClusterProducer::setupClusterizer(const edm::ParameterSet& conf)  {
+
+    if ( clusterMode_ == "PixelThresholdReclusterizer" || clusterMode_ == "PixelThresholdClusterizer" ) {
+      clusterizer_ = std::make_unique<PixelThresholdClusterizer>(conf);
+      clusterizer_->setSiPixelGainCalibrationService(theSiPixelGainCalibration_.get());
+    } 
+    else {
+      throw cms::Exception("Configuration") << "[SiPixelClusterProducer]:"
+		<<" choice " << clusterMode_ << " is invalid.\n"
+		<< "Possible choices:\n" 
+		<< "    PixelThresholdClusterizer";
     }
-    {
-      // Produce clusters for this DetUnit and store them in
+  }
+
+
+  //---------------------------------------------------------------------------
+  //!  Iterate over DetUnits, and invoke the PixelClusterizer on each.
+  //---------------------------------------------------------------------------
+  template<typename T>
+  void SiPixelClusterProducer::run(const T                              & input, 
+                                   const edm::ESHandle<TrackerGeometry> & geom,
+                                   edmNew::DetSetVector<SiPixelCluster> & output) {
+    int numberOfDetUnits = 0;
+    int numberOfClusters = 0;
+ 
+    // Iterate on detector units
+    typename T::const_iterator DSViter = input.begin();
+    for( ; DSViter != input.end(); DSViter++) {
+      ++numberOfDetUnits;
+
+      //  LogDebug takes very long time, get rid off.
+      //LogDebug("SiStripClusterizer") << "[SiPixelClusterProducer::run] DetID" << DSViter->id;
+
+      std::vector<short> badChannels; 
+      DetId detIdObject(DSViter->detId());
+      
+      // Comment: At the moment the clusterizer depends on geometry
+      // to access information as the pixel topology (number of columns
+      // and rows in a detector module). 
+      // In the future the geometry service will be replaced with
+      // a ES service.
+      const GeomDetUnit      * geoUnit = geom->idToDetUnit( detIdObject );
+      const PixelGeomDetUnit * pixDet  = dynamic_cast<const PixelGeomDetUnit*>(geoUnit);
+      if (! pixDet) {
+	// Fatal error!  TO DO: throw an exception!
+	assert(0);
+      }
+      {
+      // Produce clusters for this DetUnit and store them in 
       // a DetSet
       edmNew::DetSetVector<SiPixelCluster>::FastFiller spc(output, DSViter->detId());
       clusterizer_->clusterizeDetUnit(*DSViter, pixDet, tTopo_, badChannels, spc);
-      if (spc.empty()) {
+      if ( spc.empty() ) {
         spc.abort();
       } else {
-        numberOfClusters += spc.size();
+	numberOfClusters += spc.size();
       }
-    }  // spc is not deleted and detsetvector updated
-    if ((maxTotalClusters_ >= 0) && (numberOfClusters > maxTotalClusters_)) {
-      edm::LogError("TooManyClusters")
-          << "Limit on the number of clusters exceeded. An empty cluster collection will be produced instead.\n";
-      edmNew::DetSetVector<SiPixelCluster> empty;
-      empty.swap(output);
-      break;
-    }
-  }  // end of DetUnit loop
+      } // spc is not deleted and detsetvector updated
+      if ((maxTotalClusters_ >= 0) && (numberOfClusters > maxTotalClusters_)) {
+        edm::LogError("TooManyClusters") <<  "Limit on the number of clusters exceeded. An empty cluster collection will be produced instead.\n";
+        edmNew::DetSetVector<SiPixelCluster> empty;
+        empty.swap(output);
+        break;
+      }
+    } // end of DetUnit loop
+    
+    //LogDebug ("SiPixelClusterProducer") << " Executing " 
+    //      << clusterMode_ << " resulted in " << numberOfClusters
+    //				    << " SiPixelClusters in " << numberOfDetUnits << " DetUnits."; 
+  }
+
+
 
-  //LogDebug ("SiPixelClusterProducer") << " Executing "
-  //      << clusterMode_ << " resulted in " << numberOfClusters
-  //      << " SiPixelClusters in " << numberOfDetUnits << " DetUnits.";
-}
 
 #include "FWCore/PluginManager/interface/ModuleDef.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 
 DEFINE_FWK_MODULE(SiPixelClusterProducer);
+
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc
new file mode 100644
index 0000000000000..4c405a8c85afd
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc
@@ -0,0 +1,158 @@
+#include "DataFormats/Common/interface/DetSetVector.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "DataFormats/DetId/interface/DetId.h"
+#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
+#include "DataFormats/SiPixelDigi/interface/PixelDigi.h"
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+
+namespace {
+  // Fixed-capacity scratch buffer that accumulates the pixels (adc, row, column) of a single cluster.
+  struct AccretionCluster {
+    using UShort = unsigned short;
+    static constexpr UShort MAXSIZE = 256;  // capacity of the arrays; add() refuses pixels beyond it
+    UShort adc[MAXSIZE], x[MAXSIZE], y[MAXSIZE];  // per-pixel adc value, row and column
+    UShort xmin=16000, ymin=16000;  // smallest row / column added so far
+    unsigned int isize=0;  // number of pixels currently stored
+    int charge=0;  // running sum of the stored adc values
+
+    void clear() {  // forget the stored pixels and restore the initial minima
+      xmin = ymin = 16000;
+      charge=0;
+      isize=0;
+    }
+
+    // Appends pixel p with value iadc; returns false, storing nothing, once MAXSIZE pixels are held.
+    bool add(SiPixelCluster::PixelPos const & p, UShort const iadc) {
+      if (isize==MAXSIZE) return false;
+      const UShort row = p.row();
+      const UShort col = p.col();
+      xmin=std::min(xmin,row);
+      ymin=std::min(ymin,col);
+      adc[isize]=iadc;
+      x[isize]=row;
+      y[isize++]=col;
+      charge+=iadc;
+      return true;
+    }
+  };
+
+  constexpr uint32_t dummydetid = 0xffffffff;
+}
+
+class SiPixelDigisClustersFromSoA: public edm::global::EDProducer<> {
+  // Global EDProducer reading SiPixelDigisSoA and producing DetSetVector<PixelDigi> plus SiPixelClusterCollectionNew.
+public:
+  explicit SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig);
+  ~SiPixelDigisClustersFromSoA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+  // input token: digis in SoA format, taken from the "src" parameter
+  edm::EDGetTokenT<SiPixelDigisSoA> digiGetToken_;
+  // output tokens: the per-detector digi collection and the cluster collection
+  edm::EDPutTokenT<edm::DetSetVector<PixelDigi>> digiPutToken_;
+  edm::EDPutTokenT<SiPixelClusterCollectionNew> clusterPutToken_;
+};
+
+// Declares the consumed SoA digis (parameter "src") and the two produced collections.
+SiPixelDigisClustersFromSoA::SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig)
+    : digiGetToken_(consumes<SiPixelDigisSoA>(iConfig.getParameter<edm::InputTag>("src"))),
+      digiPutToken_(produces<edm::DetSetVector<PixelDigi>>()),
+      clusterPutToken_(produces<SiPixelClusterCollectionNew>()) {}
+
+void SiPixelDigisClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psetDesc;  // only parameter: "src", the InputTag of the SoA digis
+  psetDesc.add<edm::InputTag>("src", edm::InputTag("siPixelDigisSoA"));
+  descriptions.addWithDefaultLabel(psetDesc);
+}
+
+void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  // Converts the SoA digis read through digiGetToken_ into a per-detector DetSetVector<PixelDigi> and builds
+  // SiPixelClusters from the cluster index stored with each digi; both collections are put into the event.
+  const auto& digis = iEvent.get(digiGetToken_);
+  edm::ESHandle<TrackerTopology> trackerTopologyHandle;
+  iSetup.get<TrackerTopologyRcd>().get(trackerTopologyHandle);
+  const auto& ttopo = *trackerTopologyHandle;
+
+  auto collection = std::make_unique<edm::DetSetVector<PixelDigi>>();
+  auto outputClusters = std::make_unique<SiPixelClusterCollectionNew>();
+
+  // Open the DetSet of the first digi with a non-zero pdigi; detDigis stays null if there is no such digi.
+  const uint32_t nDigis = digis.size();
+  edm::DetSet<PixelDigi> * detDigis=nullptr;
+  for (uint32_t i = 0; i < nDigis; i++) {
+    if (digis.pdigi(i)==0) continue;
+    detDigis = &collection->find_or_insert(digis.rawIdArr(i));
+    if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations
+    break;
+  }
+
+  int32_t nclus=-1;  // highest cluster index seen on the current detector, -1 when nothing is pending
+  std::vector<AccretionCluster> aclusters(1024);
+  auto totCluseFilled=0;
+
+  // Turns the pending accretion clusters into SiPixelClusters of detector detId, then resets them.
+  auto fillClusters = [&](uint32_t detId){
+    if (nclus<0) return; // this in reality should never happen
+    edmNew::DetSetVector<SiPixelCluster>::FastFiller spc(*outputClusters, detId);
+    auto layer = (DetId(detId).subdetId()==1) ? ttopo.pxbLayer(detId) : 0;
+    auto clusterThreshold = (layer==1) ? 2000 : 4000;
+    for (int32_t ic=0; ic<nclus+1;++ic) {
+      auto const & acluster = aclusters[ic];
+      if ( acluster.charge < clusterThreshold) continue;
+      SiPixelCluster cluster(acluster.isize,acluster.adc, acluster.x,acluster.y, acluster.xmin,acluster.ymin);
+      ++totCluseFilled;
+      // keep spc a heap keyed on minPixelRow; sort_heap below turns it into ascending row (x) order
+      spc.push_back( std::move(cluster) );
+      std::push_heap(spc.begin(),spc.end(),[](SiPixelCluster const & cl1,SiPixelCluster const & cl2) { return cl1.minPixelRow() < cl2.minPixelRow();});
+    }
+    for (int32_t ic=0; ic<nclus+1;++ic) aclusters[ic].clear();
+    nclus = -1;
+    // sort by row (x)
+    std::sort_heap(spc.begin(),spc.end(),[](SiPixelCluster const & cl1,SiPixelCluster const & cl2) { return cl1.minPixelRow() < cl2.minPixelRow();});
+    if ( spc.empty() ) spc.abort();
+  };
+
+  for (uint32_t i = 0; i < nDigis; i++) {
+    if (digis.pdigi(i)==0) continue;
+    if (digis.clus(i)>9000) continue; // not in cluster; TODO add an assert for the size
+    assert(digis.rawIdArr(i) > 109999);
+    // a change of detector id closes the clusters of the previous detector and opens a new DetSet
+    if ( (*detDigis).detId() != digis.rawIdArr(i)) {
+      fillClusters((*detDigis).detId());
+      assert(nclus==-1);
+      detDigis = &collection->find_or_insert(digis.rawIdArr(i));
+      if ( (*detDigis).empty() )
+        (*detDigis).data.reserve(32); // avoid the first relocations
+      else { std::cout << "Problem det present twice in input! " << (*detDigis).detId() << std::endl; }
+    }
+    (*detDigis).data.emplace_back(digis.pdigi(i));
+    auto const & dig = (*detDigis).data.back();
+    // fill clusters
+    assert(digis.clus(i)>=0);
+    assert(digis.clus(i)<1024);
+    nclus = std::max(digis.clus(i),nclus);
+    SiPixelCluster::PixelPos pix(dig.row(),dig.column());
+    aclusters[digis.clus(i)].add(pix, digis.adc(i));
+  }
+
+  // fill final clusters; detDigis is still null when no digi had a non-zero pdigi, so guard the dereference
+  if (detDigis) fillClusters((*detDigis).detId());
+  //std::cout << "filled " << totCluseFilled << " clusters" << std::endl;
+
+  iEvent.put(digiPutToken_, std::move(collection));
+  iEvent.put(clusterPutToken_, std::move(outputClusters));
+}
+
+DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoA);
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
new file mode 100644
index 0000000000000..5dc04009f4832
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
@@ -0,0 +1,243 @@
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
+#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h"
+#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h"
+#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h"
+#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
+#include "DataFormats/FEDRawData/interface/FEDRawData.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h"
+#include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/ESWatcher.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
+#include "RecoTracker/Record/interface/CkfComponentsRecord.h"
+
+#include "SiPixelRawToClusterGPUKernel.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+// Stream producer that reads the pixel FED raw data and runs the GPU
+// raw-to-digi and clustering chain on it.
+// It uses the ExternalWork extension: acquire() queues the asynchronous GPU
+// work, produce() puts the resulting products into the event.
+class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig);
+  ~SiPixelRawToClusterCUDA() override = default;
+
+  // describes the configuration parameters and their default values
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  // checks the raw data on the host and launches the GPU algorithm
+  void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  // collects the results of the GPU algorithm and puts them into the event
+  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+
+  // input: the raw data collection selected by the "InputLabel" parameter
+  edm::EDGetTokenT<FEDRawDataCollection> rawGetToken_;
+
+  // outputs; digiErrorPutToken_ is registered only when includeErrors_ is true
+  edm::EDPutTokenT<CUDAProduct<SiPixelDigisCUDA>> digiPutToken_;
+  edm::EDPutTokenT<CUDAProduct<SiPixelDigiErrorsCUDA>> digiErrorPutToken_;
+  edm::EDPutTokenT<CUDAProduct<SiPixelClustersCUDA>> clusterPutToken_;
+
+  // context saved at the end of acquire() and picked up again by produce()
+  CUDAContextToken ctxTmp_;
+
+  // tells acquire() when the cabling map record has changed
+  edm::ESWatcher<SiPixelFedCablingMapRcd> recordWatcher;
+
+  // cabling information, refreshed by acquire() whenever recordWatcher fires
+  std::string cablingMapLabel_;
+  std::unique_ptr<SiPixelFedCablingTree> cabling_;
+  std::vector<unsigned int> fedIds_;
+  // NOTE(review): assigned in acquire() from a transient handle and not read
+  // anywhere in the code visible here
+  const SiPixelFedCablingMap *cablingMap_ = nullptr;
+  // set only when the "Regions" PSet is not empty (regional unpacking)
+  std::unique_ptr<PixelUnpackingRegions> regions_;
+
+  // the GPU algorithm, and the errors found by the host-side checks of acquire()
+  pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
+  PixelDataFormatter::Errors errors_;
+
+  // configuration flags, fixed at construction
+  const bool includeErrors_;
+  const bool useQuality_;
+  const bool usePilotBlade_;
+  const bool convertADCtoElectrons_;
+};
+
+// Reads the module configuration and registers the consumed and produced
+// products. The error product is declared only when IncludeErrors is set, and
+// the regional unpacking is set up only for a non-empty "Regions" PSet.
+SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig)
+  : rawGetToken_(consumes<FEDRawDataCollection>(iConfig.getParameter<edm::InputTag>("InputLabel"))),
+    digiPutToken_(produces<CUDAProduct<SiPixelDigisCUDA>>()),
+    clusterPutToken_(produces<CUDAProduct<SiPixelClustersCUDA>>()),
+    cablingMapLabel_(iConfig.getParameter<std::string>("CablingMapLabel")),
+    includeErrors_(iConfig.getParameter<bool>("IncludeErrors")),
+    useQuality_(iConfig.getParameter<bool>("UseQualityInfo")),
+    // controls the usage of the pilot-blade data, FED=40
+    usePilotBlade_(iConfig.getParameter<bool> ("UsePilotBlade")),
+    convertADCtoElectrons_(iConfig.getParameter<bool>("ConvertADCtoElectrons"))
+{
+  if (includeErrors_) {
+    digiErrorPutToken_ = produces<CUDAProduct<SiPixelDigiErrorsCUDA>>();
+  }
+
+  // an empty "Regions" PSet means that everything is unpacked
+  const bool regionalUnpacking = !iConfig.getParameter<edm::ParameterSet>("Regions").getParameterNames().empty();
+  if (regionalUnpacking) {
+    regions_ = std::make_unique<PixelUnpackingRegions>(iConfig, consumesCollector());
+  }
+
+  if (usePilotBlade_) {
+    edm::LogInfo("SiPixelRawToCluster")  << " Use pilot blade data (FED 40)";
+  }
+}
+
+// Declares the configuration parameters of the module, with their default
+// values, and registers the description under the default module label.
+void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription moduleDesc;
+
+  moduleDesc.add<bool>("IncludeErrors", true);
+  moduleDesc.add<bool>("UseQualityInfo", false);
+  moduleDesc.add<bool>("UsePilotBlade", false)->setComment("##  Use pilot blades");
+  moduleDesc.add<bool>("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering");
+  moduleDesc.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
+
+  // nested PSet for the regional unpacking: all of its entries are optional
+  edm::ParameterSetDescription regionsDesc;
+  regionsDesc.addOptional<std::vector<edm::InputTag>>("inputs");
+  regionsDesc.addOptional<std::vector<double>>("deltaPhi");
+  regionsDesc.addOptional<std::vector<double>>("maxZ");
+  regionsDesc.addOptional<edm::InputTag>("beamSpot");
+  moduleDesc.add<edm::ParameterSetDescription>("Regions", regionsDesc)->setComment("## Empty Regions PSet means complete unpacking");
+
+  moduleDesc.add<std::string>("CablingMapLabel", "")->setComment("CablingMap label"); //Tav
+
+  descriptions.addWithDefaultLabel(moduleDesc);
+}
+
+
+// First half of the ExternalWork pair.
+// Fetches the conditions needed on the GPU, runs the host-side CRC, header and
+// trailer checks on every selected FED, appends the payload words of the FEDs
+// to the WordFedAppender, and finally launches the GPU algorithm on the
+// stream of the CUDA context. The context is saved in ctxTmp_ for produce().
+void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  edm::ESHandle<SiPixelFedCablingMapGPUWrapper> hgpuMap;
+  iSetup.get<CkfComponentsRecord>().get(hgpuMap);
+  // the module and the cabling map wrapper must agree on the use of the quality information
+  if(hgpuMap->hasQuality() != useQuality_) {
+    throw cms::Exception("LogicError") << "UseQuality of the module (" << useQuality_ << ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration.";
+  }
+  // get the GPU product already here so that the async transfer can begin
+  const auto *gpuMap = hgpuMap->getGPUProductAsync(ctx.stream());
+
+  edm::ESHandle<SiPixelGainCalibrationForHLTGPU> hgains;
+  iSetup.get<SiPixelGainCalibrationForHLTGPURcd>().get(hgains);
+  // get the GPU product already here so that the async transfer can begin
+  const auto *gpuGains = hgains->getGPUProductAsync(ctx.stream());
+
+  // gpuModulesToUnpack points either to the per-event regional list, owned by
+  // modulesToUnpackRegional, or to the list obtained from getModToUnpAllAsync()
+  // NOTE(review): modulesToUnpackRegional is a local, released when acquire()
+  // returns, while the work launched below is asynchronous - confirm that the
+  // allocator keeps the buffer valid until the kernels have run
+  cudautils::device::unique_ptr<unsigned char[]> modulesToUnpackRegional;
+  const unsigned char *gpuModulesToUnpack;
+
+  if(regions_) {
+    regions_->run(iEvent, iSetup);
+    LogDebug("SiPixelRawToCluster") << "region2unpack #feds: "<<regions_->nFEDs();
+    LogDebug("SiPixelRawToCluster") << "region2unpack #modules (BPIX,EPIX,total): "<<regions_->nBarrelModules()<<" "<<regions_->nForwardModules()<<" "<<regions_->nModules();
+    modulesToUnpackRegional = hgpuMap->getModToUnpRegionalAsync(*(regions_->modulesToUnpack()), ctx.stream());
+    gpuModulesToUnpack = modulesToUnpackRegional.get();
+  }
+  else {
+    gpuModulesToUnpack = hgpuMap->getModToUnpAllAsync(ctx.stream());
+  }
+
+  // initialize cabling map or update if necessary
+  if (recordWatcher.check(iSetup)) {
+    // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel)
+    edm::ESTransientHandle<SiPixelFedCablingMap> cablingMap;
+    iSetup.get<SiPixelFedCablingMapRcd>().get(cablingMapLabel_, cablingMap); //Tav
+    cablingMap_ = cablingMap.product();
+    fedIds_  = cablingMap->fedIds();
+    cabling_ = cablingMap->cablingTree();
+    LogDebug("map version:")<< cabling_->version();
+  }
+
+  const auto& buffers = iEvent.get(rawGetToken_);
+
+  // start from an empty error collection (it is moved out at the end of each call)
+  errors_.clear();
+
+  // GPU specific: Data extraction for RawToDigi GPU
+  // wordCounterGPU: number of 32-bit payload words appended so far
+  // fedCounter: number of FEDs that passed the selection below
+  unsigned int wordCounterGPU = 0;
+  unsigned int fedCounter = 0;
+  bool errorsInEvent = false;
+
+  // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
+  ErrorChecker errorcheck;
+  auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream());
+  for(int fedId: fedIds_) {
+    if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data
+    if (regions_ && !regions_->mayUnpackFED(fedId)) continue;
+
+    // for GPU
+    // the FED id is stored as (fedId - 1200) in a single byte, see
+    // WordFedAppender::initializeWordFed()
+    assert(fedId>=1200);
+    // note: the FED is counted even if it turns out to be empty or fails the CRC check below
+    fedCounter++;
+
+    // get event data for this fed
+    const FEDRawData& rawData = buffers.FEDData( fedId );
+
+    // GPU specific
+    // size of the FED data in units of 64-bit words
+    int nWords = rawData.size()/sizeof(cms_uint64_t);
+    if (nWords == 0) {
+      continue;
+    }
+
+    // check CRC bit
+    // (the check is done on the last 64-bit word; FEDs that fail it are skipped)
+    const cms_uint64_t* trailer = reinterpret_cast<const cms_uint64_t* >(rawData.data())+(nWords-1);
+    if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) {
+      continue;
+    }
+
+    // check headers
+    // start one word before the data, since the loop increments the pointer
+    // before each check; it goes on for as long as checkHeader() returns true,
+    // and leaves "header" on the last word that was checked
+    const cms_uint64_t* header = reinterpret_cast<const cms_uint64_t* >(rawData.data()); header--;
+    bool moreHeaders = true;
+    while (moreHeaders) {
+      header++;
+      bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_);
+      moreHeaders = headerStatus;
+    }
+
+    // check trailers
+    // same as for the headers, but moving backwards from the last word
+    bool moreTrailers = true;
+    trailer++;
+    while (moreTrailers) {
+      trailer--;
+      bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_);
+      moreTrailers = trailerStatus;
+    }
+
+    // the payload, seen as 32-bit words: from the word after the last checked
+    // header up to, and not including, the last checked trailer
+    const cms_uint32_t * bw = (const cms_uint32_t *)(header+1);
+    const cms_uint32_t * ew = (const cms_uint32_t *)(trailer);
+
+    // the payload is made of 64-bit words, so the number of 32-bit words is even
+    assert(0 == (ew-bw)%2);
+    wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
+    wordCounterGPU+=(ew-bw);
+
+  } // end of for loop
+
+  // launch the GPU algorithm on the stream of the context; errors_ is moved
+  // into it, and the results are collected in produce()
+  gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains,
+                             wordFedAppender,
+                             std::move(errors_),
+                             wordCounterGPU, fedCounter, convertADCtoElectrons_,
+                             useQuality_, includeErrors_,
+                             edm::MessageDrop::instance()->debugEnabled,
+                             ctx.stream());
+
+  // keep the context for produce()
+  ctxTmp_ = ctx.toToken();
+}
+
+// Second half of the ExternalWork pair: restores the context saved by
+// acquire() and puts the digis, the clusters and, if requested, the digi
+// errors produced by the GPU algorithm into the event.
+void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  CUDAScopedContext ctx{std::move(ctxTmp_)};
+
+  // first: digis, second: clusters
+  auto digisAndClusters = gpuAlgo_.getResults();
+  ctx.emplace(iEvent, digiPutToken_, std::move(digisAndClusters.first));
+  ctx.emplace(iEvent, clusterPutToken_, std::move(digisAndClusters.second));
+
+  // the error product exists only if it was requested in the configuration
+  if (!includeErrors_) {
+    return;
+  }
+  ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors());
+}
+
+// define SiPixelRawToClusterCUDA as a framework plugin
+DEFINE_FWK_MODULE(SiPixelRawToClusterCUDA);
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
new file mode 100644
index 0000000000000..fead8e59a0db3
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -0,0 +1,640 @@
+/* Sushil Dubey, Shashi Dugad, TIFR, July 2017
+ *
+ * File Name: RawToClusterGPU.cu
+ * Description: It converts the raw data into the digi format on the GPU,
+ * then it converts adc -> electrons and
+ * applies the adc threshold needed for the clustering.
+ * Finally the output of RawToDigi is given to the pixel clusterizer.
+ *
+**/
+
+// C++ includes
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+// CUDA includes
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+#include <thrust/execution_policy.h>
+
+// cub includes
+#include <cub/cub.cuh>
+
+// CMSSW includes
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
+#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h"
+
+// local includes
+#include "SiPixelRawToClusterGPUKernel.h"
+
+namespace pixelgpudetails {
+
+  // upper limit on the number of words, summed over all the FEDs
+  constexpr uint32_t MAX_FED_WORDS = MAX_FED * MAX_WORD;
+
+  // Allocates, through the CUDAService, the two host buffers that collect the
+  // words and the FED ids; both are sized for MAX_FED_WORDS entries.
+  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
+    edm::Service<CUDAService> cudaService;
+
+    word_  = cudaService->make_host_unique<unsigned int[]>(MAX_FED_WORDS, cudaStream);
+    fedId_ = cudaService->make_host_unique<unsigned char[]>(MAX_FED_WORDS, cudaStream);
+  }
+
+  // Appends the payload of one FED to the host buffers.
+  //   fedId          - FED number; it is stored as (fedId - 1200) in a single byte
+  //   wordCounterGPU - number of 32-bit words stored so far, i.e. the position
+  //                    at which the words of this FED are written
+  //   src, length    - the 32-bit words to copy, and how many they are
+  // One FED id is stored for each pair of words, hence the position and the
+  // count divided by two in the second buffer.
+  void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
+    // both buffers hold MAX_FED_WORDS entries: never write past their end
+    // (the check is written so that it cannot overflow)
+    assert(length <= MAX_FED_WORDS and wordCounterGPU <= MAX_FED_WORDS - length);
+    // the value given to memset is truncated to an unsigned char
+    assert(fedId >= 1200 and fedId - 1200 < 256);
+
+    std::memcpy(word_.get()+wordCounterGPU, src, sizeof(cms_uint32_t)*length);
+    std::memset(fedId_.get()+wordCounterGPU/2, fedId - 1200, length/2);
+  }
+
+  ////////////////////
+
+  // Extracts the link field from a 32-bit raw data word.
+  __device__ uint32_t getLink(uint32_t ww)  {
+    uint32_t const shifted = ww >> LINK_shift;
+    return shifted & LINK_mask;
+  }
+
+
+  // Extracts the ROC field from a 32-bit raw data word.
+  __device__ uint32_t getRoc(uint32_t ww) {
+    uint32_t const shifted = ww >> ROC_shift;
+    return shifted & ROC_mask;
+  }
+
+
+  // Extracts the ADC field from a 32-bit raw data word.
+  __device__ uint32_t getADC(uint32_t ww) {
+    uint32_t const shifted = ww >> ADC_shift;
+    return shifted & ADC_mask;
+  }
+
+
+  // Tells whether a raw detector id belongs to the barrel: the 3-bit field
+  // that starts at bit 25 has to be equal to 1.
+  __device__ bool isBarrel(uint32_t rawId) {
+    uint32_t const field = (rawId >> 25) & 0x7;
+    return field == 1;
+  }
+
+  // Reads from the cabling map the raw id, the ROC number within the detector
+  // unit and the module id that correspond to a (fed, link, roc) address.
+  // The links are counted from 1 in the index computation.
+  __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelFedCablingMapGPU * cablingMap, uint8_t fed, uint32_t link, uint32_t roc) {
+    uint32_t const mapIndex = fed * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + roc;
+    return pixelgpudetails::DetIdGPU{ cablingMap->RawId[mapIndex], cablingMap->rocInDet[mapIndex], cablingMap->moduleId[mapIndex] };
+  }
+
+  // references:
+  //   http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html
+  //   http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071
+  //
+  // Converts a local pixel (row and column inside a ROC) to a global pixel:
+  //   global.row = rowOffset + slopeRow * local.row
+  //   global.col = colOffset + slopeCol * local.col
+  // The slopes and the offsets depend on the orientation of the module and on
+  // whether the ROC number is below 8 or not.
+  __device__ pixelgpudetails::Pixel frameConversion(bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) {
+
+    // Barrel modules are oriented like 'dddd' on the -Z side, except in layer 1,
+    // and like 'pppp' otherwise. The forward modules use the same parameters
+    // as 'dddd' for both panels.
+    bool const ppppLike = bpix and not (side == -1 and layer != 1);
+    bool const lowRocs  = rocIdInDetUnit < 8;
+
+    int slopeRow  = 0, slopeCol = 0;
+    int rowOffset = 0, colOffset = 0;
+
+    if (ppppLike == lowRocs) {
+      // 'pppp' with ROCs 0-7, or 'dddd' with ROCs 8 and above:
+      // the rows are counted backwards, the columns forwards
+      slopeRow  = -1;
+      slopeCol  = 1;
+      rowOffset = 2*pixelgpudetails::numRowsInRoc-1;
+      if (ppppLike) {
+        colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc;
+      }
+      else {
+        colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc;
+      }
+    }
+    else {
+      // 'dddd' with ROCs 0-7, or 'pppp' with ROCs 8 and above:
+      // the rows are counted forwards, the columns backwards
+      slopeRow  = 1;
+      slopeCol  = -1;
+      rowOffset = 0;
+      if (ppppLike) {
+        colOffset = (16-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1;
+      }
+      else {
+        colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1;
+      }
+    }
+
+    uint32_t gRow = rowOffset+slopeRow*local.row;
+    uint32_t gCol = colOffset+slopeCol*local.col;
+    pixelgpudetails::Pixel global = {gRow, gCol};
+    return global;
+  }
+
+
+  // Translates the status of a failed conversion into the corresponding error
+  // type, and optionally prints a message:
+  //   status 1 -> 35 (invalid channel id)
+  //   status 2 -> 36 (invalid ROC id)
+  //   status 3 -> 37 (invalid dcol/pixel value)
+  //   status 4 -> 38 (dcol/pixel read out of order)
+  // Any other status gives 0.
+  //   fedId - used only in the messages
+  //   debug - if true, print a message describing the error
+  __device__ uint8_t conversionError(uint8_t fedId, uint8_t status, bool debug = false)
+  {
+    uint8_t errorType = 0;
+
+    switch (status) {
+      case(1) : {
+        // the closing parenthesis and the newline were swapped in this message
+        if (debug) printf("Error in Fed: %i, invalid channel Id (errorType = 35)\n", fedId);
+        errorType = 35;
+        break;
+      }
+      case(2) : {
+        if (debug) printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId);
+        errorType = 36;
+        break;
+      }
+      case(3) : {
+        if (debug) printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId);
+        errorType = 37;
+        break;
+      }
+      case(4) : {
+        if (debug) printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId);
+        errorType = 38;
+        break;
+      }
+      default:
+        if (debug) printf("Cabling check returned unexpected result, status = %i\n", status);
+    };
+
+    return errorType;
+  }
+
+  // Checks that a row and a column, in the ROC representation, are inside a
+  // ROC of 80 rows by 52 columns.
+  __device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol)
+  {
+    constexpr uint32_t rowsInRoc = 80;
+    constexpr uint32_t colsInRoc = 52;
+
+    bool const rowInRange = rocRow < rowsInRoc;
+    bool const colInRange = rocCol < colsInRoc;
+    return rowInRange and colInRange;
+  }
+
+  // Checks a double column and a pixel id: the double column has to be below
+  // 26, and the pixel id between 2 and 161, both included.
+  __device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid)
+  {
+    if (dcol >= 26) {
+      return false;
+    }
+    return pxid >= 2 and pxid < 162;
+  }
+
+  // Decodes the error code carried by a word, and returns the error type to
+  // report, or 0 if the word is not to be treated as an error.
+  // The codes from 25 to 31 are recognised; code 30 is reported as 40 when its
+  // state bits are equal to 1 (overflow).
+  //   errorWord  - the 32-bit word to inspect
+  //   fedId      - FED the word comes from, minus 1200
+  //   link       - link number extracted from the word
+  //   cablingMap - used only to cross-check the words with code 25
+  //   debug      - if true, print a message for each error found
+  __device__ uint8_t checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false)
+  {
+    uint8_t errorType = (errorWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ERROR_mask;
+    // anything below 25 is not an error code
+    if (errorType < 25) return 0;
+
+    // every recognised code is an error, unless the checks below say otherwise
+    bool isError = true;
+
+    switch (errorType) {
+      case 25 : {
+        // not an error if the map has an entry for ROC 1 of this link, and that
+        // entry does not carry this link number and ROC number 1
+        uint32_t const mapIndex = fedId * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + 1;
+        bool const inMap = (mapIndex > 1 && mapIndex <= cablingMap->size);
+        if (inMap) {
+          bool const sameChannel = (link == cablingMap->link[mapIndex] && 1 == cablingMap->roc[mapIndex]);
+          if (!sameChannel) isError = false;
+        }
+        if (debug and isError) printf("Invalid ROC = 25 found (errorType = 25)\n");
+        break;
+      }
+      case 26 :
+        if (debug) printf("Gap word found (errorType = 26)\n");
+        break;
+      case 27 :
+        if (debug) printf("Dummy word found (errorType = 27)\n");
+        break;
+      case 28 :
+        if (debug) printf("Error fifo nearly full (errorType = 28)\n");
+        break;
+      case 29 :
+        if (debug) {
+          printf("Timeout on a channel (errorType = 29)\n");
+          if ((errorWord >> pixelgpudetails::OMIT_ERR_shift) & pixelgpudetails::OMIT_ERR_mask) {
+            printf("...first errorType=29 error, this gets masked out\n");
+          }
+        }
+        break;
+      case 30 : {
+        if (debug) printf("TBM error trailer (errorType = 30)\n");
+        // the state bits are the 4 bits that start at bit 8
+        uint32_t const stateMask = ~(~uint32_t(0) << 4);
+        int const stateBits = (errorWord >> 8) & stateMask;
+        if (debug and stateBits != 1 and stateBits != 8) {
+          printf("FED error 30 with unexpected State Bits (errorType = 30)\n");
+        }
+        // 1=Overflow -> 40, 8=number of ROCs -> 30
+        if (stateBits == 1) errorType = 40;
+        break;
+      }
+      case 31 :
+        if (debug) printf("Event number error (errorType = 31)\n");
+        break;
+      default:
+        isError = false;
+    };
+
+    return isError ? errorType : 0;
+  }
+
+  // Finds the raw id of the detector an error word belongs to, by building a
+  // (link, roc) address from the word and looking it up in the cabling map.
+  // Returns 0xffffffff if the error type is not handled, if the channel number
+  // of a type 29 error is out of range, or if the map returns 9999.
+  //   fedId      - FED the word comes from, minus 1200
+  //   errWord    - the 32-bit error word
+  //   errorType  - error type, as returned by checkROC() or conversionError()
+  //   cablingMap - the cabling map on the GPU
+  //   debug      - not used
+  __device__ uint32_t getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelFedCablingMapGPU *cablingMap, bool debug = false)
+  {
+    constexpr uint32_t noRawId = 0xffffffff;
+
+    // address to look up; ROC 1 is a dummy value, used when only the link is known
+    uint32_t link = 0;
+    uint32_t roc  = 1;
+
+    if (errorType == 25 or errorType == 30 or errorType == 31 or errorType == 36 or errorType == 40) {
+      link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask;
+    }
+    else if (errorType == 29) {
+      // the five lowest bits of the word flag the channels 1 to 5 of a block
+      int localCH = 0;
+      for (int bit = 0; bit < 5; ++bit) {
+        int const flag = (errWord >> bit) & uint32_t(1);
+        localCH += (bit + 1) * flag;
+      }
+      // the block number is in the 3 bits that start at bit 8
+      uint32_t const blockMask = ~(~uint32_t(0) << 3);
+      int const block = (errWord >> 8) & blockMask;
+      // each pair of blocks covers 9 channels: 4 in the even one, 5 in the odd one
+      int const chanNmbr = (block/2)*9 + ((block%2 == 0) ? 0 : 4) + localCH;
+      if ((chanNmbr < 1) || (chanNmbr > 36)) return noRawId;  // signifies unexpected result
+
+      link = chanNmbr;
+    }
+    else if (errorType == 37 or errorType == 38) {
+      roc  = (errWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask;
+      link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask;
+    }
+    else {
+      return noRawId;
+    }
+
+    uint32_t const candidate = getRawId(cablingMap, fedId, link, roc).RawId;
+    return (candidate != 9999) ? candidate : noRawId;
+  }
+
+  /*----------
+   * Name: applyADCthreshold_kernel()
+   * Desc: converts adc count to electrons and then applies the
+   * threshold on each channel.
+   * make pixel to 0 if it is below the threshold
+   * Input: xx_d[], yy_d[], layer_d[], wordCounter, adc[], ADCThreshold
+   *-----------
+   * Output: xx_adc[], yy_adc[] with pixel threshold applied
+   */
+  // kernel to apply adc threshold on the channels
+
+
+  // Felice: gains and pedestals are not the same for each pixel. This code should be rewritten to take
+  // in account local gains/pedestals
+  // __global__ void applyADCthreshold_kernel(const uint32_t *xx_d, const uint32_t *yy_d, const uint32_t *layer_d, uint32_t *adc, const uint32_t wordCounter,
+  //  const ADCThreshold adcThreshold, uint32_t *xx_adc, uint32_t *yy_adc ) {
+  //   int tid = threadIdx.x;
+  //   int gIndex = blockDim.x*blockIdx.x+tid;
+  //   if (gIndex<wordCounter) {
+  //     uint32_t adcOld = adc[gIndex];
+  //     const float gain = adcThreshold.theElectronPerADCGain_; // default: 1 adc = 135 electrons
+  //     const float pedestal = 0; //
+  //     int adcNew = int(adcOld*gain+pedestal);
+  //     // rare chance of entering into the if ()
+  //     if (layer_d[gIndex]>=adcThreshold.theFirstStack_) {
+  //       if (adcThreshold.theStackADC_==1 && adcOld==1) {
+  //         adcNew = int(255*135); // Arbitrarily use overflow value.
+  //       }
+  //       if (adcThreshold.theStackADC_ >1 && adcThreshold.theStackADC_!=255 && adcOld>=1){
+  //         adcNew = int((adcOld-1) * gain * 255/float(adcThreshold.theStackADC_-1));
+  //       }
+  //     }
+  //
+  //     if (adcNew >adcThreshold.thePixelThreshold ) {
+  //       xx_adc[gIndex]=xx_d[gIndex];
+  //       yy_adc[gIndex]=yy_d[gIndex];
+  //     }
+  //     else {
+  //       xx_adc[gIndex]=0; // 0: dead pixel
+  //       yy_adc[gIndex]=0;
+  //     }
+  //     adc[gIndex] = adcNew;
+  //   }
+  // }
+
+
+  // Kernel to perform the Raw to Digi conversion.
+  //
+  // Launch layout: one-dimensional grid and blocks, one thread for each 32-bit
+  // word; the threads beyond wordCounter do nothing. No shared memory is used.
+  //
+  //   cablingMap     - cabling map on the GPU
+  //   modToUnp       - for each cabling map index, non-zero if the ROC is to be skipped
+  //   wordCounter    - number of 32-bit words in "word"
+  //   word           - the raw data words
+  //   fedIds         - FED id (minus 1200) of the words, one entry for each pair of words
+  //   xx, yy, adc    - output: global row, global column and ADC count of each digi
+  //   pdigi          - output: the packed digi
+  //   rawIdArr       - output: raw id of the detector of each digi
+  //   moduleId       - output: module id of each digi, 9999 if the word gave no digi
+  //   err            - output: the errors; used only if includeErrors is true
+  //   useQualityInfo - if true, skip the ROCs flagged as bad in the cabling map
+  //   includeErrors  - if true, the errors are stored in "err"
+  //   debug          - if true, print a message for each error
+  //
+  // For a word that does not give a digi all the outputs are 0, except the
+  // module id which is 9999.
+  __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp,
+      const uint32_t wordCounter, const uint32_t *word, const uint8_t *fedIds,
+      uint16_t *xx, uint16_t *yy, uint16_t *adc,
+      uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId,
+      GPU::SimpleVector<PixelErrorCompact> *err,
+      bool useQualityInfo, bool includeErrors, bool debug)
+  {
+    auto gIndex  = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // The grid is rounded up to a whole number of blocks, so the last threads
+    // have no word to process: they must not write to the output arrays, whose
+    // size is not known here and may be smaller than the size of the grid.
+    // No barrier is used in this kernel, so returning early is safe.
+    if (gIndex >= wordCounter) return;
+
+    // initialize all the outputs, as there are many early exits below
+    xx[gIndex]   = 0;
+    yy[gIndex]   = 0;
+    adc[gIndex]  = 0;
+    pdigi[gIndex]  = 0;
+    rawIdArr[gIndex] = 0;
+    moduleId[gIndex] = 9999;
+
+    uint8_t fedId = fedIds[gIndex/2]; // +1200;
+
+    uint32_t ww = word[gIndex]; // Array containing 32 bit raw data
+    if (ww == 0) {
+      // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization
+      return;
+    }
+
+    uint32_t link  = getLink(ww);            // Extract link
+    uint32_t roc   = getRoc(ww);             // Extract Roc in link
+    pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc);
+
+    uint8_t errorType = checkROC(ww, fedId, link, cablingMap, debug);
+    bool skipROC = (roc < pixelgpudetails::maxROCIndex) ? false : (errorType != 0);
+    // NOTE(review): when includeErrors is false an error word is not skipped
+    // here, and it goes through the pixel decoding below - confirm that this
+    // is intended
+    if (includeErrors and skipROC)
+    {
+      uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug);
+      err->push_back(PixelErrorCompact{rID, ww, errorType, fedId});
+      return;
+    }
+
+    uint32_t rawId  = detId.RawId;
+    uint32_t rocIdInDetUnit = detId.rocInDet;
+    bool barrel = isBarrel(rawId);
+
+    uint32_t index = fedId * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + roc;
+    if (useQualityInfo) {
+      skipROC = cablingMap->badRocs[index];
+      if (skipROC) return;
+    }
+    skipROC = modToUnp[index];
+    if (skipROC) return;
+
+    uint32_t layer = 0;//, ladder =0;
+    int side = 0, panel = 0, module = 0;//disk = 0, blade = 0
+
+    if (barrel)
+    {
+      layer  = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask;
+      module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask;
+      side   = (module < 5)? -1 : 1;
+    }
+    else {
+      // endcap ids
+      layer = 0;
+      panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
+      //disk  = (rawId >> diskStartBit_) & diskMask_;
+      side  = (panel == 1)? -1 : 1;
+      //blade = (rawId >> bladeStartBit_) & bladeMask_;
+    }
+
+    // ***special case of layer to 1 be handled here
+    pixelgpudetails::Pixel localPix;
+    if (layer == 1) {
+      // in layer 1 the word carries the row and the column directly
+      uint32_t col = (ww >> pixelgpudetails::COL_shift) & pixelgpudetails::COL_mask;
+      uint32_t row = (ww >> pixelgpudetails::ROW_shift) & pixelgpudetails::ROW_mask;
+      localPix.row = row;
+      localPix.col = col;
+      if (includeErrors) {
+        if (not rocRowColIsValid(row, col)) {
+          uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays
+          err->push_back(PixelErrorCompact{rawId, ww, error, fedId});
+          if(debug) printf("BPIX1  Error status: %i\n", error);
+          return;
+        }
+      }
+    } else {
+      // ***conversion rules for dcol and pxid
+      uint32_t dcol = (ww >> pixelgpudetails::DCOL_shift) & pixelgpudetails::DCOL_mask;
+      uint32_t pxid = (ww >> pixelgpudetails::PXID_shift) & pixelgpudetails::PXID_mask;
+      uint32_t row  = pixelgpudetails::numRowsInRoc - pxid/2;
+      uint32_t col  = dcol*2 + pxid%2;
+      localPix.row = row;
+      localPix.col = col;
+      if (includeErrors and not dcolIsValid(dcol, pxid)) {
+        uint8_t error = conversionError(fedId, 3, debug);
+        err->push_back(PixelErrorCompact{rawId, ww, error, fedId});
+        if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
+        return;
+      }
+    }
+
+    pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix);
+    xx[gIndex]    = globalPix.row;  // origin shifting by 1 0-159
+    yy[gIndex]    = globalPix.col;  // origin shifting by 1 0-415
+    adc[gIndex]   = getADC(ww);
+    pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]);
+    moduleId[gIndex] = detId.moduleId;
+    rawIdArr[gIndex] = rawId;
+  } // end of Raw to Digi kernel
+
+  // Interface to outside.
+  //
+  // Enqueues on `stream` the whole raw-to-cluster chain for one event: copy of
+  // the FED words to the device, RawToDigi_kernel, gain calibration
+  // (calibDigis), module counting (countModules), clustering (findClus), the
+  // cluster charge cut (clusterChargeCut) and the prefix sum of the number of
+  // clusters per module. Nothing is synchronized here: the two counters copied
+  // into nModules_Clusters_h ([0] number of modules, [1] total number of
+  // clusters) are only valid once the work queued on `stream` has completed;
+  // getResults() reads them.
+  //
+  // wordCounter is the number of 32-bit FED words of the event. It must be even
+  // (asserted below): only wordCounter/2 FED-id bytes are transferred.
+  // An empty event (wordCounter == 0) is supported: the per-digi kernels are
+  // skipped and both counters end up being 0.
+  // fedCounter and convertADCtoElectrons are not used by this implementation.
+  void SiPixelRawToClusterGPUKernel::makeClustersAsync(
+      const SiPixelFedCablingMapGPU *cablingMap,
+      const unsigned char *modToUnp,
+      const SiPixelGainForHLTonGPU *gains,
+      const WordFedAppender& wordFed,
+      PixelFormatterErrors&& errors,
+      const uint32_t wordCounter, const uint32_t fedCounter,
+      bool convertADCtoElectrons,
+      bool useQualityInfo, bool includeErrors, bool debug,
+      cuda::stream_t<>& stream)
+  {
+    nDigis = wordCounter;
+
+    digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream);
+    if(includeErrors) {
+      digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream);
+    }
+    clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream);
+
+    edm::Service<CUDAService> cs;
+    nModules_Clusters_h = cs->make_host_unique<uint32_t[]>(2, stream);
+
+    // The kernels that use one thread per digi get a grid of
+    // ceil(wordCounter / threadsPerBlock) blocks. For an empty event that is a
+    // grid of 0 blocks, which the CUDA runtime rejects as an invalid
+    // configuration, so those launches are skipped when there is nothing to do.
+    const bool hasWords = (wordCounter > 0);
+
+    if (hasWords) {
+      const int threadsPerBlock = 512;
+      const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
+
+      assert(0 == wordCounter%2);
+      // wordCounter is the total number of words in each event to be transferred to the device
+      auto word_d = cs->make_device_unique<uint32_t[]>(wordCounter, stream);
+      auto fedId_d = cs->make_device_unique<uint8_t[]>(wordCounter, stream);
+
+      cudaCheck(cudaMemcpyAsync(word_d.get(),  wordFed.word(), wordCounter*sizeof(uint32_t),    cudaMemcpyDefault, stream.id()));
+      cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id()));
+
+      // Launch rawToDigi kernel
+      RawToDigi_kernel<<<blocks, threadsPerBlock, 0, stream.id()>>>(
+          cablingMap,
+          modToUnp,
+          wordCounter,
+          word_d.get(),
+          fedId_d.get(),
+          digis_d.xx(), digis_d.yy(), digis_d.adc(),
+          digis_d.pdigi(),
+          digis_d.rawIdArr(),
+          digis_d.moduleInd(),
+          digiErrors_d.error(), // returns nullptr if default-constructed
+          useQualityInfo,
+          includeErrors,
+          debug);
+      cudaCheck(cudaGetLastError());
+    }
+
+    if(includeErrors) {
+      digiErrors_d.copyErrorToHostAsync(stream);
+    }
+    // End of Raw2Digi and passing data for clusterisation
+
+    {
+      // clusterizer ...
+      using namespace gpuClustering;
+      int threadsPerBlock = 256;
+      int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;
+
+      if (hasWords) {
+        gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream.id()>>>(
+            digis_d.moduleInd(),
+            digis_d.c_xx(), digis_d.c_yy(), digis_d.adc(),
+            gains,
+            wordCounter);
+        cudaCheck(cudaGetLastError());
+      }
+
+#ifdef GPU_DEBUG
+       std::cout
+         << "CUDA countModules kernel launch with " << blocks
+         << " blocks of " << threadsPerBlock << " threads\n";
+#endif
+
+      // moduleStart[0] is the module counter incremented by countModules; it
+      // stays 0 for an empty event, which makes every block of findClus and
+      // clusterChargeCut return immediately.
+      cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id()));
+
+      if (hasWords) {
+        countModules<<<blocks, threadsPerBlock, 0, stream.id()>>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter);
+        cudaCheck(cudaGetLastError());
+      }
+
+      // read the number of modules into a data member, used by getResults()
+      cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+      // one block per (potential) module from here on
+      threadsPerBlock = 256;
+      blocks = MaxNumModules;
+#ifdef GPU_DEBUG
+         std::cout << "CUDA findClus kernel launch with " << blocks
+         << " blocks of " << threadsPerBlock << " threads\n";
+#endif
+      cudaCheck(cudaMemsetAsync(clusters_d.clusInModule(), 0, (MaxNumModules)*sizeof(uint32_t), stream.id()));
+      findClus<<<blocks, threadsPerBlock, 0, stream.id()>>>(
+          digis_d.c_moduleInd(),
+          digis_d.c_xx(), digis_d.c_yy(),
+          clusters_d.c_moduleStart(),
+          clusters_d.clusInModule(), clusters_d.moduleId(),
+          digis_d.clus(),
+          wordCounter);
+      cudaCheck(cudaGetLastError());
+
+      // apply charge cut
+      clusterChargeCut<<<blocks, threadsPerBlock, 0, stream.id()>>>(
+          digis_d.moduleInd(),
+          digis_d.c_adc(),
+          clusters_d.c_moduleStart(),
+          clusters_d.clusInModule(), clusters_d.c_moduleId(),
+          digis_d.clus(),
+          wordCounter);
+      cudaCheck(cudaGetLastError());
+
+
+      // count the module start indices already here (instead of
+      // rechits) so that the number of clusters/hits can be made
+      // available in the rechit producer without additional points of
+      // synchronization/ExternalWork
+      //
+      // Temporary storage: the first call only queries the required size,
+      // which CUB reports in bytes.
+      size_t tempScanStorageSize = 0;
+      {
+        uint32_t *tmp = nullptr;
+        cudaCheck(cub::DeviceScan::InclusiveSum(nullptr, tempScanStorageSize, tmp, tmp, MaxNumModules));
+      }
+      // allocate exactly the requested number of bytes (not that many uint32_t)
+      auto tempScanStorage_d = cs->make_device_unique<uint8_t[]>(tempScanStorageSize, stream);
+      // Set first the first element to 0
+      cudaCheck(cudaMemsetAsync(clusters_d.clusModuleStart(), 0, sizeof(uint32_t), stream.id()));
+      // Then use inclusive_scan to get the partial sum to the rest
+      cudaCheck(cub::DeviceScan::InclusiveSum(tempScanStorage_d.get(), tempScanStorageSize,
+                                              clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules,
+                                              stream.id()));
+      // last element holds the number of all clusters
+      cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    } // end clusterizer scope
+  }
+}
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
new file mode 100644
index 0000000000000..1ab8bc3fa5998
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -0,0 +1,234 @@
+#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h
+#define RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h
+
+#include <algorithm>
+#include <cuda_runtime.h>
+#include "cuda/api_wrappers.h"
+
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+#include "FWCore/Utilities/interface/typedefs.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
+
+struct SiPixelFedCablingMapGPU;
+class SiPixelGainForHLTonGPU;
+
+namespace pixelgpudetails {
+
+  /* Phase 1 geometry constants.
+   * The *StartBit values are bit offsets and the matching *Mask values are
+   * unshifted field masks.
+   * NOTE(review): presumably these describe the fields of the packed detector
+   * id - confirm against the DetId definitions. */
+  const uint32_t layerStartBit    = 20;
+  const uint32_t ladderStartBit   = 12;
+  const uint32_t moduleStartBit   = 2;
+
+  const uint32_t panelStartBit    = 10;
+  const uint32_t diskStartBit     = 18;
+  const uint32_t bladeStartBit    = 12;
+
+  const uint32_t layerMask        = 0xF;
+  const uint32_t ladderMask       = 0xFF;
+  const uint32_t moduleMask       = 0x3FF;
+  const uint32_t panelMask        = 0x3;
+  const uint32_t diskMask         = 0xF;
+  const uint32_t bladeMask        = 0x3F;
+  // Field widths, in bits, from which the *_shift and *_mask constants below are derived.
+  const uint32_t LINK_bits        = 6;
+  const uint32_t ROC_bits         = 5;
+  const uint32_t DCOL_bits        = 5;
+  const uint32_t PXID_bits        = 8;
+  const uint32_t ADC_bits         = 8;
+
+  // special for layer 1
+  const uint32_t LINK_bits_l1     = 6;
+  const uint32_t ROC_bits_l1      = 5;
+  const uint32_t COL_bits_l1      = 6;
+  const uint32_t ROW_bits_l1      = 7;
+  const uint32_t OMIT_ERR_bits    = 1;
+
+  const uint32_t maxROCIndex      = 8;
+  const uint32_t numRowsInRoc     = 80;
+  const uint32_t numColsInRoc     = 52;
+
+  const uint32_t MAX_WORD = 2000;
+  // Shifts are accumulated from the field widths, starting at bit 0 with the ADC field.
+  const uint32_t ADC_shift  = 0;
+  const uint32_t PXID_shift = ADC_shift + ADC_bits;
+  const uint32_t DCOL_shift = PXID_shift + PXID_bits;
+  const uint32_t ROC_shift  = DCOL_shift + DCOL_bits;
+  const uint32_t LINK_shift = ROC_shift + ROC_bits_l1;  // uses ROC_bits_l1, which has the same value (5) as ROC_bits
+  // special for layer 1 ROC
+  const uint32_t ROW_shift = ADC_shift + ADC_bits;
+  const uint32_t COL_shift = ROW_shift + ROW_bits_l1;
+  const uint32_t OMIT_ERR_shift = 20;
+  // Each mask has its N lowest bits set, N being the width passed to the shift; masks are unshifted.
+  const uint32_t LINK_mask = ~(~uint32_t(0) << LINK_bits_l1);
+  const uint32_t ROC_mask  = ~(~uint32_t(0) << ROC_bits_l1);
+  const uint32_t COL_mask  = ~(~uint32_t(0) << COL_bits_l1);
+  const uint32_t ROW_mask  = ~(~uint32_t(0) << ROW_bits_l1);
+  const uint32_t DCOL_mask = ~(~uint32_t(0) << DCOL_bits);
+  const uint32_t PXID_mask = ~(~uint32_t(0) << PXID_bits);
+  const uint32_t ADC_mask  = ~(~uint32_t(0) << ADC_bits);
+  const uint32_t ERROR_mask = ~(~uint32_t(0) << ROC_bits_l1);  // same width (and therefore same value) as ROC_mask
+  const uint32_t OMIT_ERR_mask = ~(~uint32_t(0) << OMIT_ERR_bits);
+
+  struct DetIdGPU {  // plain aggregate of three 32-bit identifiers describing where a pixel word belongs
+    uint32_t RawId;     // NOTE(review): presumably the raw detector id of the module - confirm against the cabling map
+    uint32_t rocInDet;  // NOTE(review): presumably the index of the read-out chip inside the module - confirm
+    uint32_t moduleId;  // module index; RawToDigi_kernel stores detId.moduleId into its moduleId output array
+  };
+
+  struct Pixel {  // row/column pair of one pixel; RawToDigi_kernel passes such a pair (globalPix) to pack()
+   uint32_t row;  // row coordinate
+   uint32_t col;  // column coordinate
+  };
+
+  // Bit layout of a packed digi word. The four fields are placed from bit 0
+  // upwards in the order row, column, time, adc; every shift and mask is
+  // derived from the field widths given to the constructor.
+  class Packing {
+  public:
+    using PackedDigiType = uint32_t;
+
+    // row_w, column_w, time_w, adc_w: width of each field, in bits.
+    __host__ __device__
+    constexpr Packing(unsigned int row_w, unsigned int column_w,
+                      unsigned int time_w, unsigned int adc_w)
+      : row_width(row_w),
+        column_width(column_w),
+        adc_width(adc_w),
+        row_shift(0),
+        column_shift(row_w),
+        time_shift(row_w + column_w),
+        adc_shift(row_w + column_w + time_w),
+        row_mask(lowBits(row_w)),
+        column_mask(lowBits(column_w)),
+        time_mask(lowBits(time_w)),
+        adc_mask(lowBits(adc_w)),
+        rowcol_mask(lowBits(column_w + row_w)),
+        max_row(lowBits(row_w)),
+        max_column(lowBits(column_w)),
+        max_adc(lowBits(adc_w))
+    { }
+
+    // field widths (the time width is not stored)
+    uint32_t  row_width;
+    uint32_t  column_width;
+    uint32_t  adc_width;
+
+    // position of the lowest bit of each field
+    uint32_t  row_shift;
+    uint32_t  column_shift;
+    uint32_t  time_shift;
+    uint32_t  adc_shift;
+
+    // unshifted masks, one per field, plus the combined row+column mask
+    PackedDigiType row_mask;
+    PackedDigiType column_mask;
+    PackedDigiType time_mask;
+    PackedDigiType adc_mask;
+    PackedDigiType rowcol_mask;
+
+    // largest value each field can hold (equal to the field mask)
+    uint32_t  max_row;
+    uint32_t  max_column;
+    uint32_t  max_adc;
+
+  private:
+    // Value with the `width` lowest bits set (width must be below 32).
+    __host__ __device__
+    static constexpr PackedDigiType lowBits(unsigned int width) {
+      return (1U << width) - 1U;
+    }
+  };
+
+  // Returns the digi bit layout used by this package: 11-bit row, 11-bit
+  // column, no time field and 10-bit ADC.
+  __host__ __device__
+  constexpr Packing packing() {
+    constexpr unsigned int rowBits = 11;
+    constexpr unsigned int columnBits = 11;
+    constexpr unsigned int timeBits = 0;
+    constexpr unsigned int adcBits = 10;
+    return Packing(rowBits, columnBits, timeBits, adcBits);
+  }
+
+
+  // Packs row, column and ADC into one 32-bit digi word using the layout
+  // returned by packing(). The ADC value is saturated at the largest value
+  // the ADC field can hold; row and col are shifted in as given (not masked).
+  __host__ __device__
+  inline
+  uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) {
+    constexpr Packing layout = packing();
+    const uint32_t saturatedAdc = std::min(adc, layout.max_adc);
+
+    uint32_t word = row << layout.row_shift;
+    word |= col << layout.column_shift;
+    word |= saturatedAdc << layout.adc_shift;
+    return word;
+  }
+
+  // Builds a channel number from a pixel position: the row is shifted left by
+  // the column field width of packing() and OR-ed with the column.
+  constexpr
+  uint32_t pixelToChannel(int row, int col) {
+    constexpr Packing layout = packing();
+    const int rowPart = row << layout.column_width;
+    return rowPart | col;
+  }
+
+
+  class SiPixelRawToClusterGPUKernel {  // owns the per-event products filled by makeClustersAsync() until they are moved out
+  public:
+    class WordFedAppender {  // holds two host buffers: the FED words and the FED ids that go with them
+    public:
+      WordFedAppender(cuda::stream_t<>& cudaStream);
+      ~WordFedAppender() = default;
+      // Defined out of line. NOTE(review): presumably copies `length` words from `src` at offset wordCounterGPU and records fedId for them - confirm in the .cu file.
+      void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
+
+      const unsigned int *word() const { return word_.get(); }  // raw pointer to the word buffer, read by makeClustersAsync()
+      const unsigned char *fedId() const { return fedId_.get(); }  // raw pointer to the FED-id buffer, read by makeClustersAsync()
+
+    private:
+      cudautils::host::unique_ptr<unsigned int[]> word_;
+      cudautils::host::unique_ptr<unsigned char[]> fedId_;
+    };
+
+    SiPixelRawToClusterGPUKernel() = default;
+    ~SiPixelRawToClusterGPUKernel() = default;
+
+    // Neither copyable nor movable.
+    SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete;
+    SiPixelRawToClusterGPUKernel(SiPixelRawToClusterGPUKernel&&) = delete;
+    SiPixelRawToClusterGPUKernel& operator=(const SiPixelRawToClusterGPUKernel&) = delete;
+    SiPixelRawToClusterGPUKernel& operator=(SiPixelRawToClusterGPUKernel&&) = delete;
+    // Queues the whole raw-to-cluster chain on `stream` without synchronizing; it (re)creates digis_d, clusters_d and, if includeErrors, digiErrors_d.
+    void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp,
+                           const SiPixelGainForHLTonGPU *gains,
+                           const WordFedAppender& wordFed,
+                           PixelFormatterErrors&& errors,
+                           const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons,
+                           bool useQualityInfo, bool includeErrors, bool debug,
+                           cuda::stream_t<>& stream);
+    // Moves the digi and cluster products out after storing the two counters in them. The counters are filled by asynchronous copies in makeClustersAsync(), so that work must have completed before this is called.
+    std::pair<SiPixelDigisCUDA, SiPixelClustersCUDA> getResults() {
+      digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis);  // [0]: number of modules
+      clusters_d.setNClusters(nModules_Clusters_h[1]);  // [1]: total number of clusters
+      // need to explicitly deallocate while the associated CUDA
+      // stream is still alive
+      //
+      // technically the statement above is not true anymore now that
+      // the CUDA streams are cached within the CUDAService, but it is
+      // still better to release as early as possible
+      nModules_Clusters_h.reset();
+      return std::make_pair(std::move(digis_d), std::move(clusters_d));  // digis_d and clusters_d are left moved-from
+    }
+    // Returns an rvalue reference to the member: nothing is moved unless the caller move-constructs or move-assigns from it.
+    SiPixelDigiErrorsCUDA&& getErrors() {
+      return std::move(digiErrors_d);
+    }
+
+  private:
+    uint32_t nDigis = 0;  // set to wordCounter by makeClustersAsync()
+
+    // Data to be put in the event
+    cudautils::host::unique_ptr<uint32_t[]> nModules_Clusters_h;  // two host counters: [0] modules, [1] clusters
+    SiPixelDigisCUDA digis_d;
+    SiPixelClustersCUDA clusters_d;
+    SiPixelDigiErrorsCUDA digiErrors_d;  // only (re)created by makeClustersAsync() when includeErrors is true
+  };
+
+  // see RecoLocalTracker/SiPixelClusterizer
+  // all are runtime const, should be specified in python _cfg.py (as written here they are hard-coded default member initializers)
+  struct ADCThreshold {
+    const int     thePixelThreshold       = 1000;     // default Pixel threshold in electrons
+    const int     theSeedThreshold        = 1000;     // seed threshold in electrons not used in our algo
+    const float   theClusterThreshold     = 4000;     // cluster threshold in electron
+    const int     ConversionFactor        =   65;     // adc to electron conversion factor
+
+    const int     theStackADC_            =  255;     // the maximum adc count for stack layer
+    const int     theFirstStack_          =    5;     // the index of the first stack layer
+    const double  theElectronPerADCGain_  =  600;     // ADC to electron conversion
+  };
+
+}
+
+#endif // RecoLocalTracker_SiPixelClusterizer_plugins_SiPixelRawToClusterGPUKernel_h
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h
new file mode 100644
index 0000000000000..5a681e791f94f
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h
@@ -0,0 +1,125 @@
+#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h
+#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h
+
+#include <cstdint>
+#include <cstdio>
+
+#include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+namespace gpuCalibPixel {
+
+  constexpr uint16_t InvId=9999; // must be > MaxNumModules; written into id[] to mark a pixel as invalid, same value as gpuClustering::InvId
+  // Linear conversion applied by the calibration kernels: adc = vcal * gain + offset; the _L1 pair is selected for module ids below 96.
+  constexpr float VCaltoElectronGain      = 47;   // L2-4: 47 +- 4.7
+  constexpr float VCaltoElectronGain_L1   = 50;   // L1:   49.6 +- 2.6
+  constexpr float VCaltoElectronOffset    = -60;  // L2-4: -60 +- 130
+  constexpr float VCaltoElectronOffset_L1 = -670; // L1:   -670 +- 220
+
+
+ // Gain-calibrates the ADC value of every digi, one thread per digi.
+ //
+ // Launch layout: 1D grid with at least numElements threads in total; threads
+ // with a global index >= numElements return immediately.
+ //
+ //  id          module id of each digi; set to InvId for digis that sit on a
+ //              dead or noisy column (digis already marked InvId are skipped)
+ //  x, y        local coordinates of each digi; x is passed as the row and y as
+ //              the column to SiPixelGainForHLTonGPU::getPedAndGain
+ //  adc         in: raw ADC, out: (adc - pedestal) * gain converted with the
+ //              VCaltoElectron gain/offset (the _L1 pair for module ids < 96),
+ //              with a lower bound of 100; set to 0 for invalidated digis
+ //  ped         gain calibration payload
+ //  numElements number of digis
+ __global__ void calibDigis(uint16_t * id,
+                           uint16_t const * __restrict__ x,
+                           uint16_t const * __restrict__ y,
+                           uint16_t * adc,
+                           SiPixelGainForHLTonGPU const * __restrict__ ped,
+                           int numElements
+                         )
+{
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= numElements) return;
+
+    // Read the module id once: it is needed after id[i] may have been
+    // overwritten with InvId below.
+    auto const me = id[i];
+    if (InvId==me) return;
+
+    float conversionFactor = me<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain;
+    float offset =  me<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset;
+
+    bool isDeadColumn=false, isNoisyColumn=false;
+
+    int row = x[i];
+    int col = y[i];
+    auto ret = ped->getPedAndGain(me, col, row, isDeadColumn, isNoisyColumn);
+    float pedestal = ret.first; float gain = ret.second;
+    // float pedestal = 0; float gain = 1.;
+    if ( isDeadColumn | isNoisyColumn )
+      {
+        id[i]=InvId; adc[i] =0;
+        // report the module the pixel belonged to, not the InvId marker just written
+        printf("bad pixel at %d in %d\n",i,me);
+    }
+    else {
+      float vcal = adc[i] * gain  - pedestal*gain;
+      adc[i] = std::max(100, int( vcal * conversionFactor + offset));
+    }
+
+    // if (threadIdx.x==0)
+    //  printf ("calibrated %d\n",id[i]);
+}
+
+ // Gain-calibrates the ADC values module by module: block b handles the module
+ // whose first digi index is moduleStart[1 + b], its threads striding over the
+ // digis of that module with a step of blockDim.x.
+ //
+ //  id          module id of each digi; set to InvId for digis on a dead or
+ //              noisy column
+ //  x, y        local coordinates of each digi (x is used as row, y as column)
+ //  adc         in: raw ADC, out: calibrated value with a lower bound of 100
+ //              (0 for invalidated digis)
+ //  moduleStart moduleStart[1 + b] is the index of the first digi of the module
+ //              handled by block b; on exit it is advanced past any leading
+ //              digis that were invalidated
+ //  ped         gain calibration payload
+ //  numElements total number of digis
+ //
+ // Pedestal and gain are fetched again only when the column or the averaged
+ // row block changes with respect to the previous digi seen by the thread.
+ __global__ void calibADCByModule(uint16_t * id,
+                           uint16_t const * __restrict__ x,
+                           uint16_t const * __restrict__ y,
+                           uint16_t * adc,
+                           uint32_t * moduleStart,
+                           SiPixelGainForHLTonGPU const * __restrict__ ped,
+                           int numElements
+                         )
+{
+
+
+    auto first = moduleStart[1 + blockIdx.x];
+
+    auto me = id[first];
+
+    assert(me<2000);
+
+    /// depends on "me"
+
+    float conversionFactor = me<96 ? VCaltoElectronGain_L1 : VCaltoElectronGain;
+    float offset =  me<96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset;
+
+
+#ifdef GPU_DEBUG
+    if (me%100==1)
+      if (threadIdx.x==0) printf("start pixel calibration for module %d in block %d\n",me,blockIdx.x);
+#endif
+
+    first+=threadIdx.x;
+
+    // __syncthreads();
+
+    float pedestal=0,gain=0;
+    bool isDeadColumn=false, isNoisyColumn=false;
+    int oldCol=-1, oldAveragedBlock=-1;
+
+    for (int i=first; i<numElements; i+=blockDim.x) {
+       if (id[i]==InvId) continue;  // not valid
+       if (id[i]!=me) break;  // end of module
+       int row = x[i];
+       int col = y[i];
+       int averagedBlock = row / ped->numberOfRowsAveragedOver_; // 80....  ( row<80 will be faster...)
+       if ( (col!=oldCol) | ( averagedBlock != oldAveragedBlock) ) {
+        oldCol=col; oldAveragedBlock= averagedBlock;
+        auto ret = ped->getPedAndGain(me,col, row, isDeadColumn, isNoisyColumn);
+        pedestal = ret.first; gain = ret.second;
+       }
+       if ( isDeadColumn | isNoisyColumn )
+         { id[i]=InvId; adc[i] =0; }
+       else {
+         float vcal = adc[i] * gain  - pedestal*gain;
+         adc[i] = std::max(100, int( vcal * conversionFactor + offset));
+       }
+    }
+
+    // every thread reaches this barrier: it makes the id[] updates of the whole
+    // block visible to thread 0 before the start index is adjusted
+    __syncthreads();
+    //reset start
+    if(0==threadIdx.x) {
+     auto & k = moduleStart[1 + blockIdx.x];
+     // stop at the end of the digi array: if every remaining digi has been
+     // invalidated the scan must not read past numElements
+     while (k < uint32_t(numElements) && id[k]==InvId) ++k;
+    }
+
+
+ }
+
+
+}
+
+#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuCalibPixel_h
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
new file mode 100644
index 0000000000000..855216960d659
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
@@ -0,0 +1,97 @@
+#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
+#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
+
+#include <cstdint>
+#include <cstdio>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
+
+#include "gpuClusteringConstants.h"
+
+namespace gpuClustering {
+  /* Removes the clusters whose total charge does not exceed a threshold and
+   * renumbers the surviving clusters of each module.
+   *
+   * One block per module: block b handles the module whose first pixel index is
+   * moduleStart[1 + b]; blocks with b >= moduleStart[0] return immediately.
+   * The per-cluster charge is accumulated in shared memory from adc[], compared
+   * with the cut (2000 for module ids below 96, 4000 otherwise), and the flags
+   * of the surviving clusters are turned into new ids with blockPrefixScan.
+   * Pixels of rejected clusters get id[i] = InvId, and nClustersInModule is
+   * updated with the number of surviving clusters. Nothing is changed when all
+   * clusters of the module pass the cut.
+   */
+  __global__ void  clusterChargeCut(
+                           uint16_t * __restrict__ id,             // module id of each pixel (modified if bad cluster)
+                           uint16_t const * __restrict__ adc,              //  charge of each pixel
+                           uint32_t const * __restrict__ moduleStart,    // index of the first pixel of each module
+                           uint32_t * __restrict__ nClustersInModule,    // modified: number of clusters found in each module
+                           uint32_t const * __restrict__ moduleId,             // module id of each module
+                           int32_t * __restrict__  clusterId,            // modified: cluster id of each pixel
+                           int numElements)
+  {
+    // moduleStart[0] holds the number of modules; surplus blocks have nothing to do
+    if (blockIdx.x >= moduleStart[0])
+      return;
+
+    auto firstPixel = moduleStart[1 + blockIdx.x];
+    auto thisModuleId = id[firstPixel];
+    assert(thisModuleId < MaxNumModules);
+    assert(thisModuleId==moduleId[blockIdx.x]);
+
+    auto nclus = nClustersInModule[thisModuleId];
+    if (nclus==0) return;  // same value for every thread of the block, so the whole block leaves together
+
+    assert(nclus<=MaxNumClustersPerModules);
+
+#ifdef GPU_DEBUG
+    if (thisModuleId % 100 == 1)
+      if (threadIdx.x == 0)
+        printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
+#endif
+
+    auto first = firstPixel + threadIdx.x;
+    // zero the per-cluster charge accumulators
+    __shared__ int32_t charge[MaxNumClustersPerModules];
+    for (int i=threadIdx.x; i<nclus; i += blockDim.x) {
+      charge[i]=0;
+    }
+    __syncthreads();
+    // sum the charge of the pixels of each cluster of this module
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId) continue;     // not valid
+      if (id[i] != thisModuleId) break;           // end of module
+      atomicAdd(&charge[clusterId[i]], adc[i]);
+    }
+    __syncthreads();
+
+    auto chargeCut = thisModuleId<96 ? 2000 : 4000; // move in constants (calib?)
+    __shared__ uint8_t ok[MaxNumClustersPerModules];
+    __shared__ uint16_t newclusId[MaxNumClustersPerModules];
+    for (int i=threadIdx.x; i<nclus; i += blockDim.x) {
+       newclusId[i] = ok[i] =  charge[i]>chargeCut ? 1 : 0;  // 1 for a cluster that passes the cut
+    }
+
+    __syncthreads();
+
+    // renumber
+    __shared__ uint16_t ws[32];
+    blockPrefixScan(newclusId, nclus, ws);
+
+    assert(nclus>=newclusId[nclus-1]);
+    
+    if(nclus==newclusId[nclus-1]) return;  // every cluster passed the cut: nothing to update
+
+    nClustersInModule[thisModuleId] = newclusId[nclus-1];  // last element of the scan is the number of surviving clusters
+    __syncthreads();
+
+    // mark bad cluster again
+    for (int i=threadIdx.x; i<nclus; i += blockDim.x) {
+      if (0==ok[i]) newclusId[i]=InvId+1;
+    }
+    __syncthreads();
+
+    // reassign id
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId) continue;     // not valid
+      if (id[i] != thisModuleId) break;           // end of module
+      clusterId[i] = newclusId[clusterId[i]]-1;  // scan values start at 1, cluster ids at 0
+      if(clusterId[i]==InvId) id[i] = InvId;  // rejected clusters were marked InvId+1 above
+    }
+
+    //done
+  }
+
+
+} // namespace
+#endif
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
new file mode 100644
index 0000000000000..5c21a39302d70
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -0,0 +1,271 @@
+#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
+#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
+
+#include <cstdint>
+#include <cstdio>
+
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuClusteringConstants.h"
+
+namespace gpuClustering {
+
+  // Finds the first valid pixel of every module, one thread per pixel.
+  //
+  // Each thread initialises clusterId[i] to i. A thread whose pixel is valid
+  // and whose closest preceding valid pixel is missing or belongs to another
+  // module reserves a slot with atomicInc on moduleStart[0] and stores its own
+  // index in moduleStart[slot + 1]. moduleStart[0] must be zeroed before the
+  // launch; the order of the stored indices is not defined.
+  __global__ void countModules(uint16_t const * __restrict__ id,
+                               uint32_t * __restrict__ moduleStart,
+                               int32_t * __restrict__ clusterId,
+                               int numElements)
+  {
+    int const idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) {
+      clusterId[idx] = idx;
+      if (id[idx] != InvId) {
+        // walk back to the closest valid pixel, if there is one
+        int prev = idx - 1;
+        for (; prev >= 0 and id[prev] == InvId; --prev) {
+        }
+        bool const opensModule = (prev < 0) or (id[prev] != id[idx]);
+        if (opensModule) {
+          // boundary...
+          auto slot = atomicInc(moduleStart, MaxNumModules);
+          moduleStart[slot + 1] = idx;
+        }
+      }
+    }
+  }
+  /* Clusters the pixels of one module per block.
+   *
+   * Block b handles the module whose first pixel index is moduleStart[1 + b];
+   * blocks with b >= moduleStart[0] return immediately. Steps:
+   *  1. find msize, the index of the first pixel that belongs to another module;
+   *  2. histogram the valid pixels by their y coordinate in shared memory;
+   *  3. for each pixel, record in the thread-local arrays nn/nnn the pixels of
+   *     the same or the next y bin whose x differs by at most 1 (at most 5 per
+   *     pixel, and at most maxiter pixels per thread, both asserted);
+   *  4. iterate until nothing changes, alternating a step that propagates the
+   *     minimum clusterId between neighbours with a step that follows the
+   *     clusterId links down to their root;
+   *  5. number the clusters 0 .. foundClusters-1 and write the per-pixel ids.
+   * clusterId[i] must hold i on entry (as set by countModules). Invalid pixels
+   * (id == InvId) end up with clusterId -9999.
+   */
+  __global__
+//  __launch_bounds__(256,4)
+  void findClus(uint16_t const * __restrict__ id,             // module id of each pixel
+                           uint16_t const * __restrict__ x,              // local coordinates of each pixel
+                           uint16_t const * __restrict__ y,              //
+                           uint32_t const * __restrict__ moduleStart,    // index of the first pixel of each module
+                           uint32_t * __restrict__ nClustersInModule,    // output: number of clusters found in each module
+                           uint32_t * __restrict__ moduleId,             // output: module id of each module
+                           int32_t * __restrict__  clusterId,            // output: cluster id of each pixel
+                           int numElements)
+  {
+    // moduleStart[0] holds the number of modules; surplus blocks have nothing to do
+    if (blockIdx.x >= moduleStart[0])
+      return;
+
+    auto firstPixel = moduleStart[1 + blockIdx.x];
+    auto thisModuleId = id[firstPixel];
+    assert(thisModuleId < MaxNumModules);
+
+#ifdef GPU_DEBUG
+    if (thisModuleId % 100 == 1)
+      if (threadIdx.x == 0)
+        printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
+#endif
+
+    auto first = firstPixel + threadIdx.x;
+
+    // find the index of the first pixel not belonging to this module (or invalid)
+    __shared__ int msize;
+    msize = numElements;  // every thread writes the same value
+    __syncthreads();
+
+    // skip threads not associated to an existing pixel
+      for (int i = first; i < numElements; i += blockDim.x) {
+        if (id[i] == InvId)                 // skip invalid pixels
+          continue;
+        if (id[i] != thisModuleId) {        // find the first pixel in a different module
+          atomicMin(&msize, i);
+          break;
+        }
+      }
+
+   //init hist  (ymax=416 < 512 : 9bits)
+   constexpr uint32_t maxPixInModule = 4000;
+   constexpr auto  nbins = phase1PixelTopology::numColsInModule + 2;   //2+2;
+   using Hist = HistoContainer<uint16_t,nbins,maxPixInModule,9,uint16_t>;
+    __shared__ Hist hist;
+    __shared__ typename Hist::Counter ws[32];
+    for (auto j=threadIdx.x; j<Hist::totbins(); j+=blockDim.x) { hist.off[j]=0;}
+    __syncthreads();  // also orders the atomicMin updates of msize before it is read below
+
+    assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
+    assert(msize-firstPixel<maxPixInModule);
+
+
+#ifdef GPU_DEBUG
+    __shared__ uint32_t totGood;
+    totGood=0;
+    __syncthreads();
+#endif
+
+    // fill histo
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)                 // skip invalid pixels
+          continue;
+        hist.count(y[i]);
+#ifdef GPU_DEBUG
+        atomicAdd(&totGood,1);
+#endif
+      }
+    __syncthreads();
+    if (threadIdx.x<32) ws[threadIdx.x]=0;  // used by prefix scan...
+    __syncthreads();
+    hist.finalize(ws);
+    __syncthreads();
+#ifdef GPU_DEBUG
+    assert(hist.size()==totGood);
+    if (thisModuleId % 100 == 1)
+      if (threadIdx.x == 0)
+        printf("histo size %d\n",hist.size());
+#endif
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)                 // skip invalid pixels
+          continue;
+        hist.fill(y[i],i-firstPixel);  // the histogram stores pixel indices relative to firstPixel
+      }
+
+    // assume that we can cover the whole module with up to 10 blockDim.x-wide iterations
+    constexpr int maxiter = 10;
+    if (threadIdx.x==0) {
+      assert((hist.size()/ blockDim.x) <= maxiter);
+    }
+    // nearest neighbour
+    uint16_t nn[maxiter][5];
+    uint8_t nnn[maxiter]; // number of nn
+    for (int k = 0; k < maxiter; ++k)
+      nnn[k] = 0;
+
+    __syncthreads();  // for hit filling!
+
+#ifdef GPU_DEBUG
+    // look for anomalous high occupancy
+    __shared__ uint32_t n40,n60;
+    n40=n60=0;
+    __syncthreads();
+    for (auto j=threadIdx.x; j<Hist::nbins(); j+=blockDim.x) {
+      if(hist.size(j)>60) atomicAdd(&n60,1);
+      if(hist.size(j)>40) atomicAdd(&n40,1);
+     }
+    __syncthreads();
+    if (0==threadIdx.x) {
+      if (n60>0) printf("columns with more than 60 px %d in %d\n",n60,thisModuleId);
+      else if (n40>0) printf("columns with more than 40 px %d in %d\n",n40,thisModuleId);
+    }
+    __syncthreads();
+#endif
+
+    // fill NN
+    for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
+        auto p = hist.begin()+j;
+        auto i = *p + firstPixel;
+        assert (id[i] != InvId);
+        assert(id[i] == thisModuleId);    // same module
+        int be = Hist::bin(y[i]+1);
+        auto e = hist.end(be);  // only entries after p, up to the end of the next y bin, are examined
+        ++p;
+        for (;p<e;++p) {
+          auto m = (*p)+firstPixel;
+          assert(m!=i);
+          if (std::abs(int(x[m]) - int(x[i])) > 1) continue;
+          auto l = nnn[k]++;
+          assert(l<5);
+          nn[k][l]=*p;
+        }
+     }
+
+    // for each pixel, look at all the pixels until the end of the module;
+    // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
+    // after the loop, all the pixels in each cluster should have the id equal to the lowest
+    // pixel in the cluster ( clus[i] == i ).
+    bool more = true;
+    int nloops=0;
+    while (__syncthreads_or(more)) {  // block-wide barrier: loops while any thread still reports a change
+      if (1==nloops%2) {
+        for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
+             auto p = hist.begin()+j;
+             auto i = *p + firstPixel;
+             auto m = clusterId[i];
+             while (m!=clusterId[m]) m=clusterId[m];
+             clusterId[i]=m;
+        }
+      } else {
+        more = false;
+        for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
+          auto p = hist.begin()+j;
+          auto i = *p + firstPixel;
+          for (int kk=0; kk<nnn[k]; ++kk) {
+            auto l = nn[k][kk];
+            auto m = l+firstPixel;
+            assert(m!=i);
+            auto old = atomicMin(&clusterId[m], clusterId[i]);
+            if (old != clusterId[i]) {
+              // end the loop only if no changes were applied
+              more = true;
+            }
+            atomicMin(&clusterId[i], old);
+          } // nnloop
+        } // pixel loop
+      }
+      ++nloops;
+    }  // end while
+
+#ifdef GPU_DEBUG
+   {
+     __shared__ int n0;
+     if (threadIdx.x == 0) n0=nloops;
+     __syncthreads();
+     auto ok = n0==nloops;
+     assert(__syncthreads_and(ok));
+   if (thisModuleId % 100 == 1)
+      if (threadIdx.x == 0)
+        printf("# loops %d\n",nloops);
+   }
+#endif
+
+    __shared__ unsigned int foundClusters;
+    foundClusters = 0;
+    __syncthreads();
+
+    // find the number of different clusters, identified by a pixels with clus[i] == i;
+    // mark these pixels with a negative id.
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)                 // skip invalid pixels
+          continue;
+        if (clusterId[i] == i) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          clusterId[i] = -(old + 1);
+        }
+      }
+    __syncthreads();
+
+    // propagate the negative id to all the pixels in the cluster.
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)                 // skip invalid pixels
+          continue;
+        if (clusterId[i] >= 0) {
+          // mark each pixel in a cluster with the same id as the first one
+          clusterId[i] = clusterId[clusterId[i]];
+        }
+      }
+    __syncthreads();
+
+    // adjust the cluster id to be a positive value starting from 0
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId) {               // skip invalid pixels
+          clusterId[i] = -9999;
+          continue;
+        }
+        clusterId[i] = - clusterId[i] - 1;
+      }
+    __syncthreads();
+
+      if (threadIdx.x == 0) {
+        nClustersInModule[thisModuleId] = foundClusters;  // indexed by module id
+        moduleId[blockIdx.x] = thisModuleId;  // indexed by block, i.e. by position in moduleStart
+#ifdef GPU_DEBUG
+      if (thisModuleId % 100 == 1)
+        if (threadIdx.x == 0)
+          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
+#endif
+    }
+  }
+
+} // namespace gpuClustering
+
+#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h
new file mode 100644
index 0000000000000..7b4bb5a1c8c95
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h
@@ -0,0 +1,14 @@
+#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h
+#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h
+
+#include <cstdint>
+
+namespace gpuClustering {
+  constexpr uint32_t MaxNumModules  = 2000;   // upper bound on module ids; also the grid size used to launch findClus and clusterChargeCut
+  constexpr uint32_t MaxNumPixels   = 256 * 2000;   // this does not mean maxPixelPerModule == 256!
+  constexpr uint32_t MaxNumClustersPerModules = 1024;   // size of the per-module shared-memory arrays in clusterChargeCut
+  constexpr uint16_t InvId          = 9999;         // must be > MaxNumModules; module id value that marks an invalid pixel
+
+}
+
+#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py
index ba8d492c5f610..b9c6862b015bf 100644
--- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py
+++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py
@@ -1,7 +1,17 @@
-
 import FWCore.ParameterSet.Config as cms
 
-#
 from CondTools.SiPixel.SiPixelGainCalibrationService_cfi import *
 from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizer_cfi import siPixelClusters as _siPixelClusters
-siPixelClustersPreSplitting = _siPixelClusters.clone()
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+siPixelClustersPreSplitting = SwitchProducerCUDA(
+    cpu = _siPixelClusters.clone()  # "cpu" case: a clone of the standard siPixelClusters module
+)
+# When the "gpu" process modifier is active, add a "cuda" case to the switch producer.
+from Configuration.ProcessModifiers.gpu_cff import gpu
+gpu.toModify(siPixelClustersPreSplitting,
+    cuda = cms.EDAlias(  # alias to the SiPixelClusteredmNewDetSetVector product of siPixelDigisClustersPreSplitting
+        siPixelDigisClustersPreSplitting = cms.VPSet(
+            cms.PSet(type = cms.string("SiPixelClusteredmNewDetSetVector"))
+        )
+    )
+)
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
new file mode 100644
index 0000000000000..c80f3b16b3a43
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
@@ -0,0 +1,21 @@
+import FWCore.ParameterSet.Config as cms
+
+from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import siPixelClustersPreSplitting
+from RecoLocalTracker.SiPixelClusterizer.siPixelRawToClusterCUDA_cfi import siPixelRawToClusterCUDA as _siPixelRawToClusterCUDA
+from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA
+from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import *
+from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import *
+
+siPixelClustersPreSplittingTask = cms.Task(siPixelClustersPreSplitting)
+# Clones of siPixelRawToClusterCUDA and siPixelDigisClustersFromSoA, grouped in their own Task.
+siPixelClustersCUDAPreSplitting = _siPixelRawToClusterCUDA.clone()
+siPixelDigisClustersPreSplitting = _siPixelDigisClustersFromSoA.clone()
+siPixelClustersPreSplittingTaskCUDA = cms.Task(
+    siPixelClustersCUDAPreSplitting,
+    siPixelDigisClustersPreSplitting,
+)
+# With the "gpu" process modifier, replace the Task by a copy that also contains the CUDA Task.
+from Configuration.ProcessModifiers.gpu_cff import gpu
+_siPixelClustersPreSplittingTask_gpu = siPixelClustersPreSplittingTask.copy()
+_siPixelClustersPreSplittingTask_gpu.add(siPixelClustersPreSplittingTaskCUDA)
+gpu.toReplaceWith(siPixelClustersPreSplittingTask, _siPixelClustersPreSplittingTask_gpu)
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml
index e4a31cc26cd56..66a87291221f2 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml
@@ -2,6 +2,7 @@
 <use name="clhep"/>
 <use name="root"/>
 <use name="CommonTools/UtilAlgos"/>
+<use name="CondFormats/L1TObjects"/>
 <use name="DataFormats/Common"/>
 <use name="DataFormats/DetId"/>
 <use name="DataFormats/L1GlobalTrigger"/>
@@ -9,6 +10,7 @@
 <use name="DataFormats/VertexReco"/>
 <use name="FWCore/Framework"/>
 <use name="FWCore/ParameterSet"/>
+<use name="FWCore/PluginManager"/>
 <use name="Geometry/Records"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
 <use name="L1Trigger/GlobalTriggerAnalyzer"/>
@@ -16,18 +18,33 @@
 <use name="TrackingTools/TrackFitters"/>
 <use name="TrackingTools/TrajectoryState"/>
 <use name="TrackingTools/TransientTrack"/>
+
 <library file="ReadPixClusters.cc" name="ReadPixClusters">
   <flags EDM_PLUGIN="1"/>
 </library>
-
 <library file="TestClusters.cc" name="TestClusters">
   <flags EDM_PLUGIN="1"/>
 </library>
-
 <library file="TestWithTracks.cc" name="TestWithTracks">
   <flags EDM_PLUGIN="1"/>
 </library>
-
 <library file="Triplet.cc" name="Triplet">
   <flags EDM_PLUGIN="1"/>
 </library>
+
+<bin file="gpuClustering.cu" name="gpuClustering_t">
+  <use name="cuda"/>
+  <use name="cub"/>
+  <use name="cuda-api-wrappers"/>
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <flags CXXFLAGS="-g"/>
+</bin>
+
+<bin file="gpuClustering.cu" name="gpuClustering_debug">
+  <use name="cuda"/>
+  <use name="cub"/>
+  <use name="cuda-api-wrappers"/>
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+</bin>
diff --git a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml
index d0f5f096dbb19..7918c7a4f4d9a 100644
--- a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml
@@ -1,12 +1,22 @@
-<use name="FWCore/ParameterSet"/>
-<use name="CondFormats/SiPixelObjects"/>
-<use name="CalibTracker/SiPixelESProducers"/>
-<use name="DataFormats/TrackerCommon"/>
-<use name="DataFormats/TrackerRecHit2D"/>
-<use name="CondFormats/SiPixelTransient"/>
-<use name="RecoLocalTracker/ClusterParameterEstimator"/>
-<use name="boost"/>
-<use name="vdt_headers"/>
+<use   name="FWCore/Framework"/>
+
+<use   name="FWCore/ParameterSet"/>
+
+<use   name="CondFormats/SiPixelObjects"/>
+<use   name="CalibTracker/SiPixelESProducers"/>
+<use   name="DataFormats/TrackerCommon"/>
+<use   name="DataFormats/SiPixelCluster"/>
+<use   name="DataFormats/TrackerRecHit2D"/>
+<use   name="TrackingTools/TrajectoryState"/>
+<use   name="CondFormats/SiPixelTransient"/>
+<use   name="boost"/>
+<use   name="vdt_headers"/>
+
+<use   name="cuda"/>
+<use   name="cuda-api-wrappers"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+
 <export>
-  <lib name="1"/>
+  <lib   name="1"/>
 </export>
diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h
index 4a7ba119b0a5b..f908325029afe 100644
--- a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h
+++ b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h
@@ -21,13 +21,13 @@
 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
 #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
 #include "Geometry/CommonDetUnit/interface/GeomDetType.h"
-#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h"
 #include "Geometry/CommonTopologies/interface/PixelTopology.h"
 #include "Geometry/CommonTopologies/interface/Topology.h"
 
 //--- For the configuration:
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include <memory>  // std::unique_ptr
 
 #include "DataFormats/GeometryCommonDetAlgo/interface/MeasurementPoint.h"
 #include "DataFormats/GeometryCommonDetAlgo/interface/MeasurementError.h"
@@ -51,244 +51,240 @@
 
 class RectangularPixelTopology;
 class MagneticField;
-class PixelCPEBase : public PixelClusterParameterEstimator {
+class PixelCPEBase : public PixelClusterParameterEstimator
+{
 public:
-  struct DetParam {
-    DetParam() {}
-    const PixelGeomDetUnit* theDet;
-    // gavril : replace RectangularPixelTopology with PixelTopology
-    const PixelTopology* theTopol;
-    const RectangularPixelTopology* theRecTopol;
-
-    GeomDetType::SubDetector thePart;
-    Local3DPoint theOrigin;
-    float theThickness;
-    float thePitchX;
-    float thePitchY;
-
-    float bz;  // local Bz
-    float bx;  // local Bx
-    LocalVector driftDirection;
-    float widthLAFractionX;   // Width-LA to Offset-LA in X
-    float widthLAFractionY;   // same in Y
-    float lorentzShiftInCmX;  // a FULL shift, in cm
-    float lorentzShiftInCmY;  // a FULL shift, in cm
-    int detTemplateId;        // det if for templates & generic errors
-    int detTemplateId2D;      // det if for 2D templates
-  };
-
-  struct ClusterParam {
-    ClusterParam(const SiPixelCluster& cl) : theCluster(&cl) {}
-
-    virtual ~ClusterParam() = default;
-
-    const SiPixelCluster* theCluster;
-
-    //--- Cluster-level quantities (filled in computeAnglesFrom....)
-    float cotalpha;
-    float cotbeta;
-
-    // G.Giurgiu (05/14/08) track local coordinates
-    // filled in computeAnglesFrom....
-    float trk_lp_x;
-    float trk_lp_y;
-
-    // ggiurgiu@jhu.edu (12/01/2010) : Needed for calling topology methods
-    // with track angles to handle surface deformations (bows/kinks)
-    // filled in computeAnglesFrom.... (btw redundant with the 4 above)
-    Topology::LocalTrackPred loc_trk_pred;
-
-    //--- Probability  (protected by hasFilledProb_)
-    float probabilityX_;
-    float probabilityY_;
-    float probabilityQ_;
-    int qBin_;  // always filled by qbin
-
-    bool isOnEdge_;              // filled in setTheClu
-    bool hasBadPixels_ = false;  // (never used in current code)
-    bool spansTwoROCs_;          // filled in setTheClu
-    bool hasFilledProb_ = false;
-    // ggiurgiu@jhu.edu (10/18/2008)
-    bool with_track_angle;        // filled in computeAnglesFrom....
-    bool filled_from_2d = false;  //
-
-    // More detailed edge information (for CPE ClusterRepair, and elsewhere...)
-    int edgeTypeX_ = 0;  // 0: not on edge, 1: low end on edge, 2: high end
-    int edgeTypeY_ = 0;  // 0: not on edge, 1: low end on edge, 2: high end
-  };
-
+   struct DetParam  // per-detector parameters; m_DetParams holds a vector of these
+   {
+      DetParam() {}
+      // Pointers default to null so that an entry which was never filled cannot carry a garbage address.
+      const PixelGeomDetUnit * theDet = nullptr;
+      // gavril : replace RectangularPixelTopology with PixelTopology
+      const PixelTopology * theTopol = nullptr;
+      const RectangularPixelTopology * theRecTopol = nullptr;
+      GeomDetType::SubDetector thePart;
+      Local3DPoint theOrigin;
+      float theThickness;
+      float thePitchX;
+      float thePitchY;
+
+      float bz; // local Bz
+      float bx; // local Bx
+      LocalVector driftDirection;
+      float widthLAFractionX;    // Width-LA to Offset-LA in X
+      float widthLAFractionY;    // same in Y
+      float lorentzShiftInCmX;   // a FULL shift, in cm
+      float lorentzShiftInCmY;   // a FULL shift, in cm
+      int   detTemplateId;       // det id for templates & generic errors
+   };
+   
+   struct ClusterParam  // per-cluster working state handed to the CPE steps called from getParameters()
+   {
+      ClusterParam(){}
+      ClusterParam(const SiPixelCluster & cl) : theCluster(&cl) {}
+
+      virtual ~ClusterParam() = default;  // derived parameter structs are destroyed through a base pointer
+
+      const SiPixelCluster * theCluster = nullptr;  // stores the address of the cluster passed in; null when default-constructed
+
+      //--- Cluster-level quantities (filled in computeAnglesFrom....)
+      float cotalpha;
+      float cotbeta;
+
+      // G.Giurgiu (05/14/08) track local coordinates
+      // filled in computeAnglesFrom....
+      float trk_lp_x;
+      float trk_lp_y;
+
+      // ggiurgiu@jhu.edu (12/01/2010) : Needed for calling topology methods
+      // with track angles to handle surface deformations (bows/kinks)
+      // filled in computeAnglesFrom.... (btw redundant with the 4 above)
+      Topology::LocalTrackPred loc_trk_pred;
+
+      //--- Probability  (protected by hasFilledProb_)
+      float probabilityX_ ;
+      float probabilityY_ ;
+      float probabilityQ_ ;
+      int    qBin_ ;  // always filled by qbin
+
+      bool  isOnEdge_ ; // filled in setTheClu
+      bool  hasBadPixels_ = false;  // (never used in current code)
+      bool  spansTwoROCs_ ; // filled in setTheClu
+      bool  hasFilledProb_ =false;
+      // ggiurgiu@jhu.edu (10/18/2008)
+      bool with_track_angle; // filled in computeAnglesFrom....
+      bool filled_from_2d = false; //
+
+      // More detailed edge information (for CPE ClusterRepair, and elsewhere...)
+      int   edgeTypeX_ = 0;   // 0: not on edge, 1: low end on edge, 2: high end
+      int   edgeTypeY_ = 0;   // 0: not on edge, 1: low end on edge, 2: high end
+   };
+   
 public:
-  PixelCPEBase(edm::ParameterSet const& conf,
-               const MagneticField* mag,
-               const TrackerGeometry& geom,
-               const TrackerTopology& ttopo,
-               const SiPixelLorentzAngle* lorentzAngle,
-               const SiPixelGenErrorDBObject* genErrorDBObject,
-               const SiPixelTemplateDBObject* templateDBobject,
-               const SiPixelLorentzAngle* lorentzAngleWidth,
-               int flag = 0  // flag=0 for generic, =1 for templates
-  );                         // NEW
-
-  static void fillPSetDescription(edm::ParameterSetDescription& desc);
-
-  //--------------------------------------------------------------------------
-  // Allow the magnetic field to be set/updated later.
-  //--------------------------------------------------------------------------
-  //inline void setMagField(const MagneticField *mag) const { magfield_ = mag; } // Not used, AH
-
-  //--------------------------------------------------------------------------
-  // Obtain the angles from the position of the DetUnit.
-  //--------------------------------------------------------------------------
-
-  inline ReturnType getParameters(const SiPixelCluster& cl, const GeomDetUnit& det) const override {
+   PixelCPEBase(edm::ParameterSet const& conf, const MagneticField * mag, const TrackerGeometry& geom, const TrackerTopology& ttopo,
+                const SiPixelLorentzAngle * lorentzAngle,
+                const SiPixelGenErrorDBObject * genErrorDBObject,
+                const SiPixelTemplateDBObject * templateDBobject,
+                const SiPixelLorentzAngle * lorentzAngleWidth,
+                int flag=0  // flag=0 for generic, =1 for templates
+   );  // NEW
+   
+   //--------------------------------------------------------------------------
+   // Allow the magnetic field to be set/updated later.
+   //--------------------------------------------------------------------------
+   //inline void setMagField(const MagneticField *mag) const { magfield_ = mag; } // Not used, AH
+   
+   
+   //--------------------------------------------------------------------------
+   // Obtain the angles from the position of the DetUnit.
+   //--------------------------------------------------------------------------
+   
+   inline ReturnType getParameters(const SiPixelCluster & cl,
+                                   const GeomDetUnit    & det ) const override
+   {
 #ifdef EDM_ML_DEBUG
-    nRecHitsTotal_++;
-    //std::cout<<" in PixelCPEBase:localParameters(all) - "<<nRecHitsTotal_<<std::endl;  //dk
+      nRecHitsTotal_++ ;
+      //std::cout<<" in PixelCPEBase:localParameters(all) - "<<nRecHitsTotal_<<std::endl;  //dk
 #endif
-
-    DetParam const& theDetParam = detParam(det);
-    std::unique_ptr<ClusterParam> theClusterParam = createClusterParam(cl);
-    setTheClu(theDetParam, *theClusterParam);
-    computeAnglesFromDetPosition(theDetParam, *theClusterParam);
-
-    // localPosition( cl, det ) must be called before localError( cl, det ) !!!
-    LocalPoint lp = localPosition(theDetParam, *theClusterParam);
-    LocalError le = localError(theDetParam, *theClusterParam);
-    SiPixelRecHitQuality::QualWordType rqw = rawQualityWord(*theClusterParam);
-    auto tuple = std::make_tuple(lp, le, rqw);
-
-    //std::cout<<" in PixelCPEBase:localParameters(all) - "<<lp.x()<<" "<<lp.y()<<std::endl;  //dk
-    return tuple;
-  }
-
-  //--------------------------------------------------------------------------
-  // In principle we could use the track too to obtain alpha and beta.
-  //--------------------------------------------------------------------------
-  inline ReturnType getParameters(const SiPixelCluster& cl,
-                                  const GeomDetUnit& det,
-                                  const LocalTrajectoryParameters& ltp) const override {
+      // Returns (local position, local error, raw quality word) for cluster cl on detector det.
+      DetParam const & theDetParam = detParam(det);
+      std::unique_ptr<ClusterParam> theClusterParam(createClusterParam(cl));  // owned here: freed even if a step below throws
+      setTheClu( theDetParam, *theClusterParam );
+      computeAnglesFromDetPosition(theDetParam, *theClusterParam);
+
+      // localPosition( cl, det ) must be called before localError( cl, det ) !!!
+      LocalPoint lp = localPosition(theDetParam, *theClusterParam);
+      LocalError le = localError(theDetParam, *theClusterParam);
+      SiPixelRecHitQuality::QualWordType rqw = rawQualityWord(*theClusterParam);
+      auto tuple = std::make_tuple(lp, le , rqw);
+      // no explicit delete: the unique_ptr destroys theClusterParam on every exit path
+
+      //std::cout<<" in PixelCPEBase:localParameters(all) - "<<lp.x()<<" "<<lp.y()<<std::endl;  //dk
+      return tuple;
+   }
+   
+   //--------------------------------------------------------------------------
+   // In principle we could use the track too to obtain alpha and beta.
+   //--------------------------------------------------------------------------
+   inline ReturnType getParameters(const SiPixelCluster & cl,
+                                   const GeomDetUnit    & det,
+                                   const LocalTrajectoryParameters & ltp ) const override
+   {
 #ifdef EDM_ML_DEBUG
-    nRecHitsTotal_++;
-    //std::cout<<" in PixelCPEBase:localParameters(on track) - "<<nRecHitsTotal_<<std::endl;  //dk
+      nRecHitsTotal_++ ;
+      //std::cout<<" in PixelCPEBase:localParameters(on track) - "<<nRecHitsTotal_<<std::endl;  //dk
 #endif
-
-    DetParam const& theDetParam = detParam(det);
-    std::unique_ptr<ClusterParam> theClusterParam = createClusterParam(cl);
-    setTheClu(theDetParam, *theClusterParam);
-    computeAnglesFromTrajectory(theDetParam, *theClusterParam, ltp);
-
-    // localPosition( cl, det ) must be called before localError( cl, det ) !!!
-    LocalPoint lp = localPosition(theDetParam, *theClusterParam);
-    LocalError le = localError(theDetParam, *theClusterParam);
-    SiPixelRecHitQuality::QualWordType rqw = rawQualityWord(*theClusterParam);
-    auto tuple = std::make_tuple(lp, le, rqw);
-
-    //std::cout<<" in PixelCPEBase:localParameters(on track) - "<<lp.x()<<" "<<lp.y()<<std::endl;  //dk
-    return tuple;
-  }
-
+      // Same as the two-argument overload, but the angles are computed from the trajectory parameters ltp.
+      DetParam const & theDetParam = detParam(det);
+      std::unique_ptr<ClusterParam> theClusterParam(createClusterParam(cl));  // owned here: freed even if a step below throws
+      setTheClu( theDetParam, *theClusterParam );
+      computeAnglesFromTrajectory(theDetParam, *theClusterParam, ltp);
+
+      // localPosition( cl, det ) must be called before localError( cl, det ) !!!
+      LocalPoint lp = localPosition(theDetParam, *theClusterParam);
+      LocalError le = localError(theDetParam, *theClusterParam);
+      SiPixelRecHitQuality::QualWordType rqw = rawQualityWord(*theClusterParam);
+      auto tuple = std::make_tuple(lp, le , rqw);
+      // no explicit delete: the unique_ptr destroys theClusterParam on every exit path
+
+      //std::cout<<" in PixelCPEBase:localParameters(on track) - "<<lp.x()<<" "<<lp.y()<<std::endl;  //dk
+      return tuple;
+   }
+   
+   
+   
 private:
-  virtual std::unique_ptr<ClusterParam> createClusterParam(const SiPixelCluster& cl) const = 0;
-
-  //--------------------------------------------------------------------------
-  // This is where the action happens.
-  //--------------------------------------------------------------------------
-  virtual LocalPoint localPosition(DetParam const& theDetParam, ClusterParam& theClusterParam) const = 0;
-  virtual LocalError localError(DetParam const& theDetParam, ClusterParam& theClusterParam) const = 0;
-
-  void fillDetParams();
-
-  //-----------------------------------------------------------------------------
-  //! A convenience method to fill a whole SiPixelRecHitQuality word in one shot.
-  //! This way, we can keep the details of what is filled within the pixel
-  //! code and not expose the Transient SiPixelRecHit to it as well.  The name
-  //! of this function is chosen to match the one in SiPixelRecHit.
-  //-----------------------------------------------------------------------------
-  SiPixelRecHitQuality::QualWordType rawQualityWord(ClusterParam& theClusterParam) const;
-
+   virtual ClusterParam * createClusterParam(const SiPixelCluster & cl) const = 0;  // factory hook; the caller owns the returned object (getParameters destroys it)
+   
+   //--------------------------------------------------------------------------
+   // This is where the action happens.
+   //--------------------------------------------------------------------------
+   virtual LocalPoint localPosition(DetParam const & theDetParam, ClusterParam & theClusterParam) const = 0;
+   virtual LocalError localError   (DetParam const & theDetParam, ClusterParam & theClusterParam) const = 0;
+   
+   void fillDetParams();
+   
+   //-----------------------------------------------------------------------------
+   //! A convenience method to fill a whole SiPixelRecHitQuality word in one shot.
+   //! This way, we can keep the details of what is filled within the pixel
+   //! code and not expose the Transient SiPixelRecHit to it as well.  The name
+   //! of this function is chosen to match the one in SiPixelRecHit.
+   //-----------------------------------------------------------------------------
+   SiPixelRecHitQuality::QualWordType rawQualityWord(ClusterParam & theClusterParam) const;
+   
 protected:
-  //--- All methods and data members are protected to facilitate (for now)
-  //--- access from derived classes.
-
-  typedef GloballyPositioned<double> Frame;
-
-  //---------------------------------------------------------------------------
-  //  Data members
-  //---------------------------------------------------------------------------
-
-  //--- Counters
+   //--- All methods and data members are protected to facilitate (for now)
+   //--- access from derived classes.
+   
+   typedef GloballyPositioned<double> Frame;
+   
+   //---------------------------------------------------------------------------
+   //  Data members
+   //---------------------------------------------------------------------------
+   
+   //--- Counters
 #ifdef EDM_ML_DEBUG
-  mutable std::atomic<int> nRecHitsTotal_;     //for debugging only
-  mutable std::atomic<int> nRecHitsUsedEdge_;  //for debugging only
+   mutable std::atomic<int>    nRecHitsTotal_ ; //for debugging only
+   mutable std::atomic<int>    nRecHitsUsedEdge_ ; //for debugging only
 #endif
-
-  // Added new members
-  float lAOffset_;     // la used to calculate the offset from configuration (for testing)
-  float lAWidthBPix_;  // la used to calculate the cluster width from conf.
-  float lAWidthFPix_;  // la used to calculate the cluster width from conf.
-  //bool useLAAlignmentOffsets_; // lorentz angle offsets detrmined by alignment
-  bool useLAOffsetFromConfig_;  // lorentz angle used to calculate the offset
-  bool useLAWidthFromConfig_;   // lorentz angle used to calculate the cluster width
-  bool useLAWidthFromDB_;       // lorentz angle used to calculate the cluster width
-
-  //--- Global quantities
-  int theVerboseLevel;  // algorithm's verbosity
-  int theFlag_;         // flag to recognice if we are in generic or templates
-
-  const MagneticField* magfield_;  // magnetic field
-  const TrackerGeometry& geom_;    // geometry
-  const TrackerTopology& ttopo_;   // Tracker Topology
-
-  const SiPixelLorentzAngle* lorentzAngle_;
-  const SiPixelLorentzAngle* lorentzAngleWidth_;  // for the charge width (generic)
-
-  const SiPixelGenErrorDBObject* genErrorDBObject_;  // NEW
-  //const SiPixelCPEGenericErrorParm * genErrorParm_;  // OLD
-
-  const SiPixelTemplateDBObject* templateDBobject_;
-  bool alpha2Order;  // switch on/off E.B effect.
-
-  bool DoLorentz_;
-  bool LoadTemplatesFromDB_;
-
-  //errors for template reco for edge hits, based on observed residuals from
-  //studies likely done in 2011...
-  static constexpr float xEdgeXError_ = 23.0f;
-  static constexpr float xEdgeYError_ = 39.0f;
-
-  static constexpr float yEdgeXError_ = 24.0f;
-  static constexpr float yEdgeYError_ = 96.0f;
-
-  static constexpr float bothEdgeXError_ = 31.0f;
-  static constexpr float bothEdgeYError_ = 90.0f;
-
-  static constexpr float clusterSplitMaxError_ = 7777.7f;
-
-  //---------------------------------------------------------------------------
-  //  Geometrical services to subclasses.
-  //---------------------------------------------------------------------------
+   
+   // Added new members
+   float lAOffset_; // la used to calculate the offset from configuration (for testing)
+   float lAWidthBPix_;  // la used to calculate the cluster width from conf.
+   float lAWidthFPix_;  // la used to calculate the cluster width from conf.
+   //bool useLAAlignmentOffsets_; // lorentz angle offsets detrmined by alignment
+   bool useLAOffsetFromConfig_; // lorentz angle used to calculate the offset
+   bool useLAWidthFromConfig_; // lorentz angle used to calculate the cluster width
+   bool useLAWidthFromDB_;     // lorentz angle used to calculate the cluster width
+   
+   //--- Global quantities
+   int     theVerboseLevel;                    // algorithm's verbosity
+   int     theFlag_;   // flag to recognice if we are in generic or templates
+   
+   const MagneticField * magfield_;          // magnetic field
+   const TrackerGeometry & geom_;          // geometry
+   const TrackerTopology & ttopo_;         // Tracker Topology
+   
+   const SiPixelLorentzAngle * lorentzAngle_;
+   const SiPixelLorentzAngle * lorentzAngleWidth_;  // for the charge width (generic)
+   
+   const SiPixelGenErrorDBObject * genErrorDBObject_;  // NEW
+   //const SiPixelCPEGenericErrorParm * genErrorParm_;  // OLD
+   
+   const SiPixelTemplateDBObject * templateDBobject_;
+   bool  alpha2Order;                          // switch on/off E.B effect.
+   
+   bool DoLorentz_;
+   bool LoadTemplatesFromDB_;
+   
+   //---------------------------------------------------------------------------
+   //  Geometrical services to subclasses.
+   //---------------------------------------------------------------------------
 protected:
-  void computeAnglesFromDetPosition(DetParam const& theDetParam, ClusterParam& theClusterParam) const;
-
-  void computeAnglesFromTrajectory(DetParam const& theDetParam,
-                                   ClusterParam& theClusterParam,
-                                   const LocalTrajectoryParameters& ltp) const;
-
-  void setTheClu(DetParam const&, ClusterParam& theClusterParam) const;
-
-  LocalVector driftDirection(DetParam& theDetParam, GlobalVector bfield) const;
-  LocalVector driftDirection(DetParam& theDetParam, LocalVector bfield) const;
-  void computeLorentzShifts(DetParam&) const;
-
-  //---------------------------------------------------------------------------
-  //  Cluster-level services.
-  //---------------------------------------------------------------------------
-
-  DetParam const& detParam(const GeomDetUnit& det) const;
-
-  using DetParams = std::vector<DetParam>;
-
-  DetParams m_DetParams = DetParams(1440);
+   void computeAnglesFromDetPosition( DetParam const & theDetParam, ClusterParam & theClusterParam ) const;
+   
+   void computeAnglesFromTrajectory ( DetParam const & theDetParam, ClusterParam & theClusterParam,
+                                     const LocalTrajectoryParameters & ltp) const;
+   
+   void  setTheClu( DetParam const &, ClusterParam & theClusterParam ) const ;
+   
+   LocalVector driftDirection       (DetParam & theDetParam, GlobalVector bfield ) const ;
+   LocalVector driftDirection       (DetParam & theDetParam, LocalVector bfield ) const ;
+   void computeLorentzShifts(DetParam &) const ;
+   
+   
+   //---------------------------------------------------------------------------
+   //  Cluster-level services.
+   //---------------------------------------------------------------------------
+   
+   DetParam const & detParam(const GeomDetUnit & det) const;
+   
+   using DetParams=std::vector<DetParam>;
+   
+   DetParams m_DetParams=DetParams(1440);
+   
 };
 
 #endif
+
+
diff --git a/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h
new file mode 100644
index 0000000000000..9b8924988e848
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h
@@ -0,0 +1,98 @@
+#ifndef RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h
+#define RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h
+
+#include <utility>
+
+#include <cuda/api_wrappers.h>
+
+#include "CalibTracker/SiPixelESProducers/interface/SiPixelCPEGenericDBErrorParametrization.h"
+#include "CondFormats/SiPixelTransient/interface/SiPixelGenError.h"
+#include "CondFormats/SiPixelTransient/interface/SiPixelTemplate.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+class MagneticField;
+class PixelCPEFast final : public PixelCPEBase  // CPE that additionally exposes its parameters as a GPU product (getGPUProductAsync)
+{
+public:
+   struct ClusterParamGeneric : ClusterParam  // ClusterParam extended with the truncation value and error estimates below
+   {
+      ClusterParamGeneric() {}
+      ClusterParamGeneric(const SiPixelCluster & cl) : ClusterParam(cl){}
+
+      // The truncation value pix_maximum is an angle-dependent cutoff on the
+      // individual pixel signals. It should be applied to all pixels in the
+      // cluster [signal_i = fminf(signal_i, pixmax)] before the column and row
+      // sums are made. Morris
+      int pixmx;
+      
+      // These are errors predicted by PIXELAV
+      float sigmay; // CPE Generic y-error for multi-pixel cluster
+      float sigmax; // CPE Generic x-error for multi-pixel cluster
+      float sy1;    // CPE Generic y-error for single single-pixel cluster
+      float sy2;    // CPE Generic y-error for single double-pixel cluster
+      float sx1;    // CPE Generic x-error for single single-pixel cluster
+      float sx2;    // CPE Generic x-error for single double-pixel cluster
+      
+   };
+   // NOTE(review): parameters are unnamed; by type order they look like PixelCPEBase's arguments without the template DB object and flag - confirm.
+   PixelCPEFast(edm::ParameterSet const& conf, const MagneticField *,
+                   const TrackerGeometry&, const TrackerTopology&, const SiPixelLorentzAngle *,
+                   const SiPixelGenErrorDBObject *, const SiPixelLorentzAngle *);
+   
+   // Destructor is declared here and defined out of line.
+   ~PixelCPEFast() override;
+
+    // The return value can only be used safely in kernels launched on
+    // the same cudaStream, or after cudaStreamSynchronize.
+    const pixelCPEforGPU::ParamsOnGPU *getGPUProductAsync(cuda::stream_t<>& cudaStream) const;
+
+private:
+   ClusterParam * createClusterParam(const SiPixelCluster & cl) const override;
+   // Overrides of the PixelCPEBase hooks (createClusterParam above, localPosition / localError below).
+   LocalPoint localPosition (DetParam const & theDetParam, ClusterParam & theClusterParam) const override;
+   LocalError localError   (DetParam const & theDetParam, ClusterParam & theClusterParam) const override;
+
+   void errorFromTemplates(DetParam const & theDetParam, ClusterParamGeneric & theClusterParam, float qclus) const;
+   // Static helper; the role of each argument is given by the inline //!< notes.
+   static void
+   collect_edge_charges(ClusterParam & theClusterParam,  //!< input, the cluster
+                        int & Q_f_X,              //!< output, Q first  in X
+                        int & Q_l_X,              //!< output, Q last   in X
+                        int & Q_f_Y,              //!< output, Q first  in Y
+                        int & Q_l_Y,              //!< output, Q last   in Y
+                        bool truncate
+   );
+   
+   //--- Data members
+   bool UseErrorsFromTemplates_;
+   bool TruncatePixelCharge_;
+   
+   float EdgeClusterErrorX_;
+   float EdgeClusterErrorY_;
+   // NOTE(review): names suggest error tables for barrel layer 1 (l1), other barrel layers (ln) and endcap, plus *_def_ defaults - confirm.
+   std::vector<float> xerr_barrel_l1_,  yerr_barrel_l1_,  xerr_barrel_ln_;
+   std::vector<float> yerr_barrel_ln_,  xerr_endcap_,  yerr_endcap_;
+   float xerr_barrel_l1_def_, yerr_barrel_l1_def_,  xerr_barrel_ln_def_;
+   float yerr_barrel_ln_def_, xerr_endcap_def_, yerr_endcap_def_;
+   
+   //--- DB Error Parametrization object, new light templates 
+   std::vector< SiPixelGenErrorStore > thePixelGenError_;
+   // Parameters in the pixelCPEforGPU layout; the vector allocates through CUDAHostAllocator (presumably pinned host memory - confirm).
+   std::vector<pixelCPEforGPU::DetParams, CUDAHostAllocator<pixelCPEforGPU::DetParams>> m_detParamsGPU;
+   pixelCPEforGPU::CommonParams m_commonParamsGPU;     
+   // Payload wrapped by CUDAESProduct; ~GPUData is defined out of line (presumably releases the device copies - confirm).
+   struct GPUData {
+     ~GPUData();
+     // not needed if not used on CPU...
+     pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU;
+     pixelCPEforGPU::ParamsOnGPU * d_paramsOnGPU = nullptr;  // copy of the above on the Device
+   };
+   CUDAESProduct<GPUData> gpuData_;
+   // NOTE(review): presumably fills m_detParamsGPU / m_commonParamsGPU - confirm in the implementation file.
+   void fillParamsForGpu();
+};
+
+#endif // RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h
diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
new file mode 100644
index 0000000000000..fa326865ced73
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
@@ -0,0 +1,289 @@
+#ifndef RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h
+#define RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <iterator>
+
+#include "DataFormats/GeometrySurface/interface/SOARotation.h"
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h"
+
+namespace pixelCPEforGPU {
+
+  using Frame = SOAFrame<float>;
+  using Rotation = SOARotation<float>;
+
+  // Parameters shared by every module ("all modules are identical!").
+  struct CommonParams {
+    float theThicknessB;  // thickness used by position() when DetParams::isBarrel is true
+    float theThicknessE;  // thickness used by position() when DetParams::isBarrel is false
+    float thePitchX;      // pitch along local x; multiplies the row-derived coordinate in position()
+    float thePitchY;      // pitch along local y; multiplies the column-derived coordinate in position()
+  };
+
+  struct DetParams {  // per-module parameters, accessed through ParamsOnGPU::detParams(i)
+    bool isBarrel;    // selects the barrel thickness in position() and the barrel tables in errorFromSize()
+    bool isPosZ;
+    uint16_t layer;   // errorFromSize() treats layer == 1 separately from the other barrel layers
+    uint16_t index;
+    uint32_t rawId;
+
+    float shiftX;        // added to the x position in position() ("lorentz offset correction")
+    float shiftY;        // same, for y
+    float chargeWidthX;  // passed to correction() as lorentz_shift for the x projection
+    float chargeWidthY;  // same, for the y projection
+
+    float x0,y0,z0;  // the vertex in the local coord of the detector
+
+    float sx[3], sy[3]; // the errors; errorFromDB() picks index 0 (size > 1), 1 (size 1) or 2 (size 1 on a big pixel)
+
+    Frame frame;
+  };
+
+
+  struct ParamsOnGPU {  // pair of raw pointers to the parameter blocks; no destructor, so nothing is released here
+    CommonParams * m_commonParams;
+    DetParams * m_detParams;
+    // read-only accessors; the local __restrict__ pointers are only aliasing hints for the compiler
+    constexpr
+    CommonParams const & __restrict__ commonParams() const {
+      CommonParams const * __restrict__ l = m_commonParams;
+       return *l;
+    }
+    constexpr
+    DetParams const &  __restrict__ detParams(int i) const {  // i is not range-checked
+      DetParams const * __restrict__ l = m_detParams;
+       return l[i];
+    }
+  };
+
+  // SOA (on device): N-slot arrays, the functions below address one cluster through a common index ic
+  template<uint32_t N>
+  struct ClusParamsT {
+    uint32_t minRow[N];  // row/column limits of the cluster, read by position() and the error functions
+    uint32_t maxRow[N];
+    uint32_t minCol[N];
+    uint32_t maxCol[N];
+    // edge charges, forwarded to correction() as Q_f / Q_l ("charge in the first/last pixel")
+    int32_t Q_f_X[N];
+    int32_t Q_l_X[N];
+    int32_t Q_f_Y[N];
+    int32_t Q_l_Y[N];
+    // not touched by the functions in this header -- presumably the total cluster charge (TODO confirm)
+    int32_t charge[N];
+    // outputs of position()
+    float xpos[N];
+    float ypos[N];
+    // outputs of errorFromSize() / errorFromDB()
+    float xerr[N];
+    float yerr[N];
+  };
+
+
+  constexpr uint32_t MaxClusInModule=256;  // number of cluster slots in each ClusParams array
+  using ClusParams = ClusParamsT<MaxClusInModule>;  // sized by the constant above so the two cannot drift apart
+
+  constexpr inline  // writes cotalpha = -(x - x0)/z0 and cotbeta = -(y - y0)/z0; z0 == 0 is not guarded
+  void computeAnglesFromDet(DetParams const & __restrict__ detParams, float const x, float const y, float & cotalpha, float & cotbeta) {
+    // x,y local position on det
+    auto gvx = x - detParams.x0;
+    auto gvy = y - detParams.y0;
+    auto gvz = -1.f / detParams.z0;
+    // normalization not required as only ratio used...
+    // calculate angles
+    cotalpha = gvx * gvz;
+    cotbeta  = gvy * gvz;
+  }
+
+  constexpr inline  // returns 0.5 * (Q_l - Q_f)/(Q_l + Q_f) * W_eff for one projection, or 0 for a size-1 cluster
+  float correction(
+      int sizeM1,                       //!< cluster size minus one in this projection (0 means size 1)
+      int Q_f,                          //!< Charge in the first pixel.
+      int Q_l,                          //!< Charge in the last pixel.
+      uint16_t upper_edge_first_pix,    //!< As the name says.
+      uint16_t lower_edge_last_pix,     //!< As the name says.
+      float lorentz_shift,              //!< L-shift at half thickness
+      float theThickness,               //detector thickness
+      float cot_angle,                  //!< cot of alpha_ or beta_
+      float pitch,                      //!< thePitchX or thePitchY
+      bool first_is_big,                //!< true if the first is big
+      bool last_is_big )                //!< true if the last is big
+  {
+    if (0 == sizeM1)    // size 1
+      return 0;
+
+    float W_eff = 0;
+    bool simple = true;  // stays true for sizes above 2, which always use the edge-pixel average below
+    if (1 == sizeM1) {  // size 2
+      //--- Width of the clusters minus the edge (first and last) pixels.
+      //--- In the note, they are denoted x_F and x_L (and y_F and y_L)
+      // assert(lower_edge_last_pix >= upper_edge_first_pix);
+      auto W_inner = pitch * float(lower_edge_last_pix - upper_edge_first_pix);     // in cm
+
+      //--- Predicted charge width from geometry
+      auto W_pred  = theThickness * cot_angle       // geometric correction (in cm)
+                   - lorentz_shift;                 // (in cm) &&& check fpix!
+
+      W_eff = std::abs(W_pred) - W_inner;
+
+      //--- If the observed charge width is inconsistent with the expectations
+      //--- based on the track, do *not* use W_pred-W_inner.  Instead, replace
+      //--- it with an *average* effective charge width, which is the average
+      //--- length of the edge pixels.
+      simple = (W_eff < 0.0f) | (W_eff > pitch);    // this produces "large" regressions for very small numeric differences...
+    }
+
+    if (simple) {
+      //--- Total length of the two edge pixels (first+last)
+      float sum_of_edge = 2.0f;
+      if (first_is_big) sum_of_edge += 1.0f;
+      if (last_is_big)  sum_of_edge += 1.0f;
+      W_eff = pitch * 0.5f * sum_of_edge;           // ave. length of edge pixels (first+last) (cm)
+    }
+
+    //--- Finally, compute the position in this projection
+    float Qdiff = Q_l - Q_f;
+    float Qsum  = Q_l + Q_f;
+
+    //--- Temporary fix for clusters with both first and last pixel with charge = 0
+    if (Qsum == 0)
+      Qsum = 1.0f;
+
+    return 0.5f * (Qdiff/Qsum) * W_eff;
+  }
+
+  constexpr inline
+  void position(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
+    // Computes the local position of cluster ic and stores it in cp.xpos[ic] / cp.ypos[ic].
+    //--- Upper Right corner of Lower Left pixel -- in measurement frame
+    uint16_t llx = cp.minRow[ic]+1;
+    uint16_t lly = cp.minCol[ic]+1;
+
+    //--- Lower Left corner of Upper Right pixel -- in measurement frame
+    uint16_t urx = cp.maxRow[ic];
+    uint16_t ury = cp.maxCol[ic];
+
+    auto llxl = phase1PixelTopology::localX(llx);
+    auto llyl = phase1PixelTopology::localY(lly);
+    auto urxl = phase1PixelTopology::localX(urx);
+    auto uryl = phase1PixelTopology::localY(ury);
+    // sums of the two inner-corner coordinates; halved below to obtain the midpoint
+    auto mx = llxl+urxl;
+    auto my = llyl+uryl;
+
+    // apply the lorentz offset correction
+    auto xPos = detParams.shiftX + comParams.thePitchX*(0.5f*float(mx)+float(phase1PixelTopology::xOffset));
+    auto yPos = detParams.shiftY + comParams.thePitchY*(0.5f*float(my)+float(phase1PixelTopology::yOffset));
+
+    float cotalpha=0, cotbeta=0;
+
+    computeAnglesFromDet(detParams, xPos,  yPos, cotalpha, cotbeta);
+
+    auto thickness = detParams.isBarrel ? comParams.theThicknessB : comParams.theThicknessE;
+    // charge-sharing corrections: x uses the row extent, y the column extent
+    auto xcorr = correction(
+        cp.maxRow[ic]-cp.minRow[ic],
+        cp.Q_f_X[ic], cp.Q_l_X[ic],
+        llxl, urxl,
+        detParams.chargeWidthX,   // lorentz shift in cm
+        thickness,
+        cotalpha,
+        comParams.thePitchX,
+        phase1PixelTopology::isBigPixX(cp.minRow[ic]),
+        phase1PixelTopology::isBigPixX(cp.maxRow[ic]) );
+
+    auto ycorr = correction(
+        cp.maxCol[ic]-cp.minCol[ic],
+        cp.Q_f_Y[ic], cp.Q_l_Y[ic],
+        llyl, uryl,
+        detParams.chargeWidthY,   // lorentz shift in cm
+        thickness,
+        cotbeta,
+        comParams.thePitchY,
+        phase1PixelTopology::isBigPixY(cp.minCol[ic]),
+        phase1PixelTopology::isBigPixY(cp.maxCol[ic]) );
+
+    cp.xpos[ic]=xPos+xcorr;
+    cp.ypos[ic]=yPos+ycorr;
+  }
+
+  constexpr inline  // fills cp.xerr[ic] / cp.yerr[ic] from the hard-coded tables below; comParams is not used
+  void errorFromSize(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
+    // Edge cluster errors: defaults, kept when the cluster touches a module edge or is a single big pixel
+    cp.xerr[ic]= 0.0050;
+    cp.yerr[ic]= 0.0085;
+
+    // FIXME these are errors from Run1
+    constexpr float xerr_barrel_l1[] = { 0.00115, 0.00120, 0.00088 };
+    constexpr float xerr_barrel_l1_def = 0.00200;           // 0.01030;
+    constexpr float yerr_barrel_l1[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
+    constexpr float yerr_barrel_l1_def = 0.00210;
+    constexpr float xerr_barrel_ln[] = { 0.00115, 0.00120, 0.00088 };
+    constexpr float xerr_barrel_ln_def = 0.00200; // 0.01030;
+    constexpr float yerr_barrel_ln[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
+    constexpr float yerr_barrel_ln_def = 0.00210;
+    constexpr float xerr_endcap[] = { 0.0020, 0.0020 };
+    constexpr float xerr_endcap_def = 0.0020;
+    constexpr float yerr_endcap[] = { 0.00210 };
+    constexpr float yerr_endcap_def = 0.00210;
+    // sx / sy are the cluster sizes minus one; they index the tables above
+    auto sx = cp.maxRow[ic] - cp.minRow[ic];
+    auto sy = cp.maxCol[ic] - cp.minCol[ic];
+
+    // is edgy ?
+    bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
+    bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
+    // is one and big?
+    bool isBig1X = (0==sx)  && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
+    bool isBig1Y = (0==sy)  && phase1PixelTopology::isBigPixY(cp.minCol[ic]);
+
+    // table lookup by size-minus-one; sizes beyond the end of a table fall back to its *_def value
+    if (!isEdgeX && !isBig1X ) {
+      if (not detParams.isBarrel) {
+        cp.xerr[ic] = sx <std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def;
+      } else if (detParams.layer == 1) {
+        cp.xerr[ic] = sx <std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx]: xerr_barrel_l1_def;
+      } else {
+        cp.xerr[ic] = sx <std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx]: xerr_barrel_ln_def;
+      }
+    }
+
+    if (!isEdgeY && !isBig1Y) {
+      if (not detParams.isBarrel) {
+        cp.yerr[ic] = sy <std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def;
+      } else if (detParams.layer == 1) {
+        cp.yerr[ic] = sy <std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy]: yerr_barrel_l1_def;
+      } else {
+        cp.yerr[ic] = sy <std::size(yerr_barrel_ln) ? yerr_barrel_ln[sy]: yerr_barrel_ln_def;
+      }
+    }
+  }
+
+
+  constexpr inline  // fills cp.xerr[ic] / cp.yerr[ic] from detParams.sx / detParams.sy; comParams is not used
+  void errorFromDB(CommonParams const & __restrict__ comParams, DetParams const & __restrict__ detParams, ClusParams & cp, uint32_t ic) {
+    // Edge cluster errors: defaults, kept only when the cluster touches a module edge
+    cp.xerr[ic]= 0.0050f;
+    cp.yerr[ic]= 0.0085f;
+
+    auto sx = cp.maxRow[ic] - cp.minRow[ic];
+    auto sy = cp.maxCol[ic] - cp.minCol[ic];
+
+    // is edgy ?
+    bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
+    bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
+    // is one and big?  ix / iy end up 0 for size > 1, 1 for size 1, 2 for size 1 on a big pixel
+    uint32_t ix = (0==sx);
+    uint32_t iy = (0==sy);
+    ix+= (0==sx)  && phase1PixelTopology::isBigPixX(cp.minRow[ic]);
+    iy+= (0==sy)  && phase1PixelTopology::isBigPixY(cp.minCol[ic]);
+
+    if (not isEdgeX) cp.xerr[ic] = detParams.sx[ix];
+    if (not isEdgeY) cp.yerr[ic] = detParams.sy[iy];
+  }
+
+}
+
+#endif  // RecoLocalTracker_SiPixelRecHits_pixelCPEforGPU_h
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
index e02a0b722c1ae..a8af0c8a7c4f9 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
@@ -1,7 +1,12 @@
+<use name="DataFormats/TrackerCommon"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/Producer"/>
+<use name="HeterogeneousCore/Product"/>
 <use name="RecoLocalTracker/ClusterParameterEstimator"/>
 <use name="RecoLocalTracker/Records"/>
 <use name="RecoLocalTracker/SiPixelRecHits"/>
-<use name="DataFormats/TrackerCommon"/>
-<library file="*.cc" name="RecoLocalTrackerSiPixelRecHitsPlugins">
+<library file="*.cc *.cu" name="RecoLocalTrackerSiPixelRecHitsPlugins">
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc b/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc
new file mode 100644
index 0000000000000..344625cba01b6
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelCPEFastESProducer.cc
@@ -0,0 +1,103 @@
+#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h"
+#include "MagneticField/Engine/interface/MagneticField.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/ModuleFactory.h"
+
+// new record 
+#include "CondFormats/DataRecord/interface/SiPixelGenErrorDBObjectRcd.h"
+
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h"
+#include "RecoLocalTracker/ClusterParameterEstimator/interface/PixelClusterParameterEstimator.h"
+#include <memory>
+
+class  PixelCPEFastESProducer: public edm::ESProducer{  // ESProducer whose produce() builds a PixelCPEFast
+ public:
+  PixelCPEFastESProducer(const edm::ParameterSet & p);
+  std::shared_ptr<PixelClusterParameterEstimator> produce(const TkPixelCPERecord &);
+ private:
+  std::shared_ptr<PixelClusterParameterEstimator> cpe_;  // last object built by produce()
+  edm::ParameterSet pset_;                               // copy of the configuration, handed to PixelCPEFast
+  edm::ESInputTag magname_;                              // tag used to fetch the MagneticField
+  bool UseErrorsFromTemplates_;                          // when true, produce() also fetches the GenError object
+};
+
+
+#include <string>
+#include <memory>
+
+using namespace edm;
+
+
+
+
+PixelCPEFastESProducer::PixelCPEFastESProducer(const edm::ParameterSet & p) 
+{
+  std::string myname = p.getParameter<std::string>("ComponentName");
+  magname_ = p.existsAs<edm::ESInputTag>("MagneticFieldRecord")?
+    p.getParameter<edm::ESInputTag>("MagneticFieldRecord"):edm::ESInputTag("");
+  UseErrorsFromTemplates_    = p.getParameter<bool>("UseErrorsFromTemplates");
+
+  // keep the whole parameter set: produce() passes it on to the PixelCPEFast constructor
+  pset_ = p;
+  setWhatProduced(this,myname);
+
+
+}
+
+
+std::shared_ptr<PixelClusterParameterEstimator>
+PixelCPEFastESProducer::produce(const TkPixelCPERecord & iRecord){ 
+  // Builds a PixelCPEFast from the conditions reachable through iRecord, stores it in cpe_ and returns it.
+  ESHandle<MagneticField> magfield;
+  iRecord.getRecord<IdealMagneticFieldRecord>().get( magname_, magfield );
+
+  edm::ESHandle<TrackerGeometry> pDD;
+  iRecord.getRecord<TrackerDigiGeometryRecord>().get( pDD );
+
+  edm::ESHandle<TrackerTopology> hTT;
+  iRecord.getRecord<TrackerDigiGeometryRecord>().getRecord<TrackerTopologyRcd>().get(hTT);
+
+  // Lorentz angle for offsets
+  ESHandle<SiPixelLorentzAngle> lorentzAngle;
+  iRecord.getRecord<SiPixelLorentzAngleRcd>().get(lorentzAngle );
+
+  // add the new la width object
+  ESHandle<SiPixelLorentzAngle> lorentzAngleWidth;
+  const SiPixelLorentzAngle * lorentzAngleWidthProduct = nullptr;
+  iRecord.getRecord<SiPixelLorentzAngleRcd>().get("forWidth",lorentzAngleWidth );
+  lorentzAngleWidthProduct = lorentzAngleWidth.product();
+  // stays nullptr unless UseErrorsFromTemplates_ is set
+  const SiPixelGenErrorDBObject * genErrorDBObjectProduct = nullptr;
+
+  // Errors are taken only from the new GenError object
+  ESHandle<SiPixelGenErrorDBObject> genErrorDBObject;
+  if(UseErrorsFromTemplates_) {  // do only when generrors are needed
+    iRecord.getRecord<SiPixelGenErrorDBObjectRcd>().get(genErrorDBObject); 
+    genErrorDBObjectProduct = genErrorDBObject.product();
+    //} else {
+    //std::cout<<" pass an empty GenError pointer"<<std::endl;
+  }
+  cpe_  = std::make_shared<PixelCPEFast>(
+                         pset_,magfield.product(),*pDD.product(),
+			 *hTT.product(),lorentzAngle.product(),
+			 genErrorDBObjectProduct,lorentzAngleWidthProduct);
+
+  return cpe_;
+}
+
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+
+DEFINE_FWK_EVENTSETUP_MODULE(PixelCPEFastESProducer);
+
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
new file mode 100644
index 0000000000000..80be13dedd26b
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -0,0 +1,207 @@
+// C++ headers
+#include <algorithm>
+#include <numeric>
+
+// CUDA runtime
+#include <cuda_runtime.h>
+
+// CMSSW headers
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
+#include "PixelRecHits.h"
+#include "gpuPixelRecHits.h"
+
+namespace {
+  __global__  // fills hitsLayerStart[0..10]; threads whose global index is above 10 do nothing
+  void setHitsLayerStart(const uint32_t* hitsModuleStart, const uint32_t* layerStart, uint32_t* hitsLayerStart) {
+    auto i = blockIdx.x * blockDim.x + threadIdx.x;
+    // entries 0-9: hitsModuleStart at the module index given by layerStart[i]; entry 10: the value at MaxNumModules
+    if(i < 10) {
+      hitsLayerStart[i] = hitsModuleStart[layerStart[i]];
+    }
+    else if(i == 10) {
+      hitsLayerStart[i] = hitsModuleStart[gpuClustering::MaxNumModules];
+    }
+  }
+
+  template <typename T>  // returns the start of row `row` in a buffer whose rows are `pitch` bytes apart, typed as T*
+  T *slicePitch(void *ptr, size_t pitch, size_t row) {
+    return reinterpret_cast<T *>( reinterpret_cast<char *>(ptr) + pitch*row);  // byte arithmetic, hence the char* cast
+  }
+}
+
+namespace pixelgpudetails {
+  PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {
+    // Allocates every device and page-locked host buffer once; cudaStream is used only for the async memsets and uploads.
+    constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits();
+
+    cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
+    cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));
+
+    // Coalesce all 32bit and 16bit arrays to two big blobs
+    //
+    // This is just a toy. Please don't copy-paste the logic but
+    // create a proper abstraction (e.g. along FWCore/SOA, or
+    // FWCore/Utilities/interface/SoATuple.h
+    //
+    // Order such that the first ones are the ones transferred to CPU (makeHitsAsync copies back rows 0-4)
+    static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious
+    cudaCheck(cudaMallocPitch(&gpu_.owner_32bit_, &gpu_.owner_32bit_pitch_, MAX_HITS*sizeof(uint32_t), 9));
+    cudaCheck(cudaMemsetAsync(gpu_.owner_32bit_, 0x0, gpu_.owner_32bit_pitch_*9, cudaStream.id()));
+    //edm::LogPrint("Foo") << "Allocate 32bit with pitch " << gpu_.owner_32bit_pitch_;
+    gpu_.charge_d = slicePitch<int32_t>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 0);
+    gpu_.xl_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 1);
+    gpu_.yl_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 2);
+    gpu_.xerr_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 3);
+    gpu_.yerr_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 4);
+    gpu_.xg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 5);
+    gpu_.yg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 6);
+    gpu_.zg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 7);
+    gpu_.rg_d = slicePitch<float>(gpu_.owner_32bit_, gpu_.owner_32bit_pitch_, 8);
+
+    // Order such that the first ones are the ones transferred to CPU (makeHitsAsync copies back rows 0-2)
+    cudaCheck(cudaMallocPitch(&gpu_.owner_16bit_, &gpu_.owner_16bit_pitch_, MAX_HITS*sizeof(uint16_t), 5));
+    cudaCheck(cudaMemsetAsync(gpu_.owner_16bit_, 0x0, gpu_.owner_16bit_pitch_*5, cudaStream.id()));
+    //edm::LogPrint("Foo") << "Allocate 16bit with pitch " << gpu_.owner_16bit_pitch_;
+    gpu_.detInd_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 0);
+    gpu_.mr_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 1);
+    gpu_.mc_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 2);
+    gpu_.iphi_d = slicePitch<int16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3);
+    gpu_.sortIndex_d = slicePitch<uint16_t>(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4);
+
+    cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist)));
+    cudaCheck(cudaMalloc((void **) & gpu_.hws_d, HitsOnGPU::Hist::wsSize()));
+    cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU)));
+
+    // Feels a bit dumb but constexpr arrays are not supported for device code
+    // TODO: should be moved to EventSetup (or better ideas?)
+    // Would it be better to use "constant memory"?
+    cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayerStart_, 11 * sizeof(uint32_t)));
+    cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayerStart_, phase1PixelTopology::layerStart, 11 * sizeof(uint32_t), cudaMemcpyDefault, cudaStream.id()));
+    cudaCheck(cudaMalloc((void **) & d_phase1TopologyLayer_, phase1PixelTopology::layer.size() * sizeof(uint8_t)));
+    cudaCheck(cudaMemcpyAsync(d_phase1TopologyLayer_, phase1PixelTopology::layer.data(), phase1PixelTopology::layer.size() * sizeof(uint8_t), cudaMemcpyDefault, cudaStream.id()));
+
+    gpu_.phase1TopologyLayerStart_d = d_phase1TopologyLayerStart_;
+    gpu_.phase1TopologyLayer_d = d_phase1TopologyLayer_;
+    // store the address of the device copy inside the structure itself, then upload the host-side gpu_ to it
+    gpu_.me_d = gpu_d;
+    cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id()));
+    // page-locked host buffers; makeHitsAsync copies the results into them when transferToCPU is set
+    cudaCheck(cudaMallocHost(&h_hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)));
+
+    // On CPU we can safely use MAX_HITS*sizeof as the pitch. Thanks
+    // to '*256' it is even aligned by cache line
+    h_owner_32bit_pitch_ = MAX_HITS*sizeof(uint32_t); 
+    cudaCheck(cudaMallocHost(&h_owner_32bit_, h_owner_32bit_pitch_ * 5));
+    h_charge_ = slicePitch<int32_t>(h_owner_32bit_, h_owner_32bit_pitch_, 0);
+    h_xl_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 1);
+    h_yl_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 2);
+    h_xe_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 3);
+    h_ye_ = slicePitch<float>(h_owner_32bit_, h_owner_32bit_pitch_, 4);
+
+    h_owner_16bit_pitch_ = MAX_HITS*sizeof(uint16_t);
+    cudaCheck(cudaMallocHost(&h_owner_16bit_, h_owner_16bit_pitch_ * 3));
+    h_detInd_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 0);
+    h_mr_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 1);
+    h_mc_ = slicePitch<uint16_t>(h_owner_16bit_, h_owner_16bit_pitch_, 2);
+
+#ifdef GPU_DEBUG
+    cudaCheck(cudaMallocHost(&h_hitsLayerStart_, 11 * sizeof(uint32_t)));
+#endif
+  }
+  PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {  // frees what the constructor allocated
+    cudaCheck(cudaFree(gpu_.bs_d));
+    cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
+    cudaCheck(cudaFree(gpu_.owner_32bit_));  // also backs charge_d, xl_d, ..., rg_d, which are slices of it
+    cudaCheck(cudaFree(gpu_.owner_16bit_));  // also backs detInd_d, mr_d, mc_d, iphi_d, sortIndex_d
+    cudaCheck(cudaFree(gpu_.hist_d));
+    cudaCheck(cudaFree(gpu_.hws_d));
+    cudaCheck(cudaFree(gpu_d));
+    cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
+    cudaCheck(cudaFree(d_phase1TopologyLayer_));
+    // host buffers obtained with cudaMallocHost; the h_* slice pointers need no separate release
+    cudaCheck(cudaFreeHost(h_hitsModuleStart_));
+    cudaCheck(cudaFreeHost(h_owner_32bit_));
+    cudaCheck(cudaFreeHost(h_owner_16bit_));
+#ifdef GPU_DEBUG
+    cudaCheck(cudaFreeHost(h_hitsLayerStart_));
+#endif
+  }
+
+  void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,               // device digis read by the getHits kernel
+                                           SiPixelClustersCUDA const& clusters_d,         // device clusters; supplies clusModuleStart() and nClusters()
+                                           float const * bs,                              // 3 floats, copied into gpu_.bs_d
+                                           pixelCPEforGPU::ParamsOnGPU const * cpeParams, // passed to the kernel and published in gpu_.cpeParams
+                                           bool transferToCPU,                            // when true, also queue the device-to-host copies
+                                           cuda::stream_t<>& stream) {                    // all copies and kernels below are queued on this stream
+    // Queues the rechit work for one event: upload inputs, run getHits, set the per-layer offsets, optionally copy back.
+    cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
+    gpu_.hitsModuleStart_d = clusters_d.clusModuleStart();
+    gpu_.cpeParams = cpeParams; // copy it for use in clients
+    cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
+
+    int threadsPerBlock = 256;
+    int blocks = digis_d.nModules(); // active modules (with digis)
+
+#ifdef GPU_DEBUG
+    std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl;
+#endif
+    // A grid of zero blocks is an invalid launch configuration, so skip the kernel when no module has digis.
+    if (blocks > 0) {
+      gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
+        cpeParams, gpu_.bs_d,
+        digis_d.moduleInd(),
+        digis_d.xx(), digis_d.yy(), digis_d.adc(),
+        clusters_d.moduleStart(),
+        clusters_d.clusInModule(), clusters_d.moduleId(),
+        digis_d.clus(), digis_d.nDigis(),
+        gpu_.hitsModuleStart_d,
+        gpu_.charge_d, gpu_.detInd_d,
+        gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d, gpu_.iphi_d,
+        gpu_.xl_d, gpu_.yl_d,
+        gpu_.xerr_d, gpu_.yerr_d,
+        gpu_.mr_d, gpu_.mc_d
+      );
+      cudaCheck(cudaGetLastError());
+    }
+
+    // assuming full warp of threads is better than a smaller number...
+    setHitsLayerStart<<<1, 32, 0, stream.id()>>>(gpu_.hitsModuleStart_d, d_phase1TopologyLayerStart_, gpu_.hitsLayerStart_d);
+    cudaCheck(cudaGetLastError());
+
+    // needed only if hits on CPU are required...
+    nhits_ = clusters_d.nClusters();
+    if(transferToCPU) {
+      cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+#ifdef GPU_DEBUG
+      cudaCheck(cudaMemcpyAsync(h_hitsLayerStart_, gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+#endif
+      // only the leading rows of each blob go to the host: 3 of the 16bit rows and 5 of the 32bit rows
+      cudaCheck(cudaMemcpy2DAsync(h_owner_16bit_, h_owner_16bit_pitch_,
+                                  gpu_.owner_16bit_, gpu_.owner_16bit_pitch_,
+                                  nhits_*sizeof(uint16_t), 3,
+                                  cudaMemcpyDefault, stream.id()));
+
+      cudaCheck(cudaMemcpy2DAsync(h_owner_32bit_, h_owner_32bit_pitch_,
+                                  gpu_.owner_32bit_, gpu_.owner_32bit_pitch_,
+                                  nhits_*sizeof(uint32_t), 5,
+                                  cudaMemcpyDefault, stream.id()));
+
+#ifdef GPU_DEBUG
+      cudaCheck(cudaStreamSynchronize(stream.id()));  // checked: this is where asynchronous kernel/copy failures surface
+
+      std::cout << "hit layerStart ";
+      for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << h_hitsLayerStart_[i] << ' ';
+      std::cout << "end:" << h_hitsLayerStart_[10] << std::endl;
+#endif
+
+      // for timing test
+      // cudaStreamSynchronize(stream.id());
+      // auto nhits_ = h_hitsLayerStart_[10];
+      // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d);
+    }
+
+    cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id());
+  }
+}
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
new file mode 100644
index 0000000000000..49164d24ab335
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -0,0 +1,75 @@
+#ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
+#define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
+
+#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
+
+#include <cuda/api_wrappers.h>
+
+#include <cstdint>
+#include <vector>
+
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+
+namespace pixelCPEforGPU {
+  struct ParamsOnGPU;
+}
+
+namespace pixelgpudetails {
+  using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
+
+  using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
+
+  class PixelRecHitGPUKernel {  // holds the device and host buffers used to build pixel rechits on the GPU
+  public:
+    PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream);
+    ~PixelRecHitGPUKernel();
+    // neither copyable nor movable: all four operations are deleted
+    PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete;
+    PixelRecHitGPUKernel(PixelRecHitGPUKernel&&) = delete;
+    PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete;
+    PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete;
+    // queues the rechit production on `stream`; see the definition for the individual steps
+    void makeHitsAsync(SiPixelDigisCUDA const& digis_d,
+                       SiPixelClustersCUDA const& clusters_d,
+                       float const * bs,
+                       pixelCPEforGPU::ParamsOnGPU const * cpeParams,
+                       bool transferToCPU,
+                       cuda::stream_t<>& stream);
+    // bundles the host buffer pointers, gpu_d and nhits_; no copy or synchronisation happens here
+    HitsOnCPU getOutput() const {
+      return HitsOnCPU{
+        h_hitsModuleStart_, h_detInd_, h_charge_,
+        h_xl_, h_yl_, h_xe_, h_ye_, h_mr_, h_mc_,
+        gpu_d, nhits_
+      };
+    }
+
+  private:
+    HitsOnGPU * gpu_d = nullptr;  // copy of the structure on the gpu itself: this is the "Product"; null until allocated, like the other pointers
+    HitsOnGPU gpu_;
+    uint32_t nhits_ = 0;
+    uint32_t *d_phase1TopologyLayerStart_ = nullptr;
+    uint8_t *d_phase1TopologyLayer_ = nullptr;
+    uint32_t *h_hitsModuleStart_ = nullptr;
+    uint16_t *h_detInd_ = nullptr;
+    int32_t *h_charge_ = nullptr;
+    float *h_xl_ = nullptr;
+    float *h_yl_ = nullptr;
+    float *h_xe_ = nullptr;
+    float *h_ye_ = nullptr;
+    uint16_t *h_mr_ = nullptr;
+    uint16_t *h_mc_ = nullptr;
+    void *h_owner_32bit_ = nullptr;
+    size_t h_owner_32bit_pitch_ = 0;
+    void *h_owner_16bit_ = nullptr;
+    size_t h_owner_16bit_pitch_ = 0;
+#ifdef GPU_DEBUG
+    uint32_t *h_hitsLayerStart_ = nullptr;
+#endif
+  };
+}
+
+#endif // RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
index 42efbd12c2e2d..728c1e683e136 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitConverter.cc
@@ -9,68 +9,18 @@
  * ------------------------------------------------------
  */
 
-//---------------------------------------------------------------------------
-//! \class SiPixelRecHitConverter
-//!
-//! \brief EDProducer to covert SiPixelClusters into SiPixelRecHits
-//!
-//! SiPixelRecHitConverter is an EDProducer subclass (i.e., a module)
-//! which orchestrates the conversion of SiPixelClusters into SiPixelRecHits.
-//! Consequently, the input is a edm::DetSetVector<SiPixelCluster> and the output is
-//! SiPixelRecHitCollection.
-//!
-//! SiPixelRecHitConverter invokes one of descendents from
-//! ClusterParameterEstimator (templated on SiPixelCluster), e.g.
-//! CPEFromDetPosition (which is the only available option
-//! right now).  SiPixelRecHitConverter loads the SiPixelClusterCollection,
-//! and then iterates over DetIds, invoking the chosen CPE's methods
-//! localPosition() and localError() to perform the correction (some of which
-//! may be rather involved).  A RecHit is made on the spot, and appended
-//! to the output collection.
-//!
-//! The calibrations are not loaded at the moment,
-//! although that is being planned for the near future.
-//!
-//! \author Porting from ORCA by Petar Maksimovic (JHU). Implementation of the
-//!         DetSetVector by V.Chiochia (Zurich University).
-//!
-//! \version v2, May 30, 2006
-//! change to use Lorentz angle from DB Lotte Wilke, Jan. 31st, 2008
-//!
-//---------------------------------------------------------------------------
-
-//--- Base class for CPEs:
-
-#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h"
-
-//--- Geometry + DataFormats
-#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
-#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
-#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
-#include "DataFormats/Common/interface/DetSetVector.h"
-
-//--- Framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-#include "DataFormats/Common/interface/Handle.h"
-#include "FWCore/Framework/interface/ESHandle.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Utilities/interface/InputTag.h"
-#include "FWCore/Utilities/interface/EDPutToken.h"
-#include "FWCore/Utilities/interface/ESGetToken.h"
-
+// Our own stuff
+#include "RecoLocalTracker/SiPixelRecHits/interface/SiPixelRecHitConverter.h"
 // Geometry
 #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
-#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h"
 
 // Data Formats
 #include "DataFormats/DetId/interface/DetId.h"
 #include "DataFormats/Common/interface/Ref.h"
 #include "DataFormats/Common/interface/DetSet2RangeMap.h"
 
+
 // STL
 #include <vector>
 #include <memory>
@@ -84,85 +34,57 @@
 
 using namespace std;
 
-namespace cms {
-
-  class SiPixelRecHitConverter : public edm::stream::EDProducer<> {
-  public:
-    //--- Constructor, virtual destructor (just in case)
-    explicit SiPixelRecHitConverter(const edm::ParameterSet& conf);
-    ~SiPixelRecHitConverter() override;
-
-    //--- Factory method to make CPE's depending on the ParameterSet
-    //--- Not sure if we need to make more than one CPE to run concurrently
-    //--- on different parts of the detector (e.g., one for the barrel and the
-    //--- one for the forward).  The way the CPE's are written now, it's
-    //--- likely we can use one (and they will switch internally), or
-    //--- make two of the same but configure them differently.  We need a more
-    //--- realistic use case...
-
-    //--- The top-level event method.
-    void produce(edm::Event& e, const edm::EventSetup& c) override;
-
-    //--- Execute the position estimator algorithm(s).
-    //--- New interface with DetSetVector
-    void run(const edmNew::DetSetVector<SiPixelCluster>& input,
-             SiPixelRecHitCollectionNew& output,
-             TrackerGeometry const& geom);
-
-    void run(edm::Handle<edmNew::DetSetVector<SiPixelCluster>> inputhandle,
-             SiPixelRecHitCollectionNew& output,
-             TrackerGeometry const& geom);
-
-  private:
-    // TO DO: maybe allow a map of pointers?
-    /// const PixelClusterParameterEstimator * cpe_;  // what we got (for now, one ptr to base class)
-    PixelCPEBase const* cpe_ = nullptr;  // What we got (for now, one ptr to base class)
-    edm::InputTag const src_;
-    edm::EDGetTokenT<edmNew::DetSetVector<SiPixelCluster>> const tPixelCluster_;
-    edm::EDPutTokenT<SiPixelRecHitCollection> const tPut_;
-    edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> const tTrackerGeom_;
-    edm::ESGetToken<PixelClusterParameterEstimator, TkPixelCPERecord> const tCPE_;
-    bool m_newCont;  // save also in emdNew::DetSetVector
-  };
-
+namespace cms
+{
   //---------------------------------------------------------------------------
   //!  Constructor: set the ParameterSet and defer all thinking to setupCPE().
   //---------------------------------------------------------------------------
-  SiPixelRecHitConverter::SiPixelRecHitConverter(edm::ParameterSet const& conf)
-      : src_(conf.getParameter<edm::InputTag>("src")),
-        tPixelCluster_(consumes<edmNew::DetSetVector<SiPixelCluster>>(src_)),
-        tPut_(produces<SiPixelRecHitCollection>()),
-        tTrackerGeom_(esConsumes<TrackerGeometry, TrackerDigiGeometryRecord>()),
-        tCPE_(esConsumes<PixelClusterParameterEstimator, TkPixelCPERecord>(
-            edm::ESInputTag("", conf.getParameter<std::string>("CPE")))) {}
-
+  SiPixelRecHitConverter::SiPixelRecHitConverter(edm::ParameterSet const& conf) 
+    : 
+    conf_(conf),
+    src_( conf.getParameter<edm::InputTag>( "src" ) ),
+    tPixelCluster(consumes< edmNew::DetSetVector<SiPixelCluster> >( src_)) {
+    //--- Declare to the EDM that this module produces a SiPixelRecHitCollection.
+    produces<SiPixelRecHitCollection>();
+    // The full configuration is kept in conf_: produce() reads the "CPE" label from it.
+  }
+  
   // Destructor
-  SiPixelRecHitConverter::~SiPixelRecHitConverter() {}
-
+  SiPixelRecHitConverter::~SiPixelRecHitConverter() 
+  { // intentionally empty: the destructor body performs no explicit cleanup
+  }  
+  
   //---------------------------------------------------------------------------
   //! The "Event" entrypoint: gets called by framework for every event
   //---------------------------------------------------------------------------
-  void SiPixelRecHitConverter::produce(edm::Event& e, const edm::EventSetup& es) {
-    // Step A.1: get input data
-    edm::Handle<edmNew::DetSetVector<SiPixelCluster>> input;
-    e.getByToken(tPixelCluster_, input);
+  void SiPixelRecHitConverter::produce(edm::Event& e, const edm::EventSetup& es)
+  {
 
+    // Step A.1: get input data
+    edm::Handle< edmNew::DetSetVector<SiPixelCluster> > input;
+    e.getByToken( tPixelCluster, input);
+    
     // Step A.2: get event setup
-    auto const& geom = es.getData(tTrackerGeom_);
+    edm::ESHandle<TrackerGeometry> geom;
+    es.get<TrackerDigiGeometryRecord>().get( geom );
 
     // Step B: create empty output collection
-    SiPixelRecHitCollectionNew output;
-
+    auto output = std::make_unique<SiPixelRecHitCollectionNew>();
+    
     // Step B*: create CPE
-    cpe_ = dynamic_cast<const PixelCPEBase*>(&es.getData(tCPE_));
-
+    edm::ESHandle<PixelClusterParameterEstimator> hCPE;
+    std::string cpeName_ = conf_.getParameter<std::string>("CPE");
+    es.get<TkPixelCPERecord>().get(cpeName_,hCPE);
+    cpe_ = dynamic_cast< const PixelCPEBase* >(&(*hCPE));
+    
     // Step C: Iterate over DetIds and invoke the strip CPE algorithm
     // on each DetUnit
 
-    run(input, output, geom);
+    run( input, *output, geom );
+
+    output->shrink_to_fit();
+    e.put(std::move(output));
 
-    output.shrink_to_fit();
-    e.emplace(tPut_, std::move(output));
   }
 
   //---------------------------------------------------------------------------
@@ -170,71 +92,69 @@ namespace cms {
   //!  and make a RecHit to store the result.
   //!  New interface reading DetSetVector by V.Chiochia (May 30th, 2006)
   //---------------------------------------------------------------------------
-  void SiPixelRecHitConverter::run(edm::Handle<edmNew::DetSetVector<SiPixelCluster>> inputhandle,
-                                   SiPixelRecHitCollectionNew& output,
-                                   TrackerGeometry const& geom) {
-    if (!cpe_) {
-      edm::LogError("SiPixelRecHitConverter") << " at least one CPE is not ready -- can't run!";
-      // TO DO: throw an exception here?  The user may want to know...
-      assert(0);
-      return;  // clusterizer is invalid, bail out
-    }
-
+  void SiPixelRecHitConverter::run(edm::Handle<edmNew::DetSetVector<SiPixelCluster> >  inputhandle,
+				   SiPixelRecHitCollectionNew &output,
+				   edm::ESHandle<TrackerGeometry> & geom) {
+    if ( ! cpe_ ) 
+      {
+	edm::LogError("SiPixelRecHitConverter") << " at least one CPE is not ready -- can't run!";
+	// TO DO: throw an exception here?  The user may want to know...
+	assert(0);
+	return;   // clusterizer is invalid, bail out
+      }
+    
     int numberOfDetUnits = 0;
     int numberOfClusters = 0;
-
+    
     const edmNew::DetSetVector<SiPixelCluster>& input = *inputhandle;
-
-    edmNew::DetSetVector<SiPixelCluster>::const_iterator DSViter = input.begin();
-
-    for (; DSViter != input.end(); DSViter++) {
+    
+    edmNew::DetSetVector<SiPixelCluster>::const_iterator DSViter=input.begin();
+    
+    for ( ; DSViter != input.end() ; DSViter++) {
       numberOfDetUnits++;
       unsigned int detid = DSViter->detId();
-      DetId detIdObject(detid);
-      const GeomDetUnit* genericDet = geom.idToDetUnit(detIdObject);
-      const PixelGeomDetUnit* pixDet = dynamic_cast<const PixelGeomDetUnit*>(genericDet);
-      assert(pixDet);
-      SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output, detid);
-
+      DetId detIdObject( detid );  
+      const GeomDetUnit * genericDet = geom->idToDetUnit( detIdObject );
+      const PixelGeomDetUnit * pixDet = dynamic_cast<const PixelGeomDetUnit*>(genericDet);
+      assert(pixDet); 
+      SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output,detid);
+      
       edmNew::DetSet<SiPixelCluster>::const_iterator clustIt = DSViter->begin(), clustEnd = DSViter->end();
-
-      for (; clustIt != clustEnd; clustIt++) {
-        numberOfClusters++;
-        std::tuple<LocalPoint, LocalError, SiPixelRecHitQuality::QualWordType> tuple =
-            cpe_->getParameters(*clustIt, *genericDet);
-        LocalPoint lp(std::get<0>(tuple));
-        LocalError le(std::get<1>(tuple));
-        SiPixelRecHitQuality::QualWordType rqw(std::get<2>(tuple));
-        // Create a persistent edm::Ref to the cluster
-        edm::Ref<edmNew::DetSetVector<SiPixelCluster>, SiPixelCluster> cluster =
-            edmNew::makeRefTo(inputhandle, clustIt);
-        // Make a RecHit and add it to the DetSet
-        // old : recHitsOnDetUnit.push_back( new SiPixelRecHit( lp, le, detIdObject, &*clustIt) );
-        SiPixelRecHit hit(lp, le, rqw, *genericDet, cluster);
-        //
-        // Now save it =================
-        recHitsOnDetUnit.push_back(hit);
-        // =============================
-
-        // std::cout << "SiPixelRecHitConverterVI " << numberOfClusters << ' '<< lp << " " << le << std::endl;
-      }  //  <-- End loop on Clusters
+      
+      for ( ; clustIt != clustEnd; clustIt++) {
+	numberOfClusters++;
+	std::tuple<LocalPoint, LocalError,SiPixelRecHitQuality::QualWordType> tuple = cpe_->getParameters( *clustIt, *genericDet );
+	LocalPoint lp( std::get<0>(tuple) );
+	LocalError le( std::get<1>(tuple) );
+        SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) );
+	// Create a persistent edm::Ref to the cluster
+	edm::Ref< edmNew::DetSetVector<SiPixelCluster>, SiPixelCluster > cluster = edmNew::makeRefTo( inputhandle, clustIt);
+	// Make a RecHit and add it to the DetSet
+	// old : recHitsOnDetUnit.push_back( new SiPixelRecHit( lp, le, detIdObject, &*clustIt) );
+	SiPixelRecHit hit( lp, le, rqw, *genericDet, cluster);
+	// 
+	// Now save it =================
+	recHitsOnDetUnit.push_back(hit);
+	// =============================
+
+	// std::cout << "SiPixelRecHitConverterVI " << numberOfClusters << ' '<< lp << " " << le << std::endl;
+      } //  <-- End loop on Clusters
+	
 
       //  LogDebug("SiPixelRecHitConverter")
       //std::cout << "SiPixelRecHitConverterVI "
-      //	<< " Found " << recHitsOnDetUnit.size() << " RecHits on " << detid //;
-      //	<< std::endl;
-
-    }  //    <-- End loop on DetUnits
-
-    //    LogDebug ("SiPixelRecHitConverter")
+	//	<< " Found " << recHitsOnDetUnit.size() << " RecHits on " << detid //;
+	//	<< std::endl;
+      
+      
+    } //    <-- End loop on DetUnits
+    
+    //    LogDebug ("SiPixelRecHitConverter") 
     //  std::cout << "SiPixelRecHitConverterVI "
-    //  << cpeName_ << " converted " << numberOfClusters
-    //  << " SiPixelClusters into SiPixelRecHits, in "
-    //  << numberOfDetUnits << " DetUnits." //;
+    //  << cpeName_ << " converted " << numberOfClusters 
+    //  << " SiPixelClusters into SiPixelRecHits, in " 
+    //  << numberOfDetUnits << " DetUnits." //; 
     //  << std::endl;
+	
   }
 }  // end of namespace cms
-
-using cms::SiPixelRecHitConverter;
-
-DEFINE_FWK_MODULE(SiPixelRecHitConverter);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
new file mode 100644
index 0000000000000..6864a046bf1dc
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -0,0 +1,157 @@
+#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
+#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
+
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+
+#include "DataFormats/Math/interface/approx_atan2.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+namespace gpuPixelRecHits {
+
+  // getHits: builds the rec-hits of one module per block. Launch with one block per module (blockIdx.x
+  // selects the module through moduleId) and blockDim.x >= pixelCPEforGPU::MaxClusInModule (asserted below).
+  // Per-cluster bounds/charges are accumulated in shared memory; cluster ic of module me is written to slot hitsModuleStart[me]+ic.
+  __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__  cpeParams,
+                          float const * __restrict__  bs,                 // bs[0..2] are subtracted from the global x,y,z
+                          uint16_t const * __restrict__  id,              // per-digi module id, InvId for an invalid digi
+                          uint16_t const * __restrict__  x,               // per-digi row
+                          uint16_t const * __restrict__  y,               // per-digi column
+                          uint16_t const * __restrict__  adc,             // per-digi charge
+                          uint32_t const * __restrict__  digiModuleStart, // entry 1+blockIdx.x: first digi of this block's module
+                          uint32_t const * __restrict__  clusInModule,    // number of clusters, indexed by module id
+                          uint32_t const * __restrict__  moduleId,        // module id handled by each block
+                          int32_t  const * __restrict__  clus,            // per-digi cluster index inside its module
+                          int numElements,                                // number of digis
+                          uint32_t const * __restrict__  hitsModuleStart, // first output slot, indexed by module id
+                          int32_t * chargeh,
+                          uint16_t * detInd,
+                          float * xg, float * yg, float * zg, float * rg, int16_t * iph,
+                          float * xl, float * yl,
+                          float * xe, float * ye,
+                          uint16_t * mr, uint16_t * mc)
+  {
+
+    // to be moved in common namespace...
+    constexpr uint16_t InvId=9999; // must be > MaxNumModules
+    constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule;
+
+    using ClusParams = pixelCPEforGPU::ClusParams;
+
+
+    // as usual one block per module: the per-cluster accumulators are shared by the whole block
+    __shared__ ClusParams clusParams;
+
+    auto first = digiModuleStart[1 + blockIdx.x];
+    auto me = moduleId[blockIdx.x];
+    auto nclus = clusInModule[me];
+
+    if (0==nclus) return;   // nclus depends only on blockIdx.x, so the whole block leaves together
+
+#ifdef GPU_DEBUG
+    if (threadIdx.x==0) {
+      auto k=first;
+      while (id[k]==InvId) ++k;
+      assert(id[k]==me);
+    }
+#endif
+
+#ifdef GPU_DEBUG
+    if (me%100==1)
+      if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, hitsModuleStart[me]);
+#endif
+
+    assert(blockDim.x >= MaxClusInModule);
+
+    if (threadIdx.x==0 && nclus > MaxClusInModule) {
+      printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule);
+      // zero charge of the unprocessed clusters at their own output slots: do not bother to do it in parallel
+      for (auto d=MaxClusInModule; d<nclus; ++d) { auto o=hitsModuleStart[me]+d; chargeh[o]=0; detInd[o]=InvId;}
+    }
+    nclus = std::min(nclus, MaxClusInModule);
+
+    auto ic = threadIdx.x;
+
+    if (ic < nclus) {
+      clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
+      clusParams.maxRow[ic] = 0;
+      clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
+      clusParams.maxCol[ic] = 0;
+      clusParams.charge[ic] = 0;
+      clusParams.Q_f_X[ic] = 0;
+      clusParams.Q_l_X[ic] = 0;
+      clusParams.Q_f_Y[ic] = 0;
+      clusParams.Q_l_Y[ic] = 0;
+    }
+
+    first += threadIdx.x;
+
+    __syncthreads();
+
+    // one thread per "digi": first pass finds the row/column bounds of every cluster
+
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId) continue;     // not valid
+      if (id[i] != me) break;           // end of module
+      if (clus[i] >= nclus) continue;
+      atomicMin(&clusParams.minRow[clus[i]], x[i]);
+      atomicMax(&clusParams.maxRow[clus[i]], x[i]);
+      atomicMin(&clusParams.minCol[clus[i]], y[i]);
+      atomicMax(&clusParams.maxCol[clus[i]], y[i]);
+    }
+
+    __syncthreads();
+
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId) continue;     // not valid
+      if (id[i] != me) break;           // end of module
+      if (clus[i] >= nclus) continue;
+      atomicAdd(&clusParams.charge[clus[i]], adc[i]);
+      if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]);
+      if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]);
+      if (clusParams.minCol[clus[i]]==y[i]) atomicAdd(&clusParams.Q_f_Y[clus[i]], adc[i]);
+      if (clusParams.maxCol[clus[i]]==y[i]) atomicAdd(&clusParams.Q_l_Y[clus[i]], adc[i]);
+    }
+
+    __syncthreads();
+
+    // next one cluster per thread... (no barrier follows, so the extra threads may leave here)
+    if (ic >= nclus) return;
+
+    first = hitsModuleStart[me];
+    auto h = first+ic;  // output index in global memory
+
+    assert(h < 2000*256);
+
+    pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+    pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+
+    chargeh[h] = clusParams.charge[ic];
+
+    detInd[h] = me;
+
+    xl[h]= clusParams.xpos[ic];
+    yl[h]= clusParams.ypos[ic];
+
+    xe[h]= clusParams.xerr[ic]*clusParams.xerr[ic];
+    ye[h]= clusParams.yerr[ic]*clusParams.yerr[ic];
+    mr[h]= clusParams.minRow[ic];
+    mc[h]= clusParams.minCol[ic];
+
+    // to global and compute phi...
+    cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]);
+    // here correct for the beamspot...
+    xg[h]-=bs[0];
+    yg[h]-=bs[1];
+    zg[h]-=bs[2];
+
+    rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]);
+    iph[h] = unsafe_atan2s<7>(yg[h],xg[h]);
+
+  }
+
+}
+
+#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
diff --git a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
index e349a515c69b3..8e28bbb175181 100644
--- a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
+++ b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEESProducers_cff.py
@@ -3,15 +3,25 @@
 #
 # Load all Pixel Cluster Position Estimator ESProducers
 #
-# 1. Template algorithm
+#
+# 1. RecHits using angles from module position
+#
+from RecoLocalTracker.SiPixelRecHits.PixelCPEInitial_cfi import *
+#
+# 2. TrackingRechits using angles from tracks
+#
+from RecoLocalTracker.SiPixelRecHits.PixelCPEParmError_cfi import *
+#
+# 3. Template algorithm
 #
 from RecoLocalTracker.SiPixelRecHits.PixelCPETemplateReco_cfi import *
 #
-# 2. Pixel Generic CPE
+# 4. Pixel Generic CPE
 #
 from RecoLocalTracker.SiPixelRecHits.PixelCPEGeneric_cfi import *
+from RecoLocalTracker.SiPixelRecHits.PixelCPEFast_cfi import *
 #
-# 3. ESProducer for the Magnetic-field dependent template records
+# 5. ESProducer for the Magnetic-field dependent template records
 #
 from CalibTracker.SiPixelESProducers.SiPixelTemplateDBObjectESProducer_cfi import *
 from CalibTracker.SiPixelESProducers.SiPixel2DTemplateDBObjectESProducer_cfi import *
diff --git a/RecoLocalTracker/SiPixelRecHits/python/PixelCPEFast_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEFast_cfi.py
new file mode 100644
index 0000000000000..bda1fe45a3705
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/python/PixelCPEFast_cfi.py
@@ -0,0 +1,31 @@
+import FWCore.ParameterSet.Config as cms
+
+PixelCPEFastESProducer = cms.ESProducer("PixelCPEFastESProducer",
+    # Configuration of the ESProducer that provides the 'PixelCPEFast' cluster parameter estimator.
+    ComponentName = cms.string('PixelCPEFast'),
+    Alpha2Order = cms.bool(True),
+
+    # Edge cluster errors in microns (determined by looking at residual RMS) 
+    EdgeClusterErrorX = cms.double( 50.0 ),                                      
+    EdgeClusterErrorY = cms.double( 85.0 ),                                                     
+
+    # These two are for PixelCPEBase, the base class of PixelCPEFast
+    useLAWidthFromDB = cms.bool(True),
+    useLAAlignmentOffsets = cms.bool(False),
+
+
+    # Can use errors predicted by the template code.
+    # If UseErrorsFromTemplates is False, must also set
+    # TruncatePixelCharge and LoadTemplatesFromDB to be False.
+    UseErrorsFromTemplates = cms.bool(True),
+    LoadTemplatesFromDB = cms.bool(True),
+
+    # When set True this gives a slight improvement in resolution at no cost 
+    TruncatePixelCharge = cms.bool(True),
+
+    # (petar) for clusterProbability() from TTRHs
+    ClusterProbComputationFlag = cms.int32(0),
+
+    # MagneticFieldRecord: e.g. "" or "ParabolicMF"
+    MagneticFieldRecord = cms.ESInputTag(""),
+)
diff --git a/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc
new file mode 100644
index 0000000000000..eb51dd5a2eaeb
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/src/PixelCPEFast.cc
@@ -0,0 +1,468 @@
+#include <iostream>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "CondFormats/SiPixelTransient/interface/SiPixelTemplate.h"
+#include "DataFormats/DetId/interface/DetId.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h"
+#include "Geometry/TrackerGeometryBuilder/interface/RectangularPixelTopology.h"
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
+#include "HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "MagneticField/Engine/interface/MagneticField.h"
+
+#include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h"
+
+// Services
+// this is needed to get errors from templates
+
+namespace {
+   constexpr float micronsToCm = 1.0e-4;
+}
+
+//-----------------------------------------------------------------------------
+//!  The constructor.
+//-----------------------------------------------------------------------------
+PixelCPEFast::PixelCPEFast(edm::ParameterSet const & conf,
+                                 const MagneticField * mag,
+                                 const TrackerGeometry& geom,
+                                 const TrackerTopology& ttopo,
+                                 const SiPixelLorentzAngle * lorentzAngle,
+                                 const SiPixelGenErrorDBObject * genErrorDBObject,
+                                 const SiPixelLorentzAngle * lorentzAngleWidth) :
+  PixelCPEBase(conf, mag, geom, ttopo, lorentzAngle, genErrorDBObject, nullptr, lorentzAngleWidth, 0)
+{
+   EdgeClusterErrorX_ = conf.getParameter<double>("EdgeClusterErrorX");
+   EdgeClusterErrorY_ = conf.getParameter<double>("EdgeClusterErrorY");
+   
+   UseErrorsFromTemplates_    = conf.getParameter<bool>("UseErrorsFromTemplates");
+   TruncatePixelCharge_       = conf.getParameter<bool>("TruncatePixelCharge");
+   // NOTE(review): genErrorDBObject_ is dereferenced below without a null check — confirm it is always set when UseErrorsFromTemplates is true.
+   // Load the GenError calibration into thePixelGenError_; a failed pushfile() is reported as an exception.
+   if ( UseErrorsFromTemplates_ ) {
+     if ( !SiPixelGenError::pushfile( *genErrorDBObject_, thePixelGenError_) )
+            throw cms::Exception("InvalidCalibrationLoaded")
+            << "ERROR: GenErrors not filled correctly. Check the sqlite file. Using SiPixelTemplateDBObject version "
+            << ( *genErrorDBObject_ ).version();
+   }
+   
+   // Rechit errors in case other, more correct, errors are not available.
+   // These are constants. Maybe there is a more efficient way to store them.
+   xerr_barrel_l1_      = { 0.00115, 0.00120, 0.00088 };
+   xerr_barrel_l1_def_  = 0.01030;
+   yerr_barrel_l1_      = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
+   yerr_barrel_l1_def_  = 0.00210;
+   xerr_barrel_ln_      = { 0.00115, 0.00120, 0.00088};
+   xerr_barrel_ln_def_  = 0.01030;
+   yerr_barrel_ln_      = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
+   yerr_barrel_ln_def_  = 0.00210;
+   xerr_endcap_         = { 0.0020, 0.0020 };
+   xerr_endcap_def_     = 0.0020;
+   yerr_endcap_         = { 0.00210 };
+   yerr_endcap_def_     = 0.00075;
+   // Precompute the flattened parameter copies (m_commonParamsGPU / m_detParamsGPU) once, at construction.
+   fillParamsForGpu();   
+}
+
+const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cuda::stream_t<>& cudaStream) const {
+  const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cuda::stream_t<>& stream) {
+      // Allocate three device buffers: the common parameters, the per-detector array, and the ParamsOnGPU descriptor itself.
+      cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_commonParams, sizeof(pixelCPEforGPU::CommonParams)));
+      cudaCheck(cudaMalloc((void**) & data.h_paramsOnGPU.m_detParams, this->m_detParamsGPU.size()*sizeof(pixelCPEforGPU::DetParams)));
+      cudaCheck(cudaMalloc((void**) & data.d_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU)));
+      // Enqueue on `stream`: the host descriptor (now holding the two device addresses set above), then the two payloads.
+      cudaCheck(cudaMemcpyAsync(data.d_paramsOnGPU, &data.h_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream.id()));
+      cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_commonParams, &this->m_commonParamsGPU, sizeof(pixelCPEforGPU::CommonParams), cudaMemcpyDefault, stream.id()));
+      cudaCheck(cudaMemcpyAsync(data.h_paramsOnGPU.m_detParams, this->m_detParamsGPU.data(), this->m_detParamsGPU.size()*sizeof(pixelCPEforGPU::DetParams), cudaMemcpyDefault, stream.id()));
+    });
+  return data.d_paramsOnGPU;
+}
+
+void PixelCPEFast::fillParamsForGpu() {
+  m_commonParamsGPU.theThicknessB = m_DetParams.front().theThickness;
+  m_commonParamsGPU.theThicknessE = m_DetParams.back().theThickness;
+  m_commonParamsGPU.thePitchX = m_DetParams[0].thePitchX;
+  m_commonParamsGPU.thePitchY = m_DetParams[0].thePitchY;
+
+  //uint32_t oldLayer = 0;
+  m_detParamsGPU.resize(m_DetParams.size());
+  for (auto i=0U; i<m_DetParams.size(); ++i) {
+    auto & p=m_DetParams[i];
+    auto & g=m_detParamsGPU[i];
+
+    assert(p.theDet->index()==int(i));
+    assert(m_commonParamsGPU.thePitchY==p.thePitchY);    
+    assert(m_commonParamsGPU.thePitchX==p.thePitchX);
+    //assert(m_commonParamsGPU.theThickness==p.theThickness);
+
+    g.isBarrel = GeomDetEnumerators::isBarrel(p.thePart);
+    g.isPosZ = p.theDet->surface().position().z()>0;
+    g.layer = ttopo_.layer(p.theDet->geographicalId());
+    g.index=i; // better be!
+    g.rawId = p.theDet->geographicalId();
+   
+    assert( (g.isBarrel ?m_commonParamsGPU.theThicknessB : m_commonParamsGPU.theThicknessE) ==p.theThickness );
+
+    //if (m_commonParamsGPU.theThickness!=p.theThickness)   
+    //  std::cout << i << (g.isBarrel ? "B " : "E ") << m_commonParamsGPU.theThickness<<"!="<<p.theThickness << std::endl;
+
+    //if (oldLayer != g.layer) {
+    //  oldLayer = g.layer;
+    //  std::cout << "new layer at " << i << (g.isBarrel ? " B  " :  (g.isPosZ ? " E+ " : " E- ")) << g.layer << " starting at " << g.rawId << std::endl;
+    //}
+
+    g.shiftX = 0.5f*p.lorentzShiftInCmX;
+    g.shiftY = 0.5f*p.lorentzShiftInCmY;
+    g.chargeWidthX = p.lorentzShiftInCmX * p.widthLAFractionX;
+    g.chargeWidthY = p.lorentzShiftInCmY * p.widthLAFractionY;
+
+    g.x0 = p.theOrigin.x();
+    g.y0 = p.theOrigin.y();
+    g.z0 = p.theOrigin.z();
+
+    auto vv = p.theDet->surface().position();
+    auto rr = pixelCPEforGPU::Rotation(p.theDet->surface().rotation());
+    g.frame = pixelCPEforGPU::Frame(vv.x(),vv.y(),vv.z(),rr);
+
+
+    // errors .....
+   ClusterParamGeneric cp;
+   auto gvx = p.theOrigin.x() + 40.f*m_commonParamsGPU.thePitchX;
+   auto gvy = p.theOrigin.y();
+   auto gvz = 1.f/p.theOrigin.z();
+   //--- Note that the normalization is not required as only the ratio used
+
+   // calculate angles
+   cp.cotalpha = gvx*gvz;
+   cp.cotbeta  = gvy*gvz;
+
+   cp.with_track_angle = false;
+
+   auto lape = p.theDet->localAlignmentError();
+   if ( lape.invalid() ) lape = LocalError(); // zero....
+
+#ifdef DUMP_ERRORS   
+   auto m=10000.f;
+   for (float qclus = 15000; qclus<35000; qclus+=15000){
+     errorFromTemplates(p,cp,qclus);
+
+     std::cout << i << ' ' << qclus << ' ' << cp.pixmx
+               << ' ' << m*cp.sigmax << ' ' << m*cp.sx1 << ' ' << m*cp.sx2 
+               << ' ' << m*cp.sigmay << ' ' << m*cp.sy1 << ' ' << m*cp.sy2
+              << std::endl;
+   }
+   std::cout << i << ' ' << m*std::sqrt(lape.xx())  <<' '<< m*std::sqrt(lape.yy()) << std::endl;
+#endif   
+
+   
+   errorFromTemplates(p,cp,20000.f);
+   g.sx[0] = cp.sigmax;
+   g.sx[1] = cp.sx1;
+   g.sx[2] = cp.sx2;
+
+   g.sy[0] = cp.sigmay;
+   g.sy[1] = cp.sy1;
+   g.sy[2] = cp.sy2;
+   
+   
+   /*
+    // from run1??
+    if (i<96) {
+      g.sx[0] = 0.00120;
+      g.sx[1] = 0.00115;
+      g.sx[2] = 0.0050;
+
+      g.sy[0] = 0.00210;
+      g.sy[1] = 0.00375;
+      g.sy[2] = 0.0085;
+    } else if (g.isBarrel) {
+      g.sx[0] = 0.00120;
+      g.sx[1] = 0.00115;
+      g.sx[2] = 0.0050;
+
+      g.sy[0] = 0.00210;
+      g.sy[1] = 0.00375;
+      g.sy[2] = 0.0085;
+   } else {
+      g.sx[0] = 0.0020;
+      g.sx[1] = 0.0020;
+      g.sx[2] = 0.0050;
+
+      g.sy[0] = 0.0021;
+      g.sy[1] = 0.0021;
+      g.sy[2] = 0.0085;
+   }
+   */
+   
+
+   for (int i=0; i<3; ++i) {
+     g.sx[i] = std::sqrt(g.sx[i]*g.sx[i]+lape.xx());
+     g.sy[i] = std::sqrt(g.sy[i]*g.sy[i]+lape.yy());
+   }
+
+ }
+}
+
+PixelCPEFast::~PixelCPEFast() = default;  // no explicit cleanup in this destructor
+
+PixelCPEFast::GPUData::~GPUData() {
+  // Free the three device buffers only when the device-side descriptor pointer has been set.
+  if (nullptr == d_paramsOnGPU) return;
+  cudaFree(h_paramsOnGPU.m_commonParams);
+  cudaFree(h_paramsOnGPU.m_detParams);
+  cudaFree(d_paramsOnGPU);
+}
+
+PixelCPEBase::ClusterParam* PixelCPEFast::createClusterParam(const SiPixelCluster & cl) const {
+   // Hand back a newly allocated ClusterParamGeneric constructed from the given cluster.
+   PixelCPEBase::ClusterParam* param = new ClusterParamGeneric(cl);
+   return param;
+}
+
+
+void
+PixelCPEFast::errorFromTemplates(DetParam const & theDetParam, ClusterParamGeneric & theClusterParam, float qclus) const
+{
+   // Queries SiPixelGenError::qbin() for a cluster of charge qclus on this detector and stores the
+   // resulting errors, scaled by micronsToCm, in theClusterParam (together with pixmx and qBin_).
+   float fieldZ = theDetParam.bz;
+   float fieldX = theDetParam.bx;
+
+   // Start values for the fields handed to qbin() below
+   theClusterParam.pixmx  = std::numeric_limits<int>::max();  // max pixel charge for truncation of 2-D cluster
+   theClusterParam.sigmax = -999.9; // CPE Generic x-error for multi-pixel cluster
+   theClusterParam.sx1    = -999.9; // CPE Generic x-error for single single-pixel cluster
+   theClusterParam.sx2    = -999.9; // CPE Generic x-error for single double-pixel cluster
+   theClusterParam.sigmay = -999.9; // CPE Generic y-error for multi-pixel cluster
+   theClusterParam.sy1    = -999.9; // CPE Generic y-error for single single-pixel
+   theClusterParam.sy2    = -999.9; // CPE Generic y-error for single double-pixel cluster
+
+   float unused;  // passed for every qbin() argument that this function does not keep
+   SiPixelGenError genError(thePixelGenError_);
+   int templateId = theDetParam.detTemplateId;
+
+   theClusterParam.qBin_ = genError.qbin( templateId, theClusterParam.cotalpha, theClusterParam.cotbeta, fieldZ, fieldX, qclus,
+                                          false,
+                                          theClusterParam.pixmx, theClusterParam.sigmay, unused,
+                                          theClusterParam.sigmax, unused, theClusterParam.sy1,
+                                          unused, theClusterParam.sy2, unused, theClusterParam.sx1,
+                                          unused, theClusterParam.sx2, unused );
+
+   // Apply the micronsToCm factor to all six errors
+   theClusterParam.sigmax *= micronsToCm;
+   theClusterParam.sx1    *= micronsToCm;
+   theClusterParam.sx2    *= micronsToCm;
+   theClusterParam.sigmay *= micronsToCm;
+   theClusterParam.sy1    *= micronsToCm;
+   theClusterParam.sy2    *= micronsToCm;
+}
+
+//-----------------------------------------------------------------------------
+//! Hit position in the local frame (in cm).  Unlike other CPE's, this
+//! one converts everything from the measurement frame (in channel numbers)
+//! into the local frame (in centimeters).
+//-----------------------------------------------------------------------------
+LocalPoint
+PixelCPEFast::localPosition(DetParam const & theDetParam, ClusterParam & theClusterParamBase) const
+{
+   // Host-side position estimate: the cluster is loaded into slot 0 of a
+   // pixelCPEforGPU::ClusParams and handed to pixelCPEforGPU::position().
+   auto & generic = static_cast<ClusterParamGeneric &>(theClusterParamBase);
+
+   assert(!generic.with_track_angle);
+
+   // Either refresh the GenError-based quantities for this cluster's charge,
+   // or just reset the charge bin.
+   if ( !UseErrorsFromTemplates_ )
+      generic.qBin_ = 0;
+   else
+      errorFromTemplates(theDetParam, generic, generic.theCluster->charge());
+
+   // Charges of the first/last pixel in X and Y; truncation needs both flags on.
+   int qFirstX;
+   int qLastX;
+   int qFirstY;
+   int qLastY;
+   const bool truncateCharge = UseErrorsFromTemplates_ && TruncatePixelCharge_;
+   collect_edge_charges(generic, qFirstX, qLastX, qFirstY, qLastY, truncateCharge);
+
+   // do GPU like ...
+   pixelCPEforGPU::ClusParams clusSlot;
+   auto const & cluster = *generic.theCluster;
+
+   clusSlot.minRow[0] = cluster.minPixelRow();
+   clusSlot.maxRow[0] = cluster.maxPixelRow();
+   clusSlot.minCol[0] = cluster.minPixelCol();
+   clusSlot.maxCol[0] = cluster.maxPixelCol();
+
+   clusSlot.Q_f_X[0] = qFirstX;
+   clusSlot.Q_l_X[0] = qLastX;
+   clusSlot.Q_f_Y[0] = qFirstY;
+   clusSlot.Q_l_Y[0] = qLastY;
+
+   // The flattened per-detector parameters are looked up by the detector index.
+   auto detIndex = theDetParam.theDet->index();
+   pixelCPEforGPU::position(m_commonParamsGPU, m_detParamsGPU[detIndex], clusSlot, 0);
+
+   //--- Now put the two together
+   return LocalPoint( clusSlot.xpos[0], clusSlot.ypos[0] );
+
+}
+
+//-----------------------------------------------------------------------------
+//!  Collect the edge charges in x and y, in a single pass over the pixel vector.
+//!  Calculate the charge in the first and last pixel projected in x and y;
+//!  the inner cluster charge is not computed here.
+//-----------------------------------------------------------------------------
+void
+PixelCPEFast::
+collect_edge_charges(ClusterParam & theClusterParamBase,  //!< input, the cluster
+                     int & Q_f_X,              //!< output, Q first  in X
+                     int & Q_l_X,              //!< output, Q last   in X
+                     int & Q_f_Y,              //!< output, Q first  in Y
+                     int & Q_l_Y,              //!< output, Q last   in Y
+                     bool truncate             //!< input, cap each pixel adc at pixmx
+)
+{
+   auto & clusterParam = static_cast<ClusterParamGeneric &>(theClusterParamBase);
+   auto const & cluster = *clusterParam.theCluster;
+
+   // All four sums start from zero.
+   Q_f_X = 0;
+   Q_l_X = 0;
+   Q_f_Y = 0;
+   Q_l_Y = 0;
+
+   // Cluster extent in index units: rows run along x, columns along y.
+   int const firstRow = cluster.minPixelRow();
+   int const lastRow  = cluster.maxPixelRow();
+   int const firstCol = cluster.minPixelCol();
+   int const lastCol  = cluster.maxPixelCol();
+
+   // One pass over the pixels; a pixel sitting on both the first and the
+   // last row (or column) contributes to both sums.
+   int const nPixels = cluster.size();
+   for (int ip = 0; ip != nPixels; ++ip) {
+      auto const & px = cluster.pixel(ip);
+      // Optional charge truncation (originally added by ggiurgiu@fnal.gov).
+      int charge = px.adc;
+      if ( truncate ) charge = std::min(charge, clusterParam.pixmx );
+
+      // X projection
+      Q_f_X += ( px.x == firstRow ) ? charge : 0;
+      Q_l_X += ( px.x == lastRow  ) ? charge : 0;
+      // Y projection
+      Q_f_Y += ( px.y == firstCol ) ? charge : 0;
+      Q_l_Y += ( px.y == lastCol  ) ? charge : 0;
+   }
+}
+
+
+//==============  INFLATED ERROR AND ERRORS FROM DB BELOW  ================
+
+//-------------------------------------------------------------------------
+//  Hit error in the local frame
+//-------------------------------------------------------------------------
+LocalError
+PixelCPEFast::localError(DetParam const & theDetParam,  ClusterParam & theClusterParamBase) const
+{
+   // Returns the local error of one cluster as LocalError( xerr^2, 0, yerr^2 ).
+   // Both uncertainties start from the edge-cluster defaults and are replaced
+   // only along a direction in which the cluster does not touch a module edge:
+   //  - UseErrorsFromTemplates_ set: with the values already stored in the
+   //    cluster parameters (sx1/sx2/sigmax and sy1/sy2/sigmay);
+   //  - otherwise, for tracker pixel detectors: with the simple per-size tables.
+   auto & clusterParam = static_cast<ClusterParamGeneric &>(theClusterParamBase);
+   auto const & cluster = *clusterParam.theCluster;
+
+   // Default errors are the maximum error used for edge clusters.
+   // These are determined by looking at residuals for edge clusters.
+   float xerr = EdgeClusterErrorX_ * micronsToCm;
+   float yerr = EdgeClusterErrorY_ * micronsToCm;
+
+   // Cluster extent in pixel index units.
+   int rowMin = cluster.minPixelRow();
+   int rowMax = cluster.maxPixelRow();
+   int colMin = cluster.minPixelCol();
+   int colMax = cluster.maxPixelCol();
+
+   // Find if cluster is at the module edge (bitwise '|': both ends are
+   // always tested, there is no short-circuit).
+   bool const atEdgeX = phase1PixelTopology::isEdgeX(rowMin) | phase1PixelTopology::isEdgeX(rowMax);
+   bool const atEdgeY = phase1PixelTopology::isEdgeY(colMin) | phase1PixelTopology::isEdgeY(colMax);
+
+   unsigned int const nRows = cluster.sizeX();
+   unsigned int const nCols = cluster.sizeY();
+
+   // Find if cluster contains double (big) pixels.
+   bool const hasBigX = theDetParam.theRecTopol->containsBigPixelInX( rowMin, rowMax );
+   bool const hasBigY = theDetParam.theRecTopol->containsBigPixelInY( colMin, colMax );
+
+   if ( UseErrorsFromTemplates_ ) {
+      //
+      // Template errors, only used for non-edge clusters. Clusters exactly one
+      // pixel wide take sx1/sy1, or sx2/sy2 when a big pixel is involved; all
+      // other clusters take sigmax/sigmay.
+      if ( !atEdgeX ) {
+         if ( nRows != 1 ) xerr = clusterParam.sigmax;
+         else              xerr = hasBigX ? clusterParam.sx2 : clusterParam.sx1;
+      }
+
+      if ( !atEdgeY ) {
+         if ( nCols != 1 ) yerr = clusterParam.sigmay;
+         else              yerr = hasBigY ? clusterParam.sy2 : clusterParam.sy1;
+      }
+   }
+   else if ( GeomDetEnumerators::isTrackerPixel(theDetParam.thePart) ) {
+      //
+      // Simple errors: each table is indexed by (cluster size - 1) and the
+      // matching *_def_ value is used once the size exceeds the table length.
+      // Edge clusters keep the defaults set at the top.
+      if ( !GeomDetEnumerators::isBarrel(theDetParam.thePart) ) {
+         //
+         // EndCap
+         if ( !atEdgeX ) {
+            xerr = ( nRows<=xerr_endcap_.size() ) ? xerr_endcap_[nRows-1]
+                                                  : xerr_endcap_def_;
+         }
+
+         if ( !atEdgeY ) {
+            yerr = ( nCols<=yerr_endcap_.size() ) ? yerr_endcap_[nCols-1]
+                                                  : yerr_endcap_def_;
+         }
+      }
+      else {
+         //
+         // Barrel: layer 1 has dedicated tables, all other layers share one set.
+         DetId id = (theDetParam.theDet->geographicalId());
+         int layer=ttopo_.layer(id);
+         if ( layer==1 ) {
+            if ( !atEdgeX ) {
+               xerr = ( nRows<=xerr_barrel_l1_.size() ) ? xerr_barrel_l1_[nRows-1]
+                                                        : xerr_barrel_l1_def_;
+            }
+
+            if ( !atEdgeY ) {
+               yerr = ( nCols<=yerr_barrel_l1_.size() ) ? yerr_barrel_l1_[nCols-1]
+                                                        : yerr_barrel_l1_def_;
+            }
+         }
+         else {
+            if ( !atEdgeX ) {
+               xerr = ( nRows<=xerr_barrel_ln_.size() ) ? xerr_barrel_ln_[nRows-1]
+                                                        : xerr_barrel_ln_def_;
+            }
+
+            if ( !atEdgeY ) {
+               yerr = ( nCols<=yerr_barrel_ln_.size() ) ? yerr_barrel_ln_[nCols-1]
+                                                        : yerr_barrel_ln_def_;
+            }
+         }
+      }
+   }
+
+   // Squared uncertainties go in the first and last argument, 0 in between.
+   return LocalError( xerr*xerr, 0, yerr*yerr );
+}
diff --git a/SLHCUpgradeSimulations/Geometry/test/phase2_digi_reco_pixelntuple_cfg.py b/SLHCUpgradeSimulations/Geometry/test/phase2_digi_reco_pixelntuple_cfg.py
index 713d424998b18..4ab47dd98ca29 100644
--- a/SLHCUpgradeSimulations/Geometry/test/phase2_digi_reco_pixelntuple_cfg.py
+++ b/SLHCUpgradeSimulations/Geometry/test/phase2_digi_reco_pixelntuple_cfg.py
@@ -2,12 +2,12 @@
 # using: 
 # Revision: 1.19 
 # Source: /local/reps/CMSSW/CMSSW/Configuration/Applications/python/ConfigBuilder.py,v 
-# with command line options: step2 --conditions auto:phase2_realistic -s DIGI:pdigi_valid,L1,L1TrackTrigger,DIGI2RAW,HLT:@fake2,RAW2DIGI,L1Reco,RECO --datatier GEN-SIM-RECO -n 10 --geometry Extended2023D41 --era Phase2 --eventcontent FEVTDEBUGHLT --filein file:SingleMuPt1000_pythia8_cfi_GEN_SIM.root --runUnscheduled --no_exec
+# with command line options: step2 --conditions auto:phase2_realistic -s DIGI:pdigi_valid,L1,L1TrackTrigger,DIGI2RAW,HLT:@fake2,RAW2DIGI,L1Reco,RECO --datatier GEN-SIM-RECO -n 10 --geometry Extended2023D21 --era Phase2 --eventcontent FEVTDEBUGHLT --filein file:SingleMuPt1000_pythia8_cfi_GEN_SIM.root --runUnscheduled --no_exec
 import FWCore.ParameterSet.Config as cms
 
+from Configuration.StandardSequences.Eras import eras
 
-from Configuration.Eras.Era_Phase2C9_cff import Phase2C9
-process = cms.Process('Phase2PixelNtuple',Phase2C9)
+process = cms.Process('Phase2PixelNtuple',eras.Phase2)
 
 # import of standard configurations
 process.load('Configuration.StandardSequences.Services_cff')
@@ -16,7 +16,7 @@
 process.load('Configuration.EventContent.EventContent_cff')
 process.load('SimGeneral.MixingModule.mixNoPU_cfi')
 #process.load('SimGeneral.MixingModule.mix_POISSON_average_cfi')
-process.load('Configuration.Geometry.GeometryExtended2026D49Reco_cff')
+process.load('Configuration.Geometry.GeometryExtended2023D21Reco_cff')
 process.load('Configuration.StandardSequences.MagneticField_cff')
 process.load('Configuration.StandardSequences.Digi_cff')
 process.load('Configuration.StandardSequences.SimL1Emulator_cff')
@@ -35,7 +35,7 @@
 
 process.source = cms.Source("PoolSource",
     fileNames = cms.untracked.vstring(
-		'/store/relval/CMSSW_11_2_0_pre1/RelValSingleMuPt10/GEN-SIM/110X_mcRun4_realistic_v3_2026D49noPU-v1/10000/743B02CC-F5B9-5642-A7EF-EE222E18C54F.root'
+       '/store/relval/CMSSW_10_0_0_pre1/RelValSingleMuPt10/GEN-SIM/94X_upgrade2023_realistic_v2_2023D21noPU-v2/10000/F2B83850-E6CE-E711-8185-0CC47A78A4B0.root'
     )
 )
 
@@ -102,7 +102,7 @@
 process.siPixelClustersPreSplitting.ElectronPerADCGain  = cms.double(135.)
 
 from Configuration.AlCa.GlobalTag import GlobalTag
-process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:phase2_realistic_T15', '')
+process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:phase2_realistic', '')
 
 # Path and EndPath definitions
 process.digitisation_step = cms.Path(process.pdigi_valid)
@@ -132,6 +132,10 @@
 process = customizeHLTforMC(process)
 
 # End of customisation functions
+#do not add changes to your config after this point (unless you know what you are doing)
+from FWCore.ParameterSet.Utilities import convertToUnscheduled
+process=convertToUnscheduled(process)
+
 
 # Customisation from command line
 
diff --git a/Validation/Configuration/python/globalValidation_cff.py b/Validation/Configuration/python/globalValidation_cff.py
index 0a98acf89ad69..a34d575a56a15 100644
--- a/Validation/Configuration/python/globalValidation_cff.py
+++ b/Validation/Configuration/python/globalValidation_cff.py
@@ -33,7 +33,6 @@
 from Validation.RecoParticleFlow.PFMuonValidation_cff import *
 from Validation.RecoParticleFlow.PFElectronValidation_cff import *
 from Validation.RecoParticleFlow.PFJetResValidation_cff import *
-from Validation.RecoParticleFlow.PFClusterValidation_cff import *
 from Validation.RPCRecHits.rpcRecHitValidation_cfi import *
 from Validation.DTRecHits.DTRecHitQuality_cfi import *
 from Validation.RecoTau.DQMMCValidation_cfi import *
@@ -42,10 +41,8 @@
 from DQMOffline.RecoB.dqmAnalyzer_cff import *
 from Validation.RecoB.BDHadronTrackValidation_cff import *
 from Validation.Configuration.hgcalSimValid_cff import *
-from Validation.Configuration.mtdSimValid_cff import *
 from Validation.SiOuterTrackerV.OuterTrackerSourceConfigV_cff import *
-from Validation.Configuration.ecalSimValid_cff import *
-from Validation.SiTrackerPhase2V.Phase2TrackerValidationFirstStep_cff import *
+
 
 # filter/producer "pre-" sequence for globalValidation
 globalPrevalidationTracking = cms.Sequence(
@@ -77,6 +74,8 @@
                                  + hcalSimHitsValidationSequence
                                  + hcaldigisValidationSequence
                                  + hcalSimHitStudy
+                                 + hcalRecHitsValidationSequence
+                                 + calotowersValidationSequence
                                  + validSimHit+muondtdigianalyzer
                                  + cscDigiValidation
                                  + validationMuonRPCDigis
@@ -92,7 +91,6 @@
                                  + pfElectronValidationSequence
                                  + pfJetResValidationSequence
                                  + pfMuonValidationSequence
-                                 + pfClusterValidationSequence
                                  + rpcRecHitValidation_step
                                  + dtLocalRecoValidation_no2D
                                  + pfTauRunDQMValidation
@@ -132,7 +130,6 @@
     + vertexValidationTrackingOnly
 )
 globalValidationTrackingOnly = cms.Sequence()
-
 # Pixel tracking only validation
 globalPrevalidationPixelTrackingOnly = cms.Sequence(
       simHitTPAssocProducer
@@ -142,63 +139,26 @@
 globalValidationPixelTrackingOnly = cms.Sequence()
 
 globalValidationJetMETonly = cms.Sequence(
-      JetValidation
-    + METValidation
+                                   JetValidation
+                                 + METValidation
 )
 
 globalPrevalidationJetMETOnly = cms.Sequence(
-      jetPreValidSeq
-    + metPreValidSeq
-)
-
-# ECAL local reconstruction
-globalPrevalidationECAL = cms.Sequence()
-globalPrevalidationECALOnly = cms.Sequence(
-      baseCommonPreValidation
-    + globalPrevalidationECAL
+				   jetPreValidSeq
+				  +metPreValidSeq
 )
 
-globalValidationECAL = cms.Sequence(
-      ecalSimHitsValidationSequence
-    + ecalDigisValidationSequence
-    + ecalRecHitsValidationSequence
-    + ecalClustersValidationSequence
-)
-globalValidationECALOnly = cms.Sequence(
-      ecalSimHitsValidationSequence
-    + ecalDigisValidationSequence
-    + ecalRecHitsValidationSequence
-    + pfClusterCaloOnlyValidationSequence
-)
-
-# HCAL local reconstruction
 globalPrevalidationHCAL = cms.Sequence()
 
-globalPrevalidationHCALOnly = cms.Sequence(
-      baseCommonPreValidation
-    + globalPrevalidationHCAL
-)
-
-hcalRecHitsOnlyValidationSequence = hcalRecHitsValidationSequence.copyAndExclude([NoiseRatesValidation])
-
 globalValidationHCAL = cms.Sequence(
       hcalSimHitsValidationSequence
     + hcaldigisValidationSequence
     + hcalSimHitStudy
-)
-
-globalValidationHCALOnly = cms.Sequence(
-      hcalSimHitsValidationSequence
-    + hcaldigisValidationSequence
-    + hcalSimHitStudy
-    + hcalRecHitsOnlyValidationSequence
-    + pfClusterCaloOnlyValidationSequence
+    + hcalRecHitsValidationSequence
+    + calotowersValidationSequence
 )
 
 globalValidationHGCal = cms.Sequence(hgcalValidation)
-globalPrevalidationHGCal = cms.Sequence(hgcalAssociators)
-
-globalValidationMTD = cms.Sequence()
 
 globalValidationOuterTracker = cms.Sequence(OuterTrackerSourceV)
 
@@ -226,11 +186,8 @@
 _run3_globalValidation += gemSimValid
 
 _phase2_globalValidation = _run3_globalValidation.copy()
-_phase2_globalValidation += trackerphase2ValidationSource
 _phase2_globalValidation += me0SimValid
 
-_phase2_ge0_globalValidation = _run3_globalValidation.copy()
-_phase2_ge0_globalValidation += trackerphase2ValidationSource
 
 from Configuration.Eras.Modifier_run2_GEM_2017_cff import run2_GEM_2017
 run2_GEM_2017.toReplaceWith( globalValidation, _run3_globalValidation )
@@ -238,10 +195,5 @@
 run3_GEM.toReplaceWith( globalValidation, _run3_globalValidation )
 from Configuration.Eras.Modifier_phase2_muon_cff import phase2_muon
 phase2_muon.toReplaceWith( globalValidation, _phase2_globalValidation )
-from Configuration.Eras.Modifier_phase2_GE0_cff import phase2_GE0
-phase2_GE0.toReplaceWith( globalValidation, _phase2_ge0_globalValidation )
-phase2_GE0.toReplaceWith( globalPrevalidationMuons, globalPrevalidationMuons.copyAndExclude([me0SimValid]) )
-from Configuration.ProcessModifiers.pp_on_AA_cff import pp_on_AA
-pp_on_AA.toReplaceWith(globalValidation, globalValidation.copyAndExclude([pfTauRunDQMValidation]))
-from Configuration.Eras.Modifier_phase2_timing_layer_cff import phase2_timing_layer
-phase2_timing_layer.toReplaceWith(globalValidationMTD, cms.Sequence(mtdSimValid+mtdDigiValid+mtdRecoValid))
+from Configuration.Eras.Modifier_pp_on_AA_2018_cff import pp_on_AA_2018
+pp_on_AA_2018.toReplaceWith(globalValidation, globalValidation.copyAndExclude([pfTauRunDQMValidation]))
diff --git a/Validation/Configuration/python/postValidation_cff.py b/Validation/Configuration/python/postValidation_cff.py
index 8f310eacfce2e..a9acd3c59a1ee 100644
--- a/Validation/Configuration/python/postValidation_cff.py
+++ b/Validation/Configuration/python/postValidation_cff.py
@@ -21,7 +21,7 @@
 from Validation.SiPixelPhase1ConfigV.SiPixelPhase1OfflineDQM_harvestingV_cff import *
 from DQMOffline.RecoB.dqmCollector_cff import *
 from Validation.SiOuterTrackerV.SiOuterTrackerMCHarvesting_cff import *
-from Validation.SiTrackerPhase2V.Phase2TrackerMCHarvesting_cff import *
+
 
 postValidationTracking = cms.Sequence(
       postProcessorTrackSequence
@@ -67,7 +67,6 @@
 from Validation.MuonGEMRecHits.PostProcessor_cff import *
 from Validation.MuonME0Validation.PostProcessor_cff import *
 from Validation.HGCalValidation.HGCalPostProcessor_cff import *
-from Validation.MtdValidation.MtdPostProcessor_cff import *
 
 postValidation_common = cms.Sequence()
 
@@ -90,8 +89,6 @@
     METPostProcessor
 )
 
-postValidation_ECAL = cms.Sequence()
-
 postValidation_HCAL = cms.Sequence(
       hcalSimHitsPostProcessor
     + hcaldigisPostProcessor
@@ -127,11 +124,6 @@
 _phase2_postValidation += hgcalPostProcessor
 _phase2_postValidation += MuonME0DigisPostProcessors
 _phase2_postValidation += MuonME0SegPostProcessors
-_phase2_postValidation += trackerphase2ValidationHarvesting
-
-_phase2_ge0_postValidation = _run3_postValidation.copy()
-_phase2_ge0_postValidation += hgcalPostProcessor
-_phase2_ge0_postValidation += trackerphase2ValidationHarvesting
 
 from Configuration.Eras.Modifier_run2_GEM_2017_cff import run2_GEM_2017
 run2_GEM_2017.toReplaceWith( postValidation, _run3_postValidation )
@@ -139,6 +131,3 @@
 run3_GEM.toReplaceWith( postValidation, _run3_postValidation )
 from Configuration.Eras.Modifier_phase2_hgcal_cff import phase2_hgcal
 phase2_hgcal.toReplaceWith( postValidation, _phase2_postValidation )
-from Configuration.Eras.Modifier_phase2_GE0_cff import phase2_GE0
-(phase2_GE0 & phase2_hgcal).toReplaceWith( postValidation, _phase2_ge0_postValidation )
-phase2_GE0.toReplaceWith( postValidation_muons, postValidation_muons.copyAndExclude([MuonME0DigisPostProcessors, MuonME0SegPostProcessors]) )