diff --git a/CUDADataFormats/HcalDigi/BuildFile.xml b/CUDADataFormats/HcalDigi/BuildFile.xml
new file mode 100644
index 0000000000000..fb871f16b69f0
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/BuildFile.xml
@@ -0,0 +1,8 @@
+<use name="DataFormats/Common"/>
+<use name="CUDADataFormats/CaloCommon"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
+
+<export>
+  <lib   name="1"/>
+</export>
diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
new file mode 100644
index 0000000000000..e2f4bf0848e94
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
@@ -0,0 +1,160 @@
+#ifndef CUDADataFormats_HcalDigi_interface_DigiCollection_h
+#define CUDADataFormats_HcalDigi_interface_DigiCollection_h
+
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
+
+namespace hcal {
+
+  // FLAVOR_HE_QIE11 = 1; Phase1 upgrade
+  struct Flavor1 {
+    static constexpr int WORDS_PER_SAMPLE = 1;
+    static constexpr int SAMPLES_PER_WORD = 1;
+    static constexpr int HEADER_WORDS = 1;
+
+    static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+    static constexpr uint8_t tdc(uint16_t const* const sample_start) { return (*sample_start >> 8) & 0x3f; }
+    static constexpr uint8_t soibit(uint16_t const* const sample_start) { return (*sample_start >> 14) & 0x1; }
+  };
+
+  // FLAVOR_HB_QIE11 = 3; Phase1 upgrade
+  struct Flavor3 {
+    static constexpr int WORDS_PER_SAMPLE = 1;
+    static constexpr int SAMPLES_PER_WORD = 1;
+    static constexpr int HEADER_WORDS = 1;
+
+    static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+    static constexpr uint8_t tdc(uint16_t const* const sample_start) { return ((*sample_start >> 8) & 0x3); }
+    static constexpr uint8_t soibit(uint16_t const* const sample_start) { return ((*sample_start >> 14) & 0x1); }
+    static constexpr uint8_t capid(uint16_t const* const sample_start) { return ((*sample_start >> 10) & 0x3); }
+  };
+
+  // FLAVOR_HB_QIE10 = 5; Phase0
+  struct Flavor5 {
+    static constexpr float WORDS_PER_SAMPLE = 0.5;
+    static constexpr int SAMPLES_PER_WORD = 2;
+    static constexpr int HEADER_WORDS = 1;
+
+    static constexpr uint8_t adc(uint16_t const* const sample_start, uint8_t const shifter) {
+      return ((*sample_start >> shifter * 8) & 0x7f);
+    }
+  };
+
+  template <typename Flavor>
+  constexpr uint8_t capid_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    auto const capid_first = (*dfstart >> 8) & 0x3;
+    return (capid_first + sample) & 0x3;  // same as % 4
+  }
+
+  template <>
+  constexpr uint8_t capid_for_sample<Flavor3>(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor3::capid(dfstart + Flavor3::HEADER_WORDS + sample * Flavor3::WORDS_PER_SAMPLE);
+  }
+
+  template <typename Flavor>
+  constexpr uint8_t soibit_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor::soibit(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+  }
+
+  template <typename Flavor>
+  constexpr uint8_t adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor::adc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+  }
+
+  template <typename Flavor>
+  constexpr uint8_t tdc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+    return Flavor::tdc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+  }
+
+  template <>
+  constexpr uint8_t adc_for_sample<Flavor5>(uint16_t const* const dfstart, uint32_t const sample) {
+    // avoid using WORDS_PER_SAMPLE and simply shift
+    return Flavor5::adc(dfstart + Flavor5::HEADER_WORDS + (sample >> 1), sample % 2);
+  }
+
+  template <typename Flavor>
+  constexpr uint32_t compute_stride(uint32_t const nsamples) {
+    return static_cast<uint32_t>(nsamples * Flavor::WORDS_PER_SAMPLE) + Flavor::HEADER_WORDS;
+  }
+
+  template <typename Flavor>
+  constexpr uint32_t compute_nsamples(uint32_t const nwords) {
+    if constexpr (Flavor::SAMPLES_PER_WORD >= 1)
+      return (nwords - Flavor::HEADER_WORDS) * Flavor::SAMPLES_PER_WORD;
+    else
+      return (nwords - Flavor::HEADER_WORDS) / Flavor::WORDS_PER_SAMPLE;
+  }
+
+  //
+  template <typename StoragePolicy>
+  struct DigiCollectionBase : public ::calo::common::AddSize<typename StoragePolicy::TagType> {
+    DigiCollectionBase() = default;
+    DigiCollectionBase(DigiCollectionBase const&) = default;
+    DigiCollectionBase& operator=(DigiCollectionBase const&) = default;
+
+    DigiCollectionBase(DigiCollectionBase&&) = default;
+    DigiCollectionBase& operator=(DigiCollectionBase&&) = default;
+
+    template <typename T = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type resize(std::size_t size) {
+      ids.resize(size);
+      data.resize(size * stride);
+    }
+
+    template <typename T = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type reserve(std::size_t size) {
+      ids.reserve(size);
+      data.reserve(size * stride);
+    }
+
+    template <typename T = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type clear() {
+      ids.clear();
+      data.clear();
+    }
+
+    typename StoragePolicy::template StorageSelector<uint32_t>::type ids;
+    typename StoragePolicy::template StorageSelector<uint16_t>::type data;
+    uint32_t stride{0};
+  };
+
+  template <typename Flavor, typename StoragePolicy>
+  struct DigiCollection : public DigiCollectionBase<StoragePolicy> {
+    using DigiCollectionBase<StoragePolicy>::DigiCollectionBase;
+  };
+
+  // NOTE: base ctors will not be available
+  template <typename StoragePolicy>
+  struct DigiCollection<Flavor5, StoragePolicy> : public DigiCollectionBase<StoragePolicy> {
+    DigiCollection() = default;
+
+    DigiCollection(DigiCollection const&) = default;
+    DigiCollection& operator=(DigiCollection const&) = default;
+
+    DigiCollection(DigiCollection&&) = default;
+    DigiCollection& operator=(DigiCollection&&) = default;
+
+    template <typename T = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type resize(std::size_t size) {
+      DigiCollectionBase<StoragePolicy>::resize(size);
+      npresamples.resize(size);
+    }
+
+    template <typename T = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type reserve(std::size_t size) {
+      DigiCollectionBase<StoragePolicy>::reserve(size);
+      npresamples.reserve(size);
+    }
+
+    template <typename T = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<T, ::calo::common::tags::Vec>::value, void>::type clear() {
+      DigiCollectionBase<StoragePolicy>::clear();
+      npresamples.clear();
+    }
+
+    // add npresamples member
+    typename StoragePolicy::template StorageSelector<uint8_t>::type npresamples;
+  };
+
+}  // namespace hcal
+
+#endif  // CUDADataFormats_HcalDigi_interface_DigiCollection_h
diff --git a/CUDADataFormats/HcalDigi/src/classes.h b/CUDADataFormats/HcalDigi/src/classes.h
new file mode 100644
index 0000000000000..8c4a20318928e
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/src/classes.h
@@ -0,0 +1,3 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/HcalDigi/src/classes_def.xml b/CUDADataFormats/HcalDigi/src/classes_def.xml
new file mode 100644
index 0000000000000..71997eb59ba61
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/src/classes_def.xml
@@ -0,0 +1,36 @@
+<lcgdict>
+    <class name="hcal::DigiCollectionBase<calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollectionBase<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+
+    <class name="hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<std::allocator>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<std::allocator>>" />
+            
+    <class name="hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+    <class name="hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>" />
+
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::ViewStoragePolicy>>" persistent="false" />
+            
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>" persistent="false" />
+            
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::ViewStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::ViewStoragePolicy>>>" persistent="false" />
+                
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>>" persistent="false" />
+
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<std::allocator>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<std::allocator>>>" />
+                
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+    <class name="edm::Wrapper<hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>" />
+</lcgdict>
diff --git a/CUDADataFormats/HcalRecHitSoA/BuildFile.xml b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml
new file mode 100644
index 0000000000000..245701de5fdb0
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml
@@ -0,0 +1,7 @@
+<use name="DataFormats/Common"/>
+<use name="CUDADataFormats/Common" />
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<export>
+  <lib   name="1"/>
+</export>
diff --git a/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
new file mode 100644
index 0000000000000..424b2c0813b4c
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
@@ -0,0 +1,38 @@
+#ifndef CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h
+#define CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h
+
+#include <vector>
+
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+namespace hcal {
+
+  template <typename StoragePolicy>
+  struct RecHitCollection : public ::calo::common::AddSize<typename StoragePolicy::TagType> {
+    RecHitCollection() = default;
+    RecHitCollection(const RecHitCollection&) = default;
+    RecHitCollection& operator=(const RecHitCollection&) = default;
+
+    RecHitCollection(RecHitCollection&&) = default;
+    RecHitCollection& operator=(RecHitCollection&&) = default;
+
+    typename StoragePolicy::template StorageSelector<float>::type energy;
+    typename StoragePolicy::template StorageSelector<float>::type chi2;
+    typename StoragePolicy::template StorageSelector<float>::type energyM0;
+    typename StoragePolicy::template StorageSelector<float>::type timeM0;
+    typename StoragePolicy::template StorageSelector<uint32_t>::type did;
+
+    template <typename U = typename StoragePolicy::TagType>
+    typename std::enable_if<std::is_same<U, ::calo::common::tags::Vec>::value, void>::type resize(size_t size) {
+      energy.resize(size);
+      chi2.resize(size);
+      energyM0.resize(size);
+      timeM0.resize(size);
+      did.resize(size);
+    }
+  };
+
+}  // namespace hcal
+
+#endif  // RecoLocalCalo_HcalRecAlgos_interface_RecHitCollection_h
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes.h b/CUDADataFormats/HcalRecHitSoA/src/classes.h
new file mode 100644
index 0000000000000..a13782165c413
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes.h
@@ -0,0 +1,3 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
new file mode 100644
index 0000000000000..71dd18a7daddb
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
@@ -0,0 +1,13 @@
+<lcgdict>
+    <class name="hcal::RecHitCollection<calo::common::VecStoragePolicy<std::allocator>>"/>
+    <class name="edm::Wrapper<hcal::RecHitCollection<calo::common::VecStoragePolicy<std::allocator>>>"/>
+
+    <class name="hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>"/>
+    <class name="edm::Wrapper<hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>>"/>
+
+    <class name="cms::cuda::Product<hcal::RecHitCollection<calo::common::ViewStoragePolicy>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::RecHitCollection<calo::common::ViewStoragePolicy>>>" persistent="false" />
+                
+    <class name="cms::cuda::Product<hcal::RecHitCollection<calo::common::DevStoragePolicy>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<hcal::RecHitCollection<calo::common::DevStoragePolicy>>>" persistent="false" />
+</lcgdict>
diff --git a/EventFilter/HcalRawToDigi/bin/BuildFile.xml b/EventFilter/HcalRawToDigi/bin/BuildFile.xml
new file mode 100644
index 0000000000000..7a24968df89c8
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/bin/BuildFile.xml
@@ -0,0 +1,8 @@
+<bin name="makeHcalRaw2DigiGpuValidationPlots" file="makeHcalRaw2DigiGpuValidationPlots.cpp">
+    <use name="root"/>
+    <use name="rootgraphics"/>
+    <use name="DataFormats/Common"/>
+    <use name="DataFormats/HcalDigi"/>
+    <use name="DataFormats/HcalDetId"/>
+    <use name="cuda"/>
+</bin>
diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..039c38dd9df16
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
@@ -0,0 +1,386 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TPaveStats.h>
+#include <TTree.h>
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+
+#define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last)
+
+#define CREATE_HIST_2D(varname, nbins, first, last) \
+  auto varname = new TH2D(#varname, #varname, nbins, first, last, nbins, first, last)
+
+QIE11DigiCollection filterQIE11(QIE11DigiCollection const& coll) {
+  QIE11DigiCollection out;
+  out.reserve(coll.size());
+
+  for (uint32_t i = 0; i < coll.size(); i++) {
+    auto const df = coll[i];
+    auto const id = HcalDetId{df.id()};
+    if (id.subdetId() != HcalEndcap)
+      continue;
+
+    out.push_back(QIE11DataFrame{df});
+  }
+
+  return out;
+}
+
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
+    exit(0);
+  }
+
+  auto filterf01HE = [](QIE11DigiCollection const& coll) {
+    QIE11DigiCollection out{coll.samples(), coll.subdetId()};
+    out.reserve(coll.size());
+
+    for (uint32_t i = 0; i < coll.size(); i++) {
+      auto const df = QIE11DataFrame{coll[i]};
+      auto const id = HcalDetId{df.id()};
+      if ((df.flavor() == 0 or df.flavor() == 1) and id.subdetId() == HcalEndcap)
+        out.push_back(df);
+    }
+
+    return out;
+  };
+
+  auto filterf3HB = [](QIE11DigiCollection const& coll) {
+    QIE11DigiCollection out{coll.samples(), coll.subdetId()};
+    out.reserve(coll.size());
+
+    for (uint32_t i = 0; i < coll.size(); i++) {
+      auto const df = QIE11DataFrame{coll[i]};
+      auto const did = HcalDetId{df.id()};
+      if (df.flavor() == 3 and did.subdetId() == HcalBarrel)
+        out.push_back(df);
+    }
+
+    return out;
+  };
+
+  // branches to use
+  using Collectionf01 =
+      hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using Collectionf5 =
+      hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using Collectionf3 =
+      hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  edm::Wrapper<Collectionf01>* wgpuf01he = nullptr;
+  edm::Wrapper<Collectionf5>* wgpuf5hb = nullptr;
+  edm::Wrapper<Collectionf3>* wgpuf3hb = nullptr;
+  edm::Wrapper<QIE11DigiCollection>* wcpuf01he = nullptr;
+  edm::Wrapper<HBHEDigiCollection>* wcpuf5hb = nullptr;
+
+  std::string inFileName{argv[1]};
+  std::string outFileName{argv[2]};
+
+  // prep output
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  CREATE_HIST_1D(hADCf01HEGPU, 256, 0, 256);
+  CREATE_HIST_1D(hADCf01HECPU, 256, 0, 256);
+  CREATE_HIST_1D(hADCf5HBGPU, 128, 0, 128);
+  CREATE_HIST_1D(hADCf5HBCPU, 128, 0, 128);
+  CREATE_HIST_1D(hADCf3HBGPU, 256, 0, 256);
+  CREATE_HIST_1D(hADCf3HBCPU, 256, 0, 256);
+  CREATE_HIST_1D(hTDCf01HEGPU, 64, 0, 64);
+  CREATE_HIST_1D(hTDCf01HECPU, 64, 0, 64);
+
+  CREATE_HIST_2D(hADCf01HEGPUvsCPU, 256, 0, 256);
+  CREATE_HIST_2D(hADCf3HBGPUvsCPU, 256, 0, 256);
+  CREATE_HIST_2D(hADCf5HBGPUvsCPU, 128, 0, 128);
+  CREATE_HIST_2D(hTDCf01HEGPUvsCPU, 64, 0, 64);
+  CREATE_HIST_2D(hTDCf3HBGPUvsCPU, 4, 0, 4);
+
+  // prep input
+  TFile rfin{inFileName.c_str()};
+  TTree* rt = (TTree*)rfin.Get("Events");
+  rt->SetBranchAddress("QIE11DataFrameHcalDataFrameContainer_hcalDigis__RECO.", &wcpuf01he);
+  rt->SetBranchAddress("HBHEDataFramesSorted_hcalDigis__RECO.", &wcpuf5hb);
+  rt->SetBranchAddress(
+      "hcalFlavor5calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "f5HBDigis_RECO.",
+      &wgpuf5hb);
+  rt->SetBranchAddress(
+      "hcalFlavor1calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "f01HEDigis_RECO.",
+      &wgpuf01he);
+  rt->SetBranchAddress(
+      "hcalFlavor3calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+      "f3HBDigis_RECO.",
+      &wgpuf3hb);
+
+  // accumulate
+  auto const nentries = rt->GetEntries();
+  std::cout << ">>> nentries = " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+
+    auto const& f01HEProduct = wgpuf01he->bareProduct();
+    auto const& f5HBProduct = wgpuf5hb->bareProduct();
+    auto const& f3HBProduct = wgpuf3hb->bareProduct();
+    auto const& qie11Product = wcpuf01he->bareProduct();
+    auto const qie11Filteredf01 = filterf01HE(qie11Product);
+    auto const qie11Filteredf3 = filterf3HB(qie11Product);
+    auto const& qie8Product = wcpuf5hb->bareProduct();
+
+    auto const ngpuf01he = f01HEProduct.ids.size();
+    auto const ngpuf5hb = f5HBProduct.ids.size();
+    auto const ngpuf3hb = f3HBProduct.ids.size();
+    auto const ncpuf01he = qie11Filteredf01.size();
+    auto const ncpuf5hb = qie8Product.size();
+    auto const ncpuf3hb = qie11Filteredf3.size();
+
+    /*
+        printf("ngpuf01he = %u nqie11 = %u ncpuf01he = %u ngpuf5hb = %u ncpuf5hb = %u\n",
+            f01HEProduct.size(), qie11Product.size(), qie11Filtered.size(), 
+            f5HBProduct.size(),
+            static_cast<uint32_t>(qie8Product.size()));
+            */
+
+    if (ngpuf01he != ncpuf01he) {
+      std::cerr << "*** mismatch in number of flavor 01 digis for event " << ie << std::endl
+                << ">>> ngpuf01he = " << ngpuf01he << std::endl
+                << ">>> ncpuf01he = " << ncpuf01he << std::endl;
+    }
+
+    {
+      auto const& idsgpu = f01HEProduct.ids;
+      auto const& datagpu = f01HEProduct.data;
+
+      for (uint32_t ich = 0; ich < ncpuf01he; ich++) {
+        auto const cpudf = QIE11DataFrame{qie11Filteredf01[ich]};
+        auto const cpuid = cpudf.id();
+        auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+
+        if (iter2idgpu == idsgpu.end()) {
+          std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+          continue;
+        }
+
+        // FIXME: cna fail...
+        assert(*iter2idgpu == cpuid);
+
+        auto const ptrdiff = iter2idgpu - idsgpu.begin();
+        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor1>(f01HEProduct.stride);
+        auto const nsamples_cpu = qie11Filteredf01.samples();
+        assert(static_cast<uint32_t>(nsamples_cpu) == nsamples_gpu);
+
+        uint32_t ichgpu = ptrdiff;
+        uint32_t offset = ichgpu * f01HEProduct.stride;
+        uint16_t const* df_start = datagpu.data() + offset;
+        for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+          auto const cpuadc = cpudf[sample].adc();
+          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor1>(df_start, sample);
+          auto const cputdc = cpudf[sample].tdc();
+          auto const gputdc = hcal::tdc_for_sample<hcal::Flavor1>(df_start, sample);
+          auto const cpucapid = cpudf[sample].capid();
+          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor1>(df_start, sample);
+
+          hADCf01HEGPU->Fill(gpuadc);
+          hADCf01HECPU->Fill(cpuadc);
+          hTDCf01HEGPU->Fill(gputdc);
+          hTDCf01HECPU->Fill(cputdc);
+          hADCf01HEGPUvsCPU->Fill(cpuadc, gpuadc);
+          hTDCf01HEGPUvsCPU->Fill(cputdc, gputdc);
+
+          // At RAW Decoding level there must not be any mistmatches
+          // in the adc values at all!
+          assert(static_cast<uint8_t>(cpuadc) == gpuadc);
+          assert(static_cast<uint8_t>(cputdc) == gputdc);
+          assert(static_cast<uint8_t>(cpucapid) == gpucapid);
+        }
+      }
+    }
+
+    if (ngpuf3hb != ncpuf3hb) {
+      std::cerr << "*** mismatch in number of flavor 3 digis for event " << ie << std::endl
+                << ">>> ngpuf01he = " << ngpuf3hb << std::endl
+                << ">>> ncpuf01he = " << ncpuf3hb << std::endl;
+    }
+
+    {
+      auto const& idsgpu = f3HBProduct.ids;
+      auto const& datagpu = f3HBProduct.data;
+
+      for (uint32_t ich = 0; ich < ncpuf3hb; ich++) {
+        auto const cpudf = QIE11DataFrame{qie11Filteredf3[ich]};
+        auto const cpuid = cpudf.id();
+        auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+
+        if (iter2idgpu == idsgpu.end()) {
+          std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+          continue;
+        }
+
+        // FIXME: cna fail...
+        assert(*iter2idgpu == cpuid);
+
+        auto const ptrdiff = iter2idgpu - idsgpu.begin();
+        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor3>(f3HBProduct.stride);
+        auto const nsamples_cpu = qie11Filteredf3.samples();
+        assert(static_cast<uint32_t>(nsamples_cpu) == nsamples_gpu);
+
+        uint32_t ichgpu = ptrdiff;
+        uint32_t offset = ichgpu * f3HBProduct.stride;
+        uint16_t const* df_start = datagpu.data() + offset;
+        for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+          auto const cpuadc = cpudf[sample].adc();
+          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor3>(df_start, sample);
+          auto const cputdc = cpudf[sample].tdc();
+          auto const gputdc = hcal::tdc_for_sample<hcal::Flavor3>(df_start, sample);
+
+          hADCf3HBGPU->Fill(gpuadc);
+          hADCf3HBCPU->Fill(cpuadc);
+          hADCf3HBGPUvsCPU->Fill(cpuadc, gpuadc);
+          hTDCf3HBGPUvsCPU->Fill(cputdc, gputdc);
+
+          // At RAW Decoding level there must not be any mistmatches
+          // in the adc values at all!
+          assert(static_cast<uint8_t>(cpuadc) == gpuadc);
+          assert(static_cast<uint8_t>(cputdc) == gputdc);
+        }
+      }
+    }
+
+    if (ngpuf5hb != ncpuf5hb) {
+      std::cerr << "*** mismatch in number of flavor 5 digis for event " << ie << std::endl
+                << ">>> ngpuf5hb = " << ngpuf5hb << std::endl
+                << ">>> ncpuf5hb = " << ncpuf5hb << std::endl;
+    }
+
+    {
+      auto const& idsgpu = f5HBProduct.ids;
+      auto const& datagpu = f5HBProduct.data;
+      for (uint32_t i = 0; i < ncpuf5hb; i++) {
+        auto const cpudf = qie8Product[i];
+        auto const cpuid = cpudf.id().rawId();
+        auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+        if (iter2idgpu == idsgpu.end()) {
+          std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+          continue;
+        }
+
+        assert(*iter2idgpu == cpuid);
+
+        auto const ptrdiff = iter2idgpu - idsgpu.begin();
+        auto const nsamples_gpu = hcal::compute_nsamples<hcal::Flavor5>(f5HBProduct.stride);
+        auto const nsamples_cpu = qie8Product[0].size();
+        assert(static_cast<uint32_t>(nsamples_cpu) == nsamples_gpu);
+
+        uint32_t offset = ptrdiff * f5HBProduct.stride;
+        uint16_t const* df_start = datagpu.data() + offset;
+        for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+          auto const cpuadc = cpudf.sample(sample).adc();
+          auto const gpuadc = hcal::adc_for_sample<hcal::Flavor5>(df_start, sample);
+          auto const cpucapid = cpudf.sample(sample).capid();
+          auto const gpucapid = hcal::capid_for_sample<hcal::Flavor1>(df_start, sample);
+
+          hADCf5HBGPU->Fill(gpuadc);
+          hADCf5HBCPU->Fill(cpuadc);
+          hADCf5HBGPUvsCPU->Fill(cpuadc, gpuadc);
+
+          // the must for us at RAW Decoding stage
+          assert(static_cast<uint8_t>(cpuadc) == gpuadc);
+          assert(static_cast<uint8_t>(cpucapid) == gpucapid);
+        }
+      }
+    }
+  }
+
+  {
+    TCanvas c{"plots", "plots", 4200, 6200};
+    c.Divide(3, 3);
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hADCf01HECPU->SetLineColor(kBlack);
+      hADCf01HECPU->SetLineWidth(1.);
+      hADCf01HECPU->Draw("");
+      hADCf01HEGPU->SetLineColor(kBlue);
+      hADCf01HEGPU->SetLineWidth(1.);
+      hADCf01HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCf01HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(2);
+    {
+      gPad->SetLogy();
+      hADCf5HBCPU->SetLineColor(kBlack);
+      hADCf5HBCPU->SetLineWidth(1.);
+      hADCf5HBCPU->Draw("");
+      hADCf5HBGPU->SetLineColor(kBlue);
+      hADCf5HBGPU->SetLineWidth(1.);
+      hADCf5HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCf5HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(3);
+    {
+      gPad->SetLogy();
+      hADCf3HBCPU->SetLineColor(kBlack);
+      hADCf3HBCPU->SetLineWidth(1.);
+      hADCf3HBCPU->Draw("");
+      hADCf3HBGPU->SetLineColor(kBlue);
+      hADCf3HBGPU->SetLineWidth(1.);
+      hADCf3HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCf3HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(4);
+    hADCf01HEGPUvsCPU->Draw("colz");
+    c.cd(5);
+    hADCf5HBGPUvsCPU->Draw("colz");
+    c.cd(6);
+    hADCf3HBGPUvsCPU->Draw("colz");
+    c.cd(7);
+    {
+      gPad->SetLogy();
+      hTDCf01HECPU->SetLineColor(kBlack);
+      hTDCf01HECPU->SetLineWidth(1.);
+      hTDCf01HECPU->Draw("");
+      hTDCf01HEGPU->SetLineColor(kBlue);
+      hTDCf01HEGPU->SetLineWidth(1.);
+      hTDCf01HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hTDCf01HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(8);
+    hTDCf01HEGPUvsCPU->Draw("colz");
+    c.cd(9);
+    hTDCf3HBGPUvsCPU->Draw("colz");
+
+    c.SaveAs("plots.pdf");
+  }
+
+  rfin.Close();
+  rfout.Write();
+  rfout.Close();
+}
diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
index ccf6a061119c2..3077a68a665e4 100644
--- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
@@ -1,16 +1,27 @@
+<use name="boost"/>
+<use name="zlib"/>
+<use name="CalibFormats/HcalObjects"/>
+<use name="CondFormats/HcalObjects"/>
+<use name="DataFormats/FEDRawData"/>
 <use name="DataFormats/HcalDetId"/>
 <use name="DataFormats/HcalDigi"/>
-<use name="DataFormats/FEDRawData"/>
-<use name="CondFormats/HcalObjects"/>
-<use name="CalibFormats/HcalObjects"/>
+<use name="EventFilter/HcalRawToDigi"/>
 <use name="FWCore/Framework"/>
 <use name="FWCore/MessageLogger"/>
-<use name="boost"/>
-<use name="zlib"/>
-<use name="EventFilter/HcalRawToDigi"/>
 <flags EDM_PLUGIN="1"/>
+
 <library file="HcalCalibFEDSelector.cc,HcalCalibTypeFilter.cc,HcalDigiToRaw.cc,HcalEmptyEventFilter.cc,HcalHistogramRawToDigi.cc,HcalRawToDigi.cc,modules.cc,HcalDigiToRawuHTR.cc,HcalRawToDigiFake.cc" name="EventFilterHcalRawToDigiPlugins">
 </library>
 
 <library file="HcalLaserEventFiltProducer2012.cc, HcalLaserEventFilter2012.cc,HcalLaserHFFilter2012.cc,HcalLaserHBHEFilter2012.cc,HcalLaserHBHEHFFilter2012.cc" name="EventFilterHcalRawToDigiFiltersPlugins">
 </library>
+
+<library file="HcalRawToDigiGPU.cc,DecodeGPU.cu,HcalESProducerGPUDefs.cc,ElectronicsMappingGPU.cc,HcalCPUDigisProducer.cc,HcalDigisProducerGPU.cc" name="EventFilterHcalRawToDigiGPUPlugins">
+  <use name="cuda" />
+  <use name="CUDADataFormats/Common" />
+  <use name="CUDADataFormats/CaloCommon"/>
+  <use name="CUDADataFormats/HcalDigi" />
+  <use name="HeterogeneousCore/CUDACore"/>
+  <use name="HeterogeneousCore/CUDAServices"/>
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+</library>
diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..9903b77efb341
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -0,0 +1,86 @@
+#ifndef EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
+#define EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
+
+#include <vector>
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+#include "ElectronicsMappingGPU.h"
+
+namespace hcal {
+  namespace raw {
+
+    constexpr int32_t empty_event_size = 32;
+    constexpr uint32_t utca_nfeds_max = 50;
+    constexpr uint32_t nbytes_per_fed_max = 10 * 1024;
+
+    // each collection corresponds to a particular flavor with a certain number of
+    // samples per digi
+    constexpr uint32_t numOutputCollections = 3;
+    constexpr uint8_t OutputF01HE = 0;
+    constexpr uint8_t OutputF5HB = 1;
+    constexpr uint8_t OutputF3HB = 2;
+
+    struct ConfigurationParameters {
+      uint32_t maxChannelsF01HE;
+      uint32_t maxChannelsF5HB;
+      uint32_t maxChannelsF3HB;
+      uint32_t nsamplesF01HE;
+      uint32_t nsamplesF5HB;
+      uint32_t nsamplesF3HB;
+    };
+
+    struct InputDataCPU {
+      cms::cuda::host::unique_ptr<unsigned char[]> data;
+      cms::cuda::host::unique_ptr<uint32_t[]> offsets;
+      cms::cuda::host::unique_ptr<int[]> feds;
+    };
+
+    struct OutputDataCPU {
+      cms::cuda::host::unique_ptr<uint32_t[]> nchannels;
+    };
+
+    struct ScratchDataGPU {
+      // depends on the number of output collections
+      // that is a statically known predefined number
+      cms::cuda::device::unique_ptr<uint32_t[]> pChannelsCounters;
+    };
+
+    struct OutputDataGPU {
+      DigiCollection<Flavor1, ::calo::common::DevStoragePolicy> digisF01HE;
+      DigiCollection<Flavor5, ::calo::common::DevStoragePolicy> digisF5HB;
+      DigiCollection<Flavor3, ::calo::common::DevStoragePolicy> digisF3HB;
+
+      void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) {
+        digisF01HE.data = cms::cuda::make_device_unique<uint16_t[]>(
+            config.maxChannelsF01HE * compute_stride<Flavor1>(config.nsamplesF01HE), cudaStream);
+        digisF01HE.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF01HE, cudaStream);
+
+        digisF5HB.data = cms::cuda::make_device_unique<uint16_t[]>(
+            config.maxChannelsF5HB * compute_stride<Flavor5>(config.nsamplesF5HB), cudaStream);
+        digisF5HB.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF5HB, cudaStream);
+        digisF5HB.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config.maxChannelsF5HB, cudaStream);
+
+        digisF3HB.data = cms::cuda::make_device_unique<uint16_t[]>(
+            config.maxChannelsF3HB * compute_stride<Flavor3>(config.nsamplesF3HB), cudaStream);
+        digisF3HB.ids = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannelsF3HB, cudaStream);
+      }
+    };
+
+    struct InputDataGPU {
+      cms::cuda::device::unique_ptr<unsigned char[]> data;
+      cms::cuda::device::unique_ptr<uint32_t[]> offsets;
+      cms::cuda::device::unique_ptr<int[]> feds;
+    };
+
+    struct ConditionsProducts {
+      ElectronicsMappingGPU::Product const &eMappingProduct;
+    };
+
+  }  // namespace raw
+}  // namespace hcal
+
+#endif  // EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
new file mode 100644
index 0000000000000..4f2ca85861b30
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -0,0 +1,593 @@
+#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
+#include "DataFormats/HcalDetId/interface/HcalSubdetector.h"
+#include "DataFormats/HcalDetId/interface/HcalDetId.h"
+
+#include "EventFilter/HcalRawToDigi/plugins/DecodeGPU.h"
+
+#include <cooperative_groups.h>
+using namespace cooperative_groups;
+
+namespace hcal {
+  namespace raw {
+
+    __forceinline__ __device__ char const* get_subdet_str(DetId const& did) {
+      switch (did.subdetId()) {
+        case HcalEmpty:
+          return "HcalEmpty";
+          break;
+        case HcalBarrel:
+          return "HcalBarrel";
+          break;
+        case HcalEndcap:
+          return "HcalEndcap";
+          break;
+        case HcalOuter:
+          return "HcalOuter";
+          break;
+        case HcalForward:
+          return "HcalForward";
+          break;
+        case HcalTriggerTower:
+          return "HcalTriggerTower";
+          break;
+        case HcalOther:
+          return "HcalOther";
+          break;
+        default:
+          return "Unknown";
+          break;
+      }
+
+      return "Unknown";
+    }
+
+    __forceinline__ __device__ bool is_channel_header_word(uint16_t const* ptr) {
+      uint8_t bit = (*ptr >> 15) & 0x1;
+      return bit == 1;
+    }
+
+    template <typename T>
+    constexpr bool is_power_of_two(T x) {
+      return (x != 0) && ((x & (x - 1)) == 0);
+    }
+
+    template <int NTHREADS>
+    __global__ void kernel_rawdecode_test(unsigned char const* data,
+                                          uint32_t const* offsets,
+                                          int const* feds,
+                                          uint32_t const* eid2did,
+                                          uint32_t const* eid2tid,
+                                          uint16_t* digisF01HE,
+                                          uint32_t* idsF01HE,
+                                          uint16_t* digisF5HB,
+                                          uint32_t* idsF5HB,
+                                          uint8_t* npresamplesF5HB,
+                                          uint16_t* digisF3HB,
+                                          uint32_t* idsF3HB,
+                                          uint32_t* pChannelsCounters,
+                                          uint32_t const nsamplesF01HE,
+                                          uint32_t const nsamplesF5HB,
+                                          uint32_t const nsamplesF3HB,
+                                          uint32_t const nBytesTotal) {
+      // in order to properly use cooperative groups
+      static_assert(is_power_of_two(NTHREADS) == true && NTHREADS <= 32);
+
+      thread_block_tile<NTHREADS> thread_group = tiled_partition<NTHREADS>(this_thread_block());
+
+      auto const iamc = threadIdx.x / NTHREADS;
+      auto const ifed = blockIdx.x;
+      auto const offset = offsets[ifed];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+      if (ifed > 0 || iamc > 0)
+        return;
+      printf("threadIdx.x = %d rank = %d iamc = %d\n", threadIdx.x, thread_group.thread_rank(), iamc);
+#endif
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      auto const fed = feds[ifed];
+      auto const size = ifed == gridDim.x - 1 ? nBytesTotal - offset : offsets[ifed + 1] - offset;
+      printf("ifed = %d fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
+#endif
+
+      // offset to the right raw buffer
+      uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      //
+      // fed header
+      //
+      auto const fed_header = buffer[0];
+      uint32_t const fed_id = (fed_header >> 8) & 0xfff;
+      uint32_t const bx = (fed_header >> 20) & 0xfff;
+      uint32_t const lv1 = (fed_header >> 32) & 0xffffff;
+      uint8_t const trigger_type = (fed_header >> 56) & 0xf;
+      uint8_t const bid_fed_header = (fed_header >> 60) & 0xf;
+
+      printf("fed = %d fed_id = %u bx = %u lv1 = %u trigger_type = %u bid = %u\n",
+             fed,
+             fed_id,
+             bx,
+             lv1,
+             trigger_type,
+             bid_fed_header);
+#endif
+
+      // amc 13 header
+      auto const amc13word = buffer[1];
+      uint8_t const namc = (amc13word >> 52) & 0xf;
+      if (iamc >= namc)
+        return;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint8_t const amc13version = (amc13word >> 60) & 0xf;
+      uint32_t const amc13OrbitNumber = (amc13word >> 4) & 0xffffffffu;
+      printf("fed = %d namc = %u amc13version = %u amc13OrbitNumber = %u\n", fed, namc, amc13version, amc13OrbitNumber);
+#endif
+
+      // compute the offset int to the right buffer
+      uint32_t amcoffset = 0;
+      for (uint8_t ii = 0u; ii < iamc; ii++) {
+        auto const word = buffer[2 + ii];
+        int const amcSize = (word >> 32) & 0xffffff;
+        amcoffset += amcSize;
+      }
+
+      auto const word = buffer[2 + iamc];
+      int const amcSize = (word >> 32) & 0xffffff;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint16_t const amcid = word & 0xffff;
+      int const slot = (word >> 16) & 0xf;
+      int const amcBlockNumber = (word >> 20) & 0xff;
+      printf("fed = %d amcid = %u slot = %d amcBlockNumber = %d\n", fed, amcid, slot, amcBlockNumber);
+
+      bool const amcmore = ((word >> 61) & 0x1) != 0;
+      bool const amcSegmented = ((word >> 60) & 0x1) != 0;
+      bool const amcLengthOk = ((word >> 62) & 0x1) != 0;
+      bool const amcCROk = ((word >> 56) & 0x1) != 0;
+      bool const amcDataPresent = ((word >> 58) & 0x1) != 0;
+      bool const amcDataValid = ((word >> 56) & 0x1) != 0;
+      bool const amcEnabled = ((word >> 59) & 0x1) != 0;
+      printf(
+          "fed = %d amcmore = %d amcSegmented = %d, amcLengthOk = %d amcCROk = %d\n>> amcDataPresent = %d amcDataValid "
+          "= %d amcEnabled = %d\n",
+          fed,
+          static_cast<int>(amcmore),
+          static_cast<int>(amcSegmented),
+          static_cast<int>(amcLengthOk),
+          static_cast<int>(amcCROk),
+          static_cast<int>(amcDataPresent),
+          static_cast<int>(amcDataValid),
+          static_cast<int>(amcEnabled));
+#endif
+
+      // get to the payload
+      auto const* payload64 = buffer + 2 + namc + amcoffset;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      // uhtr header v1 1st 64 bits
+      auto const payload64_w0 = payload64[0];
+#endif
+      // uhtr n bytes comes from amcSize, according to the cpu version!
+      uint32_t const data_length64 = amcSize;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint16_t bcn = (payload64_w0 >> 20) & 0xfff;
+      uint32_t evn = (payload64_w0 >> 32) & 0xffffff;
+      printf("fed = %d data_length64 = %u bcn = %u evn = %u\n", fed, data_length64, bcn, evn);
+#endif
+
+      // uhtr header v1 2nd 64 bits
+      auto const payload64_w1 = payload64[1];
+      uint8_t const uhtrcrate = payload64_w1 & 0xff;
+      uint8_t const uhtrslot = (payload64_w1 >> 8) & 0xf;
+      uint8_t const presamples = (payload64_w1 >> 12) & 0xf;
+      uint8_t const payloadFormat = (payload64_w1 >> 44) & 0xf;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+      uint16_t const orbitN = (payload64_w1 >> 16) & 0xffff;
+      uint8_t const firmFlavor = (payload64_w1 >> 32) & 0xff;
+      uint8_t const eventType = (payload64_w1 >> 40) & 0xf;
+      printf(
+          "fed = %d crate = %u slot = %u presamples = %u\n>>> orbitN = %u firmFlavor = %u eventType = %u payloadFormat "
+          "= %u\n",
+          fed,
+          uhtrcrate,
+          uhtrslot,
+          presamples,
+          orbitN,
+          firmFlavor,
+          eventType,
+          payloadFormat);
+#endif
+
+      // this should be filtering out uMNio...
+      if (payloadFormat != 1)
+        return;
+
+      // skip uhtr header words
+      auto const channelDataSize = data_length64 - 2;        // 2 uhtr header v1 words
+      auto const* channelDataBuffer64Start = payload64 + 2;  // 2 uhtr header v2 wds
+      auto const* ptr = reinterpret_cast<uint16_t const*>(channelDataBuffer64Start);
+      auto const* end = ptr + sizeof(uint64_t) / sizeof(uint16_t) * (channelDataSize - 1);
+      auto const t_rank = thread_group.thread_rank();
+
+      // iterate through the channel data
+      while (ptr != end) {
+        // this is the starting point for this thread group for this iteration
+        // with respect to this pointer every thread will move forward afterwards
+        auto const* const start_ptr = ptr;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+        thread_group.sync();
+#endif
+
+        // skip to the header word of the right channel for this thread
+        int counter = 0;
+        while (counter < thread_group.thread_rank()) {
+          // just a check for threads that land beyond the end
+          if (ptr == end)
+            break;
+
+          // move ptr one forward past header
+          if (is_channel_header_word(ptr))
+            ++ptr;
+          else {
+            // go to the next channel and do not consider this guy as a channel
+            while (ptr != end)
+              if (!is_channel_header_word(ptr))
+                ++ptr;
+              else
+                break;
+            continue;
+          }
+
+          // skip
+          while (ptr != end)
+            if (!is_channel_header_word(ptr))
+              ++ptr;
+            else
+              break;
+          counter++;
+        }
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+        thread_group.sync();
+        printf("ptr - start_ptr = %d counter = %d rank = %d\n", static_cast<int>(ptr - start_ptr), counter, t_rank);
+#endif
+
+        // when the end is near, channels will land outside of the [start_ptr, end) region
+        if (ptr != end) {
+          // for all of the flavors, these 2 guys have the same bit layout
+          uint8_t const flavor = (ptr[0] >> 12) & 0x7;
+          uint8_t const channelid = ptr[0] & 0xff;
+          auto const* const new_channel_start = ptr;
+
+          // flavor dependent stuff
+          switch (flavor) {
+            case 0:
+            case 1: {
+              // treat eid and did
+              uint8_t fiber = (channelid >> 3) & 0x1f;
+              uint8_t fchannel = channelid & 0x7;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              // remove digis not for HE
+              if (did.subdetId() != HcalEndcap)
+                break;
+
+              // count words
+              auto const* channel_header_word = ptr++;
+              while (!is_channel_header_word(ptr) && ptr != end)
+                ++ptr;
+              auto const* channel_end = ptr;  // set ptr
+              uint32_t const nwords = channel_end - channel_header_word;
+
+              // filter out this digi if nwords does not equal expected
+              auto const expected_words = compute_stride<Flavor1>(nsamplesF01HE);
+              if (nwords != expected_words)
+                break;
+
+              // inc the number of digis of this type
+              auto const pos = atomicAdd(&pChannelsCounters[OutputF01HE], 1);
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+              printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos);
+#endif
+
+              // store to global mem words for this digi
+              idsF01HE[pos] = did.rawId();
+
+              for (uint32_t iword = 0; iword < expected_words; iword++)
+                digisF01HE[pos * expected_words + iword] = channel_header_word[iword];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("nwords = %u\n", nwords);
+#endif
+
+              break;
+            }
+            case 3: {
+              // treat eid and did
+              uint8_t fiber = (channelid >> 3) & 0x1f;
+              uint8_t fchannel = channelid & 0x7;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              // remove digis not for HE
+              if (did.subdetId() != HcalBarrel)
+                break;
+
+              // count words
+              auto const* channel_header_word = ptr++;
+              while (!is_channel_header_word(ptr) && ptr != end)
+                ++ptr;
+              auto const* channel_end = ptr;  // set ptr
+              uint32_t const nwords = channel_end - channel_header_word;
+
+              // filter out this digi if nwords does not equal expected
+              auto const expected_words = compute_stride<Flavor3>(nsamplesF3HB);
+              if (nwords != expected_words)
+                break;
+
+              // inc the number of digis of this type
+              auto const pos = atomicAdd(&pChannelsCounters[OutputF3HB], 1);
+
+              // store to global mem words for this digi
+              idsF3HB[pos] = did.rawId();
+              for (uint32_t iword = 0; iword < expected_words; iword++)
+                digisF3HB[pos * expected_words + iword] = channel_header_word[iword];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("nwords = %u\n", nwords);
+#endif
+
+              break;
+            }
+            case 2: {
+              uint8_t fiber = (channelid >> 3) & 0x1f;
+              uint8_t fchannel = channelid & 0x7;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = DetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              break;
+            }
+            case 4: {
+              uint8_t link = (channelid >> 4) & 0xf;
+              uint8_t tower = channelid & 0xf;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, link, tower, true};
+              auto const did = DetId{eid2tid[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u link = %u tower = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     link,
+                     tower);
+#endif
+
+              break;
+            }
+            case 5: {
+              uint8_t fiber = (channelid >> 2) & 0x3f;
+              uint8_t fchannel = channelid & 0x3;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+                     eid.rawId(),
+                     eid.linearIndex(),
+                     did.rawId(),
+                     get_subdet_str(did));
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              // remove digis not for HB
+              if (did.subdetId() != HcalBarrel)
+                break;
+
+              // count words
+              auto const* channel_header_word = ptr++;
+              while (!is_channel_header_word(ptr) && ptr != end)
+                ++ptr;
+              auto const* channel_end = ptr;  // set ptr
+              uint32_t const nwords = channel_end - channel_header_word;
+
+              // filter out this digi if nwords does not equal expected
+              auto const expected_words = compute_stride<Flavor5>(nsamplesF5HB);
+              if (nwords != expected_words)
+                break;
+
+              // inc the number of digis of this type
+              auto const pos = atomicAdd(&pChannelsCounters[OutputF5HB], 1);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+              printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos);
+#endif
+
+              // store to global mem words for this digi
+              idsF5HB[pos] = did.rawId();
+              npresamplesF5HB[pos] = presamples;
+              for (uint32_t iword = 0; iword < expected_words; iword++)
+                digisF5HB[pos * expected_words + iword] = channel_header_word[iword];
+
+              break;
+            }
+            case 7: {
+              uint8_t const fiber = (channelid >> 2) & 0x3f;
+              uint8_t const fchannel = channelid & 0x3;
+              HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+              auto const did = DetId{eid2did[eid.linearIndex()]};
+
+              /* uncomment to check the linear index validity
+              if (eid.rawId() >= HcalElectronicsId::maxLinearIndex) {
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+                  printf("*** rawid = %u has no known det id***\n", eid.rawId());
+#endif
+                  break;
+              }
+              */
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("erawId = %u linearIndex = %u drawid = %u\n", eid.rawId(), eid.linearIndex(), did.rawId());
+              printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+                     flavor,
+                     uhtrcrate,
+                     uhtrslot,
+                     channelid,
+                     fiber,
+                     fchannel);
+#endif
+
+              break;
+            }
+            default:
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+              printf("flavor = %u crate = %u slot = %u channelid = %u\n", flavor, uhtrcrate, uhtrslot, channelid);
+#endif
+              ;
+          }
+
+          // skip to the next word in case
+          // 1) current channel was not treated
+          // 2) we are in the middle of the digi words and not at the end
+          while (new_channel_start == ptr || !is_channel_header_word(ptr) && ptr != end)
+            ++ptr;
+        }
+
+        // thread with rank 31 of the group will have the ptr pointing to the
+        // header word of the next channel or the end
+        int const offset_to_shuffle = ptr - start_ptr;
+
+        // always receive from the last guy in the group
+        auto const offset_for_rank31 = thread_group.shfl(offset_to_shuffle, NTHREADS - 1);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+        printf("rank = %d offset_to_shuffle = %d offset_for_rank32 = %d\n",
+               thread_group.thread_rank(),
+               offset_to_shuffle,
+               offset_for_rank31);
+#endif
+
+        // update the ptr for all threads of this group
+        // NOTE: relative to the start_ptr that is the same for all threads of
+        // this group
+        ptr = start_ptr + offset_for_rank31;
+      }
+    }
+
+    void entryPoint(InputDataCPU const& inputCPU,
+                    InputDataGPU& inputGPU,
+                    OutputDataGPU& outputGPU,
+                    ScratchDataGPU& scratchGPU,
+                    OutputDataCPU& outputCPU,
+                    ConditionsProducts const& conditions,
+                    ConfigurationParameters const& config,
+                    cudaStream_t cudaStream,
+                    uint32_t const nfedsWithData,
+                    uint32_t const nbytesTotal) {
+      // transfer
+      cudaCheck(cudaMemcpyAsync(inputGPU.data.get(),
+                                inputCPU.data.get(),
+                                nbytesTotal * sizeof(unsigned char),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
+      cudaCheck(cudaMemcpyAsync(inputGPU.offsets.get(),
+                                inputCPU.offsets.get(),
+                                nfedsWithData * sizeof(uint32_t),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
+      cudaCheck(
+          cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream));
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.feds.get(), inputCPU.feds.get(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+
+      // 12 is the max number of modules per crate
+      kernel_rawdecode_test<32><<<nfedsWithData, 12 * 32, 0, cudaStream>>>(inputGPU.data.get(),
+                                                                           inputGPU.offsets.get(),
+                                                                           inputGPU.feds.get(),
+                                                                           conditions.eMappingProduct.eid2did,
+                                                                           conditions.eMappingProduct.eid2tid,
+                                                                           outputGPU.digisF01HE.data.get(),
+                                                                           outputGPU.digisF01HE.ids.get(),
+                                                                           outputGPU.digisF5HB.data.get(),
+                                                                           outputGPU.digisF5HB.ids.get(),
+                                                                           outputGPU.digisF5HB.npresamples.get(),
+                                                                           outputGPU.digisF3HB.data.get(),
+                                                                           outputGPU.digisF3HB.ids.get(),
+                                                                           scratchGPU.pChannelsCounters.get(),
+                                                                           config.nsamplesF01HE,
+                                                                           config.nsamplesF5HB,
+                                                                           config.nsamplesF3HB,
+                                                                           nbytesTotal);
+      cudaCheck(cudaGetLastError());
+
+      cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.get(),
+                                scratchGPU.pChannelsCounters.get(),
+                                sizeof(uint32_t) * numOutputCollections,
+                                cudaMemcpyDeviceToHost,
+                                cudaStream));
+    }
+
+  }  // namespace raw
+}  // namespace hcal
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
new file mode 100644
index 0000000000000..3d5e4eec32269
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
@@ -0,0 +1,23 @@
+#ifndef EventFilter_HcalRawToDigi_interface_DecodeGPU_h
+#define EventFilter_HcalRawToDigi_interface_DecodeGPU_h
+
+#include "DeclsForKernels.h"
+
+namespace hcal {
+  namespace raw {
+
+    void entryPoint(InputDataCPU const&,
+                    InputDataGPU&,
+                    OutputDataGPU&,
+                    ScratchDataGPU&,
+                    OutputDataCPU&,
+                    ConditionsProducts const&,
+                    ConfigurationParameters const&,
+                    cudaStream_t cudaStream,
+                    uint32_t const,
+                    uint32_t const);
+
+  }
+}  // namespace hcal
+
+#endif  // EventFilter_HcalRawToDigi_interface_DecodeGPU_h
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
new file mode 100644
index 0000000000000..6b7b89cc6ea77
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
@@ -0,0 +1,63 @@
+#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "ElectronicsMappingGPU.h"
+
+namespace hcal {
+  namespace raw {
+
+    // TODO: 0x3FFFFF * 4B ~= 16MB
+    // tmp solution for linear mapping of eid -> did
+    ElectronicsMappingGPU::ElectronicsMappingGPU(HcalElectronicsMap const& mapping)
+        : eid2tid_(HcalElectronicsId::maxLinearIndex, 0u), eid2did_(HcalElectronicsId::maxLinearIndex, 0u) {
+      auto const& eidsPrecision = mapping.allElectronicsIdPrecision();
+      for (uint32_t i = 0; i < eidsPrecision.size(); ++i) {
+        auto const& eid = eidsPrecision[i];
+
+        // assign
+        eid2did_[eid.linearIndex()] = eid.isTriggerChainId() ? 0u : mapping.lookup(eid).rawId();
+      }
+
+      auto const& eidsTrigger = mapping.allElectronicsIdTrigger();
+      for (uint32_t i = 0; i < eidsTrigger.size(); i++) {
+        auto const& eid = eidsTrigger[i];
+
+        // assign
+        eid2tid_[eid.linearIndex()] = eid.isTriggerChainId() ? mapping.lookupTrigger(eid).rawId() : 0u;
+      }
+    }
+
+    ElectronicsMappingGPU::Product::~Product() {
+      // deallocation
+      cudaCheck(cudaFree(eid2did));
+      cudaCheck(cudaFree(eid2tid));
+    }
+
+    ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(cudaStream_t cudaStream) const {
+      auto const& product = product_.dataForCurrentDeviceAsync(
+          cudaStream, [this](ElectronicsMappingGPU::Product& product, cudaStream_t cudaStream) {
+            // malloc
+            cudaCheck(cudaMalloc((void**)&product.eid2did, this->eid2did_.size() * sizeof(uint32_t)));
+            cudaCheck(cudaMalloc((void**)&product.eid2tid, this->eid2tid_.size() * sizeof(uint32_t)));
+
+            // transfer
+            cudaCheck(cudaMemcpyAsync(product.eid2did,
+                                      this->eid2did_.data(),
+                                      this->eid2did_.size() * sizeof(uint32_t),
+                                      cudaMemcpyHostToDevice,
+                                      cudaStream));
+            cudaCheck(cudaMemcpyAsync(product.eid2tid,
+                                      this->eid2tid_.data(),
+                                      this->eid2tid_.size() * sizeof(uint32_t),
+                                      cudaMemcpyHostToDevice,
+                                      cudaStream));
+          });
+
+      return product;
+    }
+
+  }  // namespace raw
+}  // namespace hcal
+
+TYPELOOKUP_DATA_REG(hcal::raw::ElectronicsMappingGPU);
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
new file mode 100644
index 0000000000000..0f4c12f02a92d
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
@@ -0,0 +1,48 @@
+#ifndef EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
+#define EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
+
+#include "CondFormats/HcalObjects/interface/HcalElectronicsMap.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#endif
+
+namespace hcal {
+  namespace raw {
+
+    class ElectronicsMappingGPU {
+    public:
+      struct Product {
+        ~Product();
+        // trigger
+        uint32_t *eid2tid;
+        // detector
+        uint32_t *eid2did;
+      };
+
+#ifndef __CUDACC__
+
+      // rearrange pedestals
+      ElectronicsMappingGPU(HcalElectronicsMap const &);
+
+      // will call dealloation for Product thru ~Product
+      ~ElectronicsMappingGPU() = default;
+
+      // get device pointers
+      Product const &getProduct(cudaStream_t) const;
+
+    private:
+      // in the future, we need to arrange so to avoid this copy on the host
+      // if possible
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> eid2tid_;
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> eid2did_;
+
+      cms::cuda::ESProduct<Product> product_;
+#endif
+    };
+
+  }  // namespace raw
+}  // namespace hcal
+
+#endif  // EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
new file mode 100644
index 0000000000000..c2b67a10afaff
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
@@ -0,0 +1,117 @@
+#include <iostream>
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+class HcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalCPUDigisProducer(edm::ParameterSet const& ps);
+  ~HcalCPUDigisProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef01> digisF01HETokenIn_;
+  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef5> digisF5HBTokenIn_;
+  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef3> digisF3HBTokenIn_;
+
+  using OProductTypef01 =
+      hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductTypef01> digisF01HETokenOut_;
+  using OProductTypef5 =
+      hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductTypef5> digisF5HBTokenOut_;
+  using OProductTypef3 =
+      hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductTypef3> digisF3HBTokenOut_;
+
+  // needed to pass data from acquire to produce
+  OProductTypef01 digisf01HE_;
+  OProductTypef5 digisf5HB_;
+  OProductTypef3 digisf3HB_;
+};
+
+void HcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("digisLabelF01HEIn", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"});
+  desc.add<edm::InputTag>("digisLabelF5HBIn", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"});
+  desc.add<edm::InputTag>("digisLabelF3HBIn", edm::InputTag{"hcalRawToDigiGPU", "f3HBDigisGPU"});
+  desc.add<std::string>("digisLabelF01HEOut", "f01HEDigis");
+  desc.add<std::string>("digisLabelF5HBOut", "f5HBDigis");
+  desc.add<std::string>("digisLabelF3HBOut", "f3HBDigis");
+
+  confDesc.addWithDefaultLabel(desc);
+}
+
+HcalCPUDigisProducer::HcalCPUDigisProducer(const edm::ParameterSet& ps)
+    : digisF01HETokenIn_{consumes<IProductTypef01>(ps.getParameter<edm::InputTag>("digisLabelF01HEIn"))},
+      digisF5HBTokenIn_{consumes<IProductTypef5>(ps.getParameter<edm::InputTag>("digisLabelF5HBIn"))},
+      digisF3HBTokenIn_{consumes<IProductTypef3>(ps.getParameter<edm::InputTag>("digisLabelF3HBIn"))},
+      digisF01HETokenOut_{produces<OProductTypef01>(ps.getParameter<std::string>("digisLabelF01HEOut"))},
+      digisF5HBTokenOut_{produces<OProductTypef5>(ps.getParameter<std::string>("digisLabelF5HBOut"))},
+      digisF3HBTokenOut_{produces<OProductTypef3>(ps.getParameter<std::string>("digisLabelF3HBOut"))} {}
+
+HcalCPUDigisProducer::~HcalCPUDigisProducer() {}
+
+void HcalCPUDigisProducer::acquire(edm::Event const& event,
+                                   edm::EventSetup const& setup,
+                                   edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve data/ctx
+  auto const& f01HEProduct = event.get(digisF01HETokenIn_);
+  auto const& f5HBProduct = event.get(digisF5HBTokenIn_);
+  auto const& f3HBProduct = event.get(digisF3HBTokenIn_);
+  cms::cuda::ScopedContextAcquire ctx{f01HEProduct, std::move(taskHolder)};
+  auto const& f01HEDigis = ctx.get(f01HEProduct);
+  auto const& f5HBDigis = ctx.get(f5HBProduct);
+  auto const& f3HBDigis = ctx.get(f3HBProduct);
+
+  // resize out tmp buffers
+  digisf01HE_.stride = f01HEDigis.stride;
+  digisf5HB_.stride = f5HBDigis.stride;
+  digisf3HB_.stride = f3HBDigis.stride;
+  digisf01HE_.resize(f01HEDigis.size);
+  digisf5HB_.resize(f5HBDigis.size);
+  digisf3HB_.resize(f3HBDigis.size);
+
+  auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
+    using vector_type = typename std::remove_reference<decltype(dest)>::type;
+    using type = typename vector_type::value_type;
+    using src_data_type = typename std::remove_pointer<decltype(src)>::type;
+    static_assert(std::is_same<src_data_type, type>::value && "Dest and Src data types do not match");
+    cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
+  };
+
+  lambdaToTransfer(digisf01HE_.data, f01HEDigis.data.get());
+  lambdaToTransfer(digisf01HE_.ids, f01HEDigis.ids.get());
+
+  lambdaToTransfer(digisf5HB_.data, f5HBDigis.data.get());
+  lambdaToTransfer(digisf5HB_.ids, f5HBDigis.ids.get());
+  lambdaToTransfer(digisf5HB_.npresamples, f5HBDigis.npresamples.get());
+
+  lambdaToTransfer(digisf3HB_.data, f3HBDigis.data.get());
+  lambdaToTransfer(digisf3HB_.ids, f3HBDigis.ids.get());
+}
+
+void HcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  event.emplace(digisF01HETokenOut_, std::move(digisf01HE_));
+  event.emplace(digisF5HBTokenOut_, std::move(digisf5HB_));
+  event.emplace(digisF3HBTokenOut_, std::move(digisf3HB_));
+}
+
+DEFINE_FWK_MODULE(HcalCPUDigisProducer);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
new file mode 100644
index 0000000000000..9ca33340f7036
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -0,0 +1,235 @@
+#include <iostream>
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+class HcalDigisProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalDigisProducerGPU(edm::ParameterSet const& ps);
+  ~HcalDigisProducerGPU() override = default;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  // input product tokens
+  edm::EDGetTokenT<HBHEDigiCollection> hbheDigiToken_;
+  edm::EDGetTokenT<QIE11DigiCollection> qie11DigiToken_;
+
+  // type aliases
+  using HostCollectionf01 =
+      hcal::DigiCollection<hcal::Flavor1, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf01 = hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>;
+  using HostCollectionf5 =
+      hcal::DigiCollection<hcal::Flavor5, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf5 = hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>;
+  using HostCollectionf3 =
+      hcal::DigiCollection<hcal::Flavor3, calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  using DeviceCollectionf3 = hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>;
+
+  // output product tokens
+  using ProductTypef01 = cms::cuda::Product<DeviceCollectionf01>;
+  edm::EDPutTokenT<ProductTypef01> digisF01HEToken_;
+  using ProductTypef5 = cms::cuda::Product<DeviceCollectionf5>;
+  edm::EDPutTokenT<ProductTypef5> digisF5HBToken_;
+  using ProductTypef3 = cms::cuda::Product<DeviceCollectionf3>;
+  edm::EDPutTokenT<ProductTypef3> digisF3HBToken_;
+
+  cms::cuda::ContextState cudaState_;
+
+  struct ConfigParameters {
+    uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB;
+  };
+  ConfigParameters config_;
+
+  // per event host buffers
+  HostCollectionf01 hf01_;
+  HostCollectionf5 hf5_;
+  HostCollectionf3 hf3_;
+
+  // device products: product owns memory (i.e. not the module)
+  DeviceCollectionf01 df01_;
+  DeviceCollectionf5 df5_;
+  DeviceCollectionf3 df3_;
+};
+
+void HcalDigisProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+
+  // FIXME
+  desc.add<edm::InputTag>("hbheDigisLabel", edm::InputTag("hcalDigis"));
+  desc.add<edm::InputTag>("qie11DigiLabel", edm::InputTag("hcalDigis"));
+  desc.add<std::string>("digisLabelF01HE", std::string{"f01HEDigisGPU"});
+  desc.add<std::string>("digisLabelF5HB", std::string{"f5HBDigisGPU"});
+  desc.add<std::string>("digisLabelF3HB", std::string{"f3HBDigisGPU"});
+  desc.add<uint32_t>("maxChannelsF01HE", 10000u);
+  desc.add<uint32_t>("maxChannelsF5HB", 10000u);
+  desc.add<uint32_t>("maxChannelsF3HB", 10000u);
+
+  confDesc.addWithDefaultLabel(desc);
+}
+
+HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
+    : hbheDigiToken_{consumes<HBHEDigiCollection>(ps.getParameter<edm::InputTag>("hbheDigisLabel"))},
+      qie11DigiToken_{consumes<QIE11DigiCollection>(ps.getParameter<edm::InputTag>("qie11DigiLabel"))},
+      digisF01HEToken_{produces<ProductTypef01>(ps.getParameter<std::string>("digisLabelF01HE"))},
+      digisF5HBToken_{produces<ProductTypef5>(ps.getParameter<std::string>("digisLabelF5HB"))},
+      digisF3HBToken_{produces<ProductTypef3>(ps.getParameter<std::string>("digisLabelF3HB"))} {
+  config_.maxChannelsF01HE = ps.getParameter<uint32_t>("maxChannelsF01HE");
+  config_.maxChannelsF5HB = ps.getParameter<uint32_t>("maxChannelsF5HB");
+  config_.maxChannelsF3HB = ps.getParameter<uint32_t>("maxChannelsF3HB");
+
+  // this is a preallocation for the max statically known number of time samples
+  // actual stride/nsamples will be inferred from data
+  hf01_.stride = hcal::compute_stride<hcal::Flavor1>(QIE11DigiCollection::MAXSAMPLES);
+  hf5_.stride = hcal::compute_stride<hcal::Flavor5>(HBHEDataFrame::MAXSAMPLES);
+  hf3_.stride = hcal::compute_stride<hcal::Flavor3>(QIE11DigiCollection::MAXSAMPLES);
+
+  // preallocate pinned host memory only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    hf01_.reserve(config_.maxChannelsF01HE);
+    hf5_.reserve(config_.maxChannelsF5HB);
+    hf3_.reserve(config_.maxChannelsF3HB);
+  }
+}
+
+void HcalDigisProducerGPU::acquire(edm::Event const& event,
+                                   edm::EventSetup const& setup,
+                                   edm::WaitingTaskWithArenaHolder holder) {
+  // raii
+  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
+
+  // clear host buffers
+  hf01_.clear();
+  hf5_.clear();
+  hf3_.clear();
+
+  // event data
+  edm::Handle<HBHEDigiCollection> hbheDigis;
+  edm::Handle<QIE11DigiCollection> qie11Digis;
+  event.getByToken(hbheDigiToken_, hbheDigis);
+  event.getByToken(qie11DigiToken_, qie11Digis);
+
+  // init f5 collection
+  if (not hbheDigis->empty()) {
+    auto const nsamples = (*hbheDigis)[0].size();
+    auto const stride = hcal::compute_stride<hcal::Flavor5>(nsamples);
+    hf5_.stride = stride;
+
+    // flavor5 get device blobs
+    df5_.stride = stride;
+    df5_.data = cms::cuda::make_device_unique<uint16_t[]>(config_.maxChannelsF5HB * stride, ctx.stream());
+    df5_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF5HB, ctx.stream());
+    df5_.npresamples = cms::cuda::make_device_unique<uint8_t[]>(config_.maxChannelsF5HB, ctx.stream());
+  }
+
+  if (not qie11Digis->empty()) {
+    auto const nsamples = qie11Digis->samples();
+    auto const stride01 = hcal::compute_stride<hcal::Flavor1>(nsamples);
+    auto const stride3 = hcal::compute_stride<hcal::Flavor3>(nsamples);
+
+    hf01_.stride = stride01;
+    hf3_.stride = stride3;
+
+    // flavor 0/1 get devie blobs
+    df01_.stride = stride01;
+    df01_.data = cms::cuda::make_device_unique<uint16_t[]>(config_.maxChannelsF01HE * stride01, ctx.stream());
+    df01_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF01HE, ctx.stream());
+
+    // flavor3 get device blobs
+    df3_.stride = stride3;
+    df3_.data = cms::cuda::make_device_unique<uint16_t[]>(config_.maxChannelsF3HB * stride3, ctx.stream());
+    df3_.ids = cms::cuda::make_device_unique<uint32_t[]>(config_.maxChannelsF3HB, ctx.stream());
+  }
+
+  for (auto const& hbhe : *hbheDigis) {
+    auto const id = hbhe.id().rawId();
+    auto const presamples = hbhe.presamples();
+    hf5_.ids.push_back(id);
+    hf5_.npresamples.push_back(presamples);
+    auto const stride = hcal::compute_stride<hcal::Flavor5>(hbhe.size());
+    assert(stride == hf5_.stride && "strides must be the same for every single digi of the collection");
+    // simple for now...
+    static_assert(hcal::Flavor5::HEADER_WORDS == 1);
+    uint16_t header_word = (1 << 15) | (0x5 << 12) | (0 << 10) | ((hbhe.sample(0).capid() & 0x3) << 8);
+    hf5_.data.push_back(header_word);
+    for (unsigned int i = 0; i < stride - hcal::Flavor5::HEADER_WORDS; i++) {
+      uint16_t s0 = (0 << 7) | (static_cast<uint8_t>(hbhe.sample(2 * i).adc()) & 0x7f);
+      uint16_t s1 = (0 << 7) | (static_cast<uint8_t>(hbhe.sample(2 * i + 1).adc()) & 0x7f);
+      uint16_t sample = (s1 << 8) | s0;
+      hf5_.data.push_back(sample);
+    }
+  }
+
+  for (unsigned int i = 0; i < qie11Digis->size(); i++) {
+    auto const& digi = QIE11DataFrame{(*qie11Digis)[i]};
+    assert(digi.samples() == qie11Digis->samples() && "collection nsamples must equal per digi samples");
+    if (digi.flavor() == 0 or digi.flavor() == 1) {
+      if (digi.detid().subdetId() != HcalEndcap)
+        continue;
+      auto const id = digi.detid().rawId();
+      hf01_.ids.push_back(id);
+      for (int hw = 0; hw < hcal::Flavor1::HEADER_WORDS; hw++)
+        hf01_.data.push_back((*qie11Digis)[i][hw]);
+      for (int sample = 0; sample < digi.samples(); sample++) {
+        hf01_.data.push_back((*qie11Digis)[i][hcal::Flavor1::HEADER_WORDS + sample]);
+      }
+    } else if (digi.flavor() == 3) {
+      if (digi.detid().subdetId() != HcalBarrel)
+        continue;
+      auto const id = digi.detid().rawId();
+      hf3_.ids.push_back(id);
+      for (int hw = 0; hw < hcal::Flavor3::HEADER_WORDS; hw++)
+        hf3_.data.push_back((*qie11Digis)[i][hw]);
+      for (int sample = 0; sample < digi.samples(); sample++) {
+        hf3_.data.push_back((*qie11Digis)[i][hcal::Flavor3::HEADER_WORDS + sample]);
+      }
+    }
+  }
+
+  auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) {
+    if (src.empty())
+      return;
+    using vector_type = typename std::remove_reference<decltype(src)>::type;
+    using type = typename vector_type::value_type;
+    using dest_data_type = typename std::remove_pointer<decltype(dest)>::type;
+    static_assert(std::is_same<dest_data_type, type>::value && "Dest and Src data typesdo not match");
+    cudaCheck(cudaMemcpyAsync(dest, src.data(), src.size() * sizeof(type), cudaMemcpyHostToDevice, ctx.stream()));
+  };
+
+  lambdaToTransfer(df01_.data.get(), hf01_.data);
+  lambdaToTransfer(df01_.ids.get(), hf01_.ids);
+
+  lambdaToTransfer(df5_.data.get(), hf5_.data);
+  lambdaToTransfer(df5_.ids.get(), hf5_.ids);
+  lambdaToTransfer(df5_.npresamples.get(), hf5_.npresamples);
+
+  lambdaToTransfer(df3_.data.get(), hf3_.data);
+  lambdaToTransfer(df3_.ids.get(), hf3_.ids);
+
+  df01_.size = hf01_.ids.size();
+  df5_.size = hf5_.ids.size();
+  df3_.size = hf3_.ids.size();
+}
+
+void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+  ctx.emplace(event, digisF01HEToken_, std::move(df01_));
+  ctx.emplace(event, digisF5HBToken_, std::move(df5_));
+  ctx.emplace(event, digisF3HBToken_, std::move(df3_));
+}
+
+DEFINE_FWK_MODULE(HcalDigisProducerGPU);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
new file mode 100644
index 0000000000000..749a98e990755
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
@@ -0,0 +1,10 @@
+#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h"
+
+#include "ElectronicsMappingGPU.h"
+
+using HcalElectronicsMappingGPUESProducer =
+    ConvertingESProducerT<HcalElectronicsMapRcd, hcal::raw::ElectronicsMappingGPU, HcalElectronicsMap>;
+
+DEFINE_FWK_EVENTSETUP_MODULE(HcalElectronicsMappingGPUESProducer);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
new file mode 100644
index 0000000000000..7e8388a5f4d2f
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -0,0 +1,200 @@
+#include <iostream>
+
+#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include "DeclsForKernels.h"
+#include "DecodeGPU.h"
+#include "ElectronicsMappingGPU.h"
+
+class HcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalRawToDigiGPU(edm::ParameterSet const& ps);
+  ~HcalRawToDigiGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;
+  using ProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>;
+  edm::EDPutTokenT<ProductTypef01> digisF01HEToken_;
+  using ProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
+  edm::EDPutTokenT<ProductTypef5> digisF5HBToken_;
+  using ProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>;
+  edm::EDPutTokenT<ProductTypef3> digisF3HBToken_;
+
+  cms::cuda::ContextState cudaState_;
+
+  std::vector<int> fedsToUnpack_;
+
+  hcal::raw::ConfigurationParameters config_;
+  hcal::raw::OutputDataGPU outputGPU_;
+  hcal::raw::OutputDataCPU outputCPU_;
+};
+
+void HcalRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
+  auto nFeds = FEDNumbering::MAXHCALuTCAFEDID - FEDNumbering::MINHCALuTCAFEDID + 1;
+  std::vector<int> feds(nFeds);
+  for (int i = 0; i < nFeds; ++i)
+    feds[i] = i + FEDNumbering::MINHCALuTCAFEDID;
+  desc.add<std::vector<int>>("FEDs", feds);
+  desc.add<uint32_t>("maxChannelsF01HE", 10000u);
+  desc.add<uint32_t>("maxChannelsF5HB", 10000u);
+  desc.add<uint32_t>("maxChannelsF3HB", 10000u);
+  desc.add<uint32_t>("nsamplesF01HE", 8);
+  desc.add<uint32_t>("nsamplesF5HB", 8);
+  desc.add<uint32_t>("nsamplesF3HB", 8);
+  desc.add<std::string>("digisLabelF5HB", "f5HBDigisGPU");
+  desc.add<std::string>("digisLabelF01HE", "f01HEDigisGPU");
+  desc.add<std::string>("digisLabelF3HB", "f3HBDigisGPU");
+
+  std::string label = "hcalRawToDigiGPU";
+  confDesc.add(label, desc);
+}
+
+HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps)
+    : rawDataToken_{consumes<FEDRawDataCollection>(ps.getParameter<edm::InputTag>("InputLabel"))},
+      digisF01HEToken_{produces<ProductTypef01>(ps.getParameter<std::string>("digisLabelF01HE"))},
+      digisF5HBToken_{produces<ProductTypef5>(ps.getParameter<std::string>("digisLabelF5HB"))},
+      digisF3HBToken_{produces<ProductTypef3>(ps.getParameter<std::string>("digisLabelF3HB"))},
+      fedsToUnpack_{ps.getParameter<std::vector<int>>("FEDs")} {
+  config_.maxChannelsF01HE = ps.getParameter<uint32_t>("maxChannelsF01HE");
+  config_.maxChannelsF5HB = ps.getParameter<uint32_t>("maxChannelsF5HB");
+  config_.maxChannelsF3HB = ps.getParameter<uint32_t>("maxChannelsF3HB");
+  config_.nsamplesF01HE = ps.getParameter<uint32_t>("nsamplesF01HE");
+  config_.nsamplesF5HB = ps.getParameter<uint32_t>("nsamplesF5HB");
+  config_.nsamplesF3HB = ps.getParameter<uint32_t>("nsamplesF3HB");
+}
+
+HcalRawToDigiGPU::~HcalRawToDigiGPU() {}
+
+void HcalRawToDigiGPU::acquire(edm::Event const& event,
+                               edm::EventSetup const& setup,
+                               edm::WaitingTaskWithArenaHolder holder) {
+  // raii
+  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
+
+  // conditions
+  edm::ESHandle<hcal::raw::ElectronicsMappingGPU> eMappingHandle;
+  setup.get<HcalElectronicsMapRcd>().get(eMappingHandle);
+  auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream());
+
+  // bundle up conditions
+  hcal::raw::ConditionsProducts conditions{eMappingProduct};
+
+  // event data
+  edm::Handle<FEDRawDataCollection> rawDataHandle;
+  event.getByToken(rawDataToken_, rawDataHandle);
+
+  // scratch
+  hcal::raw::ScratchDataGPU scratchGPU = {
+      cms::cuda::make_device_unique<uint32_t[]>(hcal::raw::numOutputCollections, ctx.stream())};
+
+  // input cpu data
+  hcal::raw::InputDataCPU inputCPU = {cms::cuda::make_host_unique<unsigned char[]>(
+                                          hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max, ctx.stream()),
+                                      cms::cuda::make_host_unique<uint32_t[]>(hcal::raw::utca_nfeds_max, ctx.stream()),
+                                      cms::cuda::make_host_unique<int[]>(hcal::raw::utca_nfeds_max, ctx.stream())};
+
+  // input data gpu
+  hcal::raw::InputDataGPU inputGPU = {
+      cms::cuda::make_device_unique<unsigned char[]>(hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max,
+                                                     ctx.stream()),
+      cms::cuda::make_device_unique<uint32_t[]>(hcal::raw::utca_nfeds_max, ctx.stream()),
+      cms::cuda::make_device_unique<int[]>(hcal::raw::utca_nfeds_max, ctx.stream())};
+
+  // output cpu
+  outputCPU_ = {cms::cuda::make_host_unique<uint32_t[]>(hcal::raw::numOutputCollections, ctx.stream())};
+
+  // output gpu
+  outputGPU_.allocate(config_, ctx.stream());
+
+  // iterate over feds
+  // TODO: another idea
+  //   - loop over all feds to unpack and enqueue cuda memcpy
+  //   - accumulate the sizes
+  //   - after the loop launch cuda memcpy for sizes
+  //   - enqueue the kernel
+  uint32_t currentCummOffset = 0;
+  uint32_t counter = 0;
+  for (auto const& fed : fedsToUnpack_) {
+    auto const& data = rawDataHandle->FEDData(fed);
+    auto const nbytes = data.size();
+
+    // skip empty feds
+    if (nbytes < hcal::raw::empty_event_size)
+      continue;
+
+#ifdef HCAL_RAWDECODE_CPUDEBUG
+    printf("fed = %d nbytes = %lu\n", fed, nbytes);
+#endif
+
+    // copy raw data into plain buffer
+    std::memcpy(inputCPU.data.get() + currentCummOffset, data.data(), nbytes);
+    // set the offset in bytes from the start
+    inputCPU.offsets[counter] = currentCummOffset;
+    inputCPU.feds[counter] = fed;
+
+    // this is the current offset into the vector
+    currentCummOffset += nbytes;
+    ++counter;
+  }
+
+  hcal::raw::entryPoint(inputCPU,
+                        inputGPU,
+                        outputGPU_,
+                        scratchGPU,
+                        outputCPU_,
+                        conditions,
+                        config_,
+                        ctx.stream(),
+                        counter,
+                        currentCummOffset);
+}
+
+void HcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+#ifdef HCAL_RAWDECODE_CPUDEBUG
+  printf("f01he channels = %u f5hb channesl = %u\n",
+         outputCPU_.nchannels[hcal::raw::OutputF01HE],
+         outputCPU_.nchannels[hcal::raw::OutputF5HB]);
+#endif
+
+  // FIXME: use sizes of views directly for cuda mem cpy?
+  auto const nchannelsF01HE = outputCPU_.nchannels[hcal::raw::OutputF01HE];
+  auto const nchannelsF5HB = outputCPU_.nchannels[hcal::raw::OutputF5HB];
+  auto const nchannelsF3HB = outputCPU_.nchannels[hcal::raw::OutputF3HB];
+  outputGPU_.digisF01HE.size = nchannelsF01HE;
+  outputGPU_.digisF5HB.size = nchannelsF5HB;
+  outputGPU_.digisF3HB.size = nchannelsF3HB;
+  outputGPU_.digisF01HE.stride = hcal::compute_stride<hcal::Flavor1>(config_.nsamplesF01HE);
+  outputGPU_.digisF5HB.stride = hcal::compute_stride<hcal::Flavor5>(config_.nsamplesF5HB);
+  outputGPU_.digisF3HB.stride = hcal::compute_stride<hcal::Flavor3>(config_.nsamplesF3HB);
+
+  ctx.emplace(event, digisF01HEToken_, std::move(outputGPU_.digisF01HE));
+  ctx.emplace(event, digisF5HBToken_, std::move(outputGPU_.digisF5HB));
+  ctx.emplace(event, digisF3HBToken_, std::move(outputGPU_.digisF3HB));
+
+  // reset ptrs that are carried as members
+  outputCPU_.nchannels.reset();
+}
+
+DEFINE_FWK_MODULE(HcalRawToDigiGPU);
diff --git a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py
new file mode 100644
index 0000000000000..b3a2548791ae5
--- /dev/null
+++ b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py
@@ -0,0 +1,60 @@
+import FWCore.ParameterSet.Config as cms
+
+# Customise the HCAL-only reconstruction to run on GPU
+#
+# Currently, this means:
+#   - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU;
+#   - running the HBHE local reconstruction, including MAHI, on GPU.
+def customizeHcalOnlyForProfilingGPUOnly(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('hbheRecHitProducerGPU')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the HCAL-only reconstruction to run on GPU, and copy the data to the host
+#
+# Currently, this means:
+#   - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU;
+#   - running the HBHE local reconstruction, including MAHI, on GPU;
+#   - copying the rechits to CPU and converting them to legacy format.
+#
+# (this is equivalent to customizeHcalOnlyForProfiling, as the copy and conversion is done by the same module)
+def customizeHcalOnlyForProfilingGPUWithHostCopy(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('hbheprereco')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
+
+
+# Customise the HCAL-only reconstruction to run on GPU, copy the data to the host, and convert to legacy format
+#
+# Currently, this means:
+#   - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU;
+#   - running the HBHE local reconstruction, including MAHI, on GPU;
+#   - copying the rechits to CPU and converting them to legacy format.
+#
+# The same customisation can be also used on the CPU workflow, running up to the rechits on CPU.
+def customizeHcalOnlyForProfiling(process):
+
+  process.consumer = cms.EDAnalyzer("GenericConsumer",
+      eventProducts = cms.untracked.vstring('hbheprereco')
+  )
+
+  process.consume_step = cms.EndPath(process.consumer)
+
+  process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
+
+  return process
diff --git a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
index 70207e36ba654..fbb4c53f9f28b 100644
--- a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py
@@ -4,7 +4,18 @@
 hcalGlobalRecoTask = cms.Task(hbhereco)
 hcalGlobalRecoSequence = cms.Sequence(hcalGlobalRecoTask)
 
+#--- for Run 3 and later
+from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
+
 from RecoLocalCalo.HcalRecProducers.HBHEPhase1Reconstructor_cfi import hbheprereco as _phase1_hbheprereco
+run3_HB.toReplaceWith(hbhereco, _phase1_hbheprereco)
 
-from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
-run3_HB.toReplaceWith( hbhereco, _phase1_hbheprereco ) # >=Run3
+#--- for Run 3 on GPU
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer
+gpu.toReplaceWith(hbhereco, _hcalCPURecHitsProducer.clone(
+    recHitsM0LabelIn = "hbheRecHitProducerGPU",
+    recHitsM0LabelOut = "",
+    recHitsLegacyLabelOut = ""
+))
diff --git a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
index 84e7498b7eb2d..a7bdce3b916af 100644
--- a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
+++ b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py
@@ -16,20 +16,17 @@
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_ho_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_hf_cfi import *
 from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_zdc_cfi import *
-hcalLocalRecoTask = cms.Task(hbheprereco,hfreco,horeco,zdcreco)
+hcalLocalRecoTask = cms.Task(hbheprereco, hfreco, horeco, zdcreco)
 hcalLocalRecoSequence = cms.Sequence(hcalLocalRecoTask)
 
 from RecoLocalCalo.HcalRecProducers.hfprereco_cfi import hfprereco
 from RecoLocalCalo.HcalRecProducers.HFPhase1Reconstructor_cfi import hfreco as _phase1_hfreco
 from RecoLocalCalo.HcalRecProducers.hbheplan1_cfi import hbheplan1
 
-#--- for HCALonly wf
-hcalOnlyLocalRecoTask = cms.Task(hbheprereco,hfprereco,hfreco,horeco)
-
-# copy for cosmics
+#--- for cosmics
 _default_hfreco = hfreco.clone()
 
-#--- Phase1 
+#--- for Phase 1
 _phase1_hcalLocalRecoTask = hcalLocalRecoTask.copy()
 _phase1_hcalLocalRecoTask.add(hfprereco)
 
@@ -37,7 +34,7 @@
 run2_HF_2017.toReplaceWith( hcalLocalRecoTask, _phase1_hcalLocalRecoTask )
 run2_HF_2017.toReplaceWith( hfreco, _phase1_hfreco )
 from Configuration.Eras.Modifier_run2_HCAL_2017_cff import run2_HCAL_2017
-run2_HCAL_2017.toReplaceWith( hbheprereco, _phase1_hbheprereco )
+run2_HCAL_2017.toReplaceWith(hbheprereco, _phase1_hbheprereco)
 
 _plan1_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
 _plan1_hcalLocalRecoTask.add(hbheplan1)
@@ -50,12 +47,39 @@
 from Configuration.ProcessModifiers.run2_HECollapse_2018_cff import run2_HECollapse_2018
 run2_HECollapse_2018.toReplaceWith(hcalLocalRecoTask, _collapse_hcalLocalRecoTask)
 
-#--- from >=Run3
+#--- for Run 3 and later
 _run3_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy()
 _run3_hcalLocalRecoTask.remove(hbheprereco)
 from Configuration.Eras.Modifier_run3_HB_cff import run3_HB
-run3_HB.toReplaceWith( hcalLocalRecoTask, _run3_hcalLocalRecoTask )
+run3_HB.toReplaceWith(hcalLocalRecoTask, _run3_hcalLocalRecoTask)
+
+#--- for Run 3 on GPU
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPUTask_cff import *
+_run3_hcalLocalRecoGPUTask = _run3_hcalLocalRecoTask.copy()
+_run3_hcalLocalRecoGPUTask.add(hbheRecHitProducerGPUTask)
+gpu.toReplaceWith(hcalLocalRecoTask, _run3_hcalLocalRecoGPUTask)
+
+#--- HCAL-only workflow for Run 3
+# FIXME rename `hbheprereco` to `hbhereco` and use it from hcalGlobalRecoTask
+hcalOnlyLocalRecoTask = cms.Task(hbheprereco, hfprereco, hfreco, horeco)
+
+#--- HCAL-only workflow for Run 3 on GPU
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+_hcalOnlyLocalRecoGPUTask = hcalOnlyLocalRecoTask.copy()
+_hcalOnlyLocalRecoGPUTask.add(hbheRecHitProducerGPUTask)
+gpu.toReplaceWith(hcalOnlyLocalRecoTask, _hcalOnlyLocalRecoGPUTask)
+
+from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer
+gpu.toReplaceWith(hbheprereco, _hcalCPURecHitsProducer.clone(
+    recHitsM0LabelIn = "hbheRecHitProducerGPU",
+    recHitsM0LabelOut = "",
+    recHitsLegacyLabelOut = ""
+))
 
+#--- for FastSim
 _fastSim_hcalLocalRecoTask = hcalLocalRecoTask.copyAndExclude([zdcreco])
 from Configuration.Eras.Modifier_fastSim_cff import fastSim
 fastSim.toReplaceWith( hcalLocalRecoTask, _fastSim_hcalLocalRecoTask )
diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
index ac94d61e12494..2c2ee20aff7d5 100644
--- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml
@@ -1,19 +1,26 @@
 <use name="boost"/>
-<use name="eigen"/>
 <use name="clhep"/>
+<use name="cuda"/>
+<use name="eigen"/>
+<use name="rootminuit2"/>
+<use name="vdt_headers"/>
+<use name="CUDADataFormats/Common" />
+<use name="CUDADataFormats/HcalDigi"/>
+<use name="CUDADataFormats/HcalRecHitSoA" />
+<use name="CalibCalorimetry/HcalAlgos"/>
+<use name="CalibFormats/CaloObjects"/>
+<use name="CalibFormats/HcalObjects"/>
 <use name="DataFormats/HcalDigi"/>
 <use name="DataFormats/HcalRecHit"/>
 <use name="DataFormats/TrackReco"/>
-<use name="CalibFormats/HcalObjects"/>
-<use name="CalibFormats/CaloObjects"/>
-<use name="CalibCalorimetry/HcalAlgos"/>
 <use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
 <use name="FWCore/MessageLogger"/>
-<use name="CondFormats/DataRecord"/>
+<use name="FWCore/ParameterSet"/>
+<use name="Geometry/CaloTopology"/>
+<use name="Geometry/HcalCommonData"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="RecoLocalCalo/EcalRecAlgos"/>
-<use name="vdt_headers"/>
-<use name="rootminuit2"/>
 <export>
   <lib name="1"/>
 </export>
diff --git a/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml
new file mode 100644
index 0000000000000..30f76131622bd
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml
@@ -0,0 +1,2 @@
+<bin name="generateQIEShapes" file="generateQIEShapes.cc">
+</bin>
diff --git a/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc b/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc
new file mode 100644
index 0000000000000..39d5195e7438d
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc
@@ -0,0 +1,82 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+//
+// Pregenerate QIE Shapes using hardcoded arrays
+// This is taken directly from CondFormats/HcalObjects/srcHcalQIEData.cc
+// This generation is running upon conditions retrieval typically for the cpu workload
+//
+// For the GPU workload, it is better to put generated values into constant memory.
+// Either this or just use global memory (for global mem, we need getters...).
+// Choosign constant memory as thsese
+// values are statically known and never change. Any change in any case requires
+// recompilation!
+//
+
+const float binMin[32] = {-1, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+                          16, 18, 20, 22, 24, 26, 28, 31, 34, 37, 40, 44, 48, 52, 57, 62};
+
+const float binMin2[64] = {-0.5,  0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,   9.5,
+                           10.5,  11.5,  12.5,  13.5,  14.5,  // 16 bins with width 1x
+                           15.5,  17.5,  19.5,  21.5,  23.5,  25.5,  27.5,  29.5,  31.5,  33.5,  35.5,
+                           37.5,  39.5,  41.5,  43.5,  45.5,  47.5,  49.5,  51.5,  53.5,  // 20 bins with width 2x
+                           55.5,  59.5,  63.5,  67.5,  71.5,  75.5,  79.5,  83.5,  87.5,  91.5,  95.5,
+                           99.5,  103.5, 107.5, 111.5, 115.5, 119.5, 123.5, 127.5, 131.5, 135.5,  // 21 bins with width 4x
+                           139.5, 147.5, 155.5, 163.5, 171.5, 179.5, 187.5};  // 7 bins with width 8x
+
+constexpr uint32_t nbins_qie8 = 32;
+constexpr uint32_t nbins_qie11 = 64;
+
+void dump(std::vector<float> const& vec, std::string const& name) {
+  std::stringstream str;
+  str << "float const " << name << "[" << vec.size() << "] = {";
+  uint32_t counter = 0;
+  for (auto const& value : vec) {
+    if (counter % 8 == 0)
+      str << std::endl;
+    if (counter == vec.size() - 1)
+      str << value;
+    else
+      str << value << ", ";
+    counter++;
+  }
+  str << "};";
+  std::cout << str.str() << std::endl;
+}
+
+void generate(uint32_t const nbins, float const* initValues, std::vector<float>& values) {
+  // preset the first range
+  for (uint32_t adc = 0; adc < nbins; adc++)
+    values[adc] = initValues[adc];
+
+  // do the rest
+  int scale = 1;
+  for (uint32_t range = 1; range < 4; range++) {
+    int factor = nbins == 32 ? 5 : 8;
+    scale *= factor;
+
+    auto const index_offset = range * nbins;
+    uint32_t const overlap = nbins == 32 ? 2 : 3;
+    values[index_offset] = values[index_offset - overlap];
+
+    for (uint32_t i = 1; i < nbins; i++)
+      values[index_offset + i] = values[index_offset + i - 1] + scale * (values[i] - values[i - 1]);
+  }
+
+  values[nbins * 4] = 2 * values[nbins * 4 - 1] - values[nbins * 4 - 2];
+}
+
+int main(int argc, char* argv[]) {
+  //
+  // run 128 bins
+  //
+  std::vector<float> valuesqie8(nbins_qie8 * 4 + 1), valuesqie11(nbins_qie11 * 4 + 1);
+  generate(nbins_qie8, binMin, valuesqie8);
+  generate(nbins_qie11, binMin2, valuesqie11);
+
+  dump(valuesqie8, std::string{"qie8shape"});
+  dump(valuesqie11, std::string{"qie11shape"});
+
+  return 0;
+}
diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
new file mode 100644
index 0000000000000..98ce9c0b660f1
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h
@@ -0,0 +1,37 @@
+#ifndef RecoLocalCalo_HcalRecAlgos_interface_HcalMahiPulseOffsetsGPU_h
+#define RecoLocalCalo_HcalRecAlgos_interface_HcalMahiPulseOffsetsGPU_h
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+class HcalMahiPulseOffsetsGPU {
+public:
+  struct Product {
+    ~Product();
+    int* values;
+  };
+
+#ifndef __CUDACC__
+  // rearrange reco params
+  HcalMahiPulseOffsetsGPU(edm::ParameterSet const&);
+
+  // will trigger deallocation of Product thru ~Product
+  ~HcalMahiPulseOffsetsGPU() = default;
+
+  std::vector<int, cms::cuda::HostAllocator<int>> const& getValues() const { return values_; }
+
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+
+private:
+  std::vector<int, cms::cuda::HostAllocator<int>> values_;
+
+  cms::cuda::ESProduct<Product> product_;
+#endif
+};
+
+#endif
diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
new file mode 100644
index 0000000000000..965fb873bcf88
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h
@@ -0,0 +1,54 @@
+#ifndef RecoLocalCalo_HcalRecAlgos_interface_HcalRecoParamsWithPulseShapesGPU_h
+#define RecoLocalCalo_HcalRecAlgos_interface_HcalRecoParamsWithPulseShapesGPU_h
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+class HcalRecoParams;
+
+//
+// TODO: HcalPulseShapes will need to be used via ESSource
+// This is a workaround: precompute/store/transfer what's needed only
+//
+class HcalRecoParamsWithPulseShapesGPU {
+public:
+  struct Product {
+    ~Product();
+    uint32_t *param1 = nullptr, *param2 = nullptr;
+    uint32_t *ids = nullptr;
+
+    // These guys come directly from PulseShapeFunctor class
+    float *acc25nsVec = nullptr, *diff25nsItvlVec = nullptr, *accVarLenIdxMinusOneVec = nullptr,
+          *diffVarItvlIdxMinusOneVec = nullptr, *accVarLenIdxZEROVec = nullptr, *diffVarItvlIdxZEROVec = nullptr;
+  };
+
+#ifndef __CUDACC__
+  // rearrange reco params
+  HcalRecoParamsWithPulseShapesGPU(HcalRecoParams const &);
+
+  // will trigger deallocation of Product thru ~Product
+  ~HcalRecoParamsWithPulseShapesGPU() = default;
+
+  // get device pointers
+  Product const &getProduct(cudaStream_t) const;
+
+private:
+  uint64_t totalChannels_;  // hb + he
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> param1_;
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> param2_;
+  std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> ids_;
+
+  std::vector<float, cms::cuda::HostAllocator<float>> acc25nsVec_;                 // 256
+  std::vector<float, cms::cuda::HostAllocator<float>> diff25nsItvlVec_;            // 256
+  std::vector<float, cms::cuda::HostAllocator<float>> accVarLenIdxMinusOneVec_;    // 25
+  std::vector<float, cms::cuda::HostAllocator<float>> diffVarItvlIdxMinusOneVec_;  // 25
+  std::vector<float, cms::cuda::HostAllocator<float>> accVarLenIdxZEROVec_;        // 25
+  std::vector<float, cms::cuda::HostAllocator<float>> diffVarItvlIdxZEROVec_;      // 25
+
+  cms::cuda::ESProduct<Product> product_;
+#endif
+};
+
+#endif
diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
new file mode 100644
index 0000000000000..005a77932f6df
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc
@@ -0,0 +1,35 @@
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// FIXME: add proper getters to conditions
+HcalMahiPulseOffsetsGPU::HcalMahiPulseOffsetsGPU(edm::ParameterSet const& ps) {
+  auto const& values = ps.getParameter<std::vector<int>>("pulseOffsets");
+  values_.resize(values.size());
+  std::copy(values.begin(), values.end(), values_.begin());
+}
+
+HcalMahiPulseOffsetsGPU::Product::~Product() {
+  // deallocation
+  cudaCheck(cudaFree(values));
+}
+
+HcalMahiPulseOffsetsGPU::Product const& HcalMahiPulseOffsetsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](HcalMahiPulseOffsetsGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(cudaMalloc((void**)&product.values, this->values_.size() * sizeof(int)));
+
+        // transfer
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->values_.data(),
+                                  this->values_.size() * sizeof(int),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(HcalMahiPulseOffsetsGPU);
diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
new file mode 100644
index 0000000000000..b42621b98908e
--- /dev/null
+++ b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc
@@ -0,0 +1,222 @@
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+
+#include "CondFormats/HcalObjects/interface/HcalRecoParams.h"
+#include "CalibCalorimetry/HcalAlgos/interface/HcalPulseShapes.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include <unordered_map>
+
+// FIXME: add proper getters to conditions
+HcalRecoParamsWithPulseShapesGPU::HcalRecoParamsWithPulseShapesGPU(HcalRecoParams const& recoParams)
+    : totalChannels_{recoParams.getAllContainers()[0].second.size() + recoParams.getAllContainers()[1].second.size()},
+      param1_(totalChannels_),
+      param2_(totalChannels_),
+      ids_(totalChannels_) {
+#ifdef HCAL_MAHI_CPUDEBUG
+  printf("hello from a reco params with pulse shapes\n");
+#endif
+
+  auto const containers = recoParams.getAllContainers();
+
+  HcalPulseShapes pulseShapes;
+  std::unordered_map<unsigned int, uint32_t> idCache;
+
+  // fill in eb
+  auto const& barrelValues = containers[0].second;
+  for (uint64_t i = 0; i < barrelValues.size(); ++i) {
+    param1_[i] = barrelValues[i].param1();
+    param2_[i] = barrelValues[i].param2();
+
+    auto const pulseShapeId = barrelValues[i].pulseShapeID();
+    // FIXME: 0 throws upon look up to HcalPulseShapes
+    // although comments state that 0 is reserved,
+    // HcalPulseShapes::getShape throws on 0!
+    if (pulseShapeId == 0) {
+      ids_[i] = 0;
+      continue;
+    }
+    if (auto const iter = idCache.find(pulseShapeId); iter == idCache.end()) {
+      // new guy
+      auto const newId = idCache.size();
+      idCache[pulseShapeId] = newId;
+      // this will be the id
+      ids_[i] = newId;
+
+      // resize value arrays
+      acc25nsVec_.resize(acc25nsVec_.size() + hcal::constants::maxPSshapeBin);
+      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + hcal::constants::maxPSshapeBin);
+      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + hcal::constants::nsPerBX);
+
+      // precompute and get values from the functor
+      auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
+      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, hcal::constants::maxSamples};
+      auto const offset256 = newId * hcal::constants::maxPSshapeBin;
+      auto const offset25 = newId * hcal::constants::nsPerBX;
+      auto const numShapes = newId;
+      for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) {
+        acc25nsVec_[offset256 * numShapes + i] = functor.acc25nsVec()[i];
+        diff25nsItvlVec_[offset256 * numShapes + i] = functor.diff25nsItvlVec()[i];
+      }
+
+      for (int i = 0; i < hcal::constants::nsPerBX; i++) {
+        accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.accVarLenIdxMinusOneVec()[i];
+        diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxMinusOneVec()[i];
+        accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.accVarLenIdxZEROVec()[i];
+        diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxZEROVec()[i];
+      }
+    } else {
+      // already recorded this pulse shape, just set id
+      ids_[i] = iter->second;
+    }
+#ifdef HCAL_MAHI_CPUDEBUG
+    if (barrelValues[i].rawId() == DETID_TO_DEBUG) {
+      printf("recoShapeId = %u myid = %u\n", pulseShapeId, ids_[i]);
+    }
+#endif
+  }
+
+  // fill in ee
+  auto const& endcapValues = containers[1].second;
+  auto const offset = barrelValues.size();
+  for (uint64_t i = 0; i < endcapValues.size(); ++i) {
+    param1_[i + offset] = endcapValues[i].param1();
+    param2_[i + offset] = endcapValues[i].param2();
+
+    auto const pulseShapeId = endcapValues[i].pulseShapeID();
+    // FIXME: 0 throws upon look up to HcalPulseShapes
+    // although comments state that 0 is reserved,
+    // HcalPulseShapes::getShape throws on 0!
+    if (pulseShapeId == 0) {
+      ids_[i + offset] = 0;
+      continue;
+    }
+    if (auto const iter = idCache.find(pulseShapeId); iter == idCache.end()) {
+      // new guy
+      auto const newId = idCache.size();
+      idCache[pulseShapeId] = newId;
+      // this will be the id
+      ids_[i + offset] = newId;
+
+      // resize value arrays
+      acc25nsVec_.resize(acc25nsVec_.size() + hcal::constants::maxPSshapeBin);
+      diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + hcal::constants::maxPSshapeBin);
+      accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + hcal::constants::nsPerBX);
+      accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + hcal::constants::nsPerBX);
+      diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + hcal::constants::nsPerBX);
+
+      // precompute and get values from the functor
+      auto const& pulseShape = pulseShapes.getShape(pulseShapeId);
+      FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, hcal::constants::maxSamples};
+      auto const offset256 = newId * hcal::constants::maxPSshapeBin;
+      auto const offset25 = newId * hcal::constants::nsPerBX;
+      auto const numShapes = newId;
+      for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) {
+        acc25nsVec_[offset256 * numShapes + i] = functor.acc25nsVec()[i];
+        diff25nsItvlVec_[offset256 * numShapes + i] = functor.diff25nsItvlVec()[i];
+      }
+
+      for (int i = 0; i < hcal::constants::nsPerBX; i++) {
+        accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.accVarLenIdxMinusOneVec()[i];
+        diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxMinusOneVec()[i];
+        accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.accVarLenIdxZEROVec()[i];
+        diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxZEROVec()[i];
+      }
+    } else {
+      // already recorded this pulse shape, just set id
+      ids_[i + offset] = iter->second;
+    }
+  }
+
+#ifdef HCAL_MAHI_CPUDEBUG
+  for (auto const& p : idCache)
+    printf("recoPulseShapeId = %u id = %u\n", p.first, p.second);
+#endif
+}
+
+HcalRecoParamsWithPulseShapesGPU::Product::~Product() {
+  // deallocation
+  cudaCheck(cudaFree(param1));
+  cudaCheck(cudaFree(param2));
+  cudaCheck(cudaFree(ids));
+  cudaCheck(cudaFree(acc25nsVec));
+  cudaCheck(cudaFree(diff25nsItvlVec));
+  cudaCheck(cudaFree(accVarLenIdxMinusOneVec));
+  cudaCheck(cudaFree(diffVarItvlIdxMinusOneVec));
+  cudaCheck(cudaFree(accVarLenIdxZEROVec));
+  cudaCheck(cudaFree(diffVarItvlIdxZEROVec));
+}
+
+HcalRecoParamsWithPulseShapesGPU::Product const& HcalRecoParamsWithPulseShapesGPU::getProduct(
+    cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](HcalRecoParamsWithPulseShapesGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(cudaMalloc((void**)&product.param1, this->param1_.size() * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&product.param2, this->param2_.size() * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&product.ids, this->ids_.size() * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&product.acc25nsVec, this->acc25nsVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.diff25nsItvlVec, this->diff25nsItvlVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.accVarLenIdxMinusOneVec,
+                             this->accVarLenIdxMinusOneVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.diffVarItvlIdxMinusOneVec,
+                             this->diffVarItvlIdxMinusOneVec_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.accVarLenIdxZEROVec, this->accVarLenIdxZEROVec_.size() * sizeof(float)));
+        cudaCheck(
+            cudaMalloc((void**)&product.diffVarItvlIdxZEROVec, this->diffVarItvlIdxZEROVec_.size() * sizeof(float)));
+
+        // transfer
+        cudaCheck(cudaMemcpyAsync(product.param1,
+                                  this->param1_.data(),
+                                  this->param1_.size() * sizeof(uint32_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.param2,
+                                  this->param2_.data(),
+                                  this->param2_.size() * sizeof(uint32_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(
+            product.ids, this->ids_.data(), this->ids_.size() * sizeof(uint32_t), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.acc25nsVec,
+                                  this->acc25nsVec_.data(),
+                                  this->acc25nsVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.diff25nsItvlVec,
+                                  this->diff25nsItvlVec_.data(),
+                                  this->diff25nsItvlVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.accVarLenIdxMinusOneVec,
+                                  this->accVarLenIdxMinusOneVec_.data(),
+                                  this->accVarLenIdxMinusOneVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.diffVarItvlIdxMinusOneVec,
+                                  this->diffVarItvlIdxMinusOneVec_.data(),
+                                  this->diffVarItvlIdxMinusOneVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.accVarLenIdxZEROVec,
+                                  this->accVarLenIdxZEROVec_.data(),
+                                  this->accVarLenIdxZEROVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.diffVarItvlIdxZEROVec,
+                                  this->diffVarItvlIdxZEROVec_.data(),
+                                  this->diffVarItvlIdxZEROVec_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(HcalRecoParamsWithPulseShapesGPU);
diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
index c3ae589a0c0a7..e34037ecbcbb5 100644
--- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml
@@ -1,8 +1,25 @@
-<flags EDM_PLUGIN="1"/>
+<use name="boost"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/CaloCommon"/>
+<use name="CUDADataFormats/Common"/>
+<use name="CUDADataFormats/HcalDigi"/>
+<use name="CUDADataFormats/HcalRecHitSoA"/>
 <use name="CalibCalorimetry/HcalAlgos"/>
 <use name="CalibFormats/HcalObjects"/>
-<use name="RecoLocalCalo/HcalRecAlgos"/>
-<use name="FWCore/Framework"/>
+<use name="CommonTools/UtilAlgos"/>
 <use name="DataFormats/Common"/>
+<use name="DataFormats/HcalDetId"/>
+<use name="DataFormats/HcalRecHit"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/ParameterSet"/>
+<use name="FWCore/PluginManager"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="Geometry/CaloGeometry"/>
+<use name="Geometry/HcalCommonData"/>
 <use name="Geometry/Records"/>
-<use name="boost"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="RecoLocalCalo/HcalRecAlgos"/>
+<use name="SimCalorimetry/CaloSimAlgos"/>
+<use name="SimCalorimetry/HcalSimAlgos"/>
+<flags EDM_PLUGIN="1"/>
diff --git a/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml
new file mode 100644
index 0000000000000..a804a7fe4b70e
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml
@@ -0,0 +1,7 @@
+<bin name="makeHcalRecHitGpuValidationPlots" file="makeHcalRecHitGpuValidationPlots.cpp">
+    <use name="root"/>
+    <use name="rootgraphics"/>
+    <use name="DataFormats/Common"/>
+    <use name="DataFormats/HcalDetId"/>
+    <use name="CUDADataFormats/HcalRecHitSoA"/>
+</bin>
diff --git a/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..fe6aabf928aca
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp
@@ -0,0 +1,282 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
+//#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+
+#define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last)
+
+#define CREATE_HIST_2D(varname, nbins, first, last) \
+  auto varname = new TH2D(#varname, #varname, nbins, first, last, nbins, first, last)
+
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
+    exit(0);
+  }
+
+  std::string inFileName{argv[1]};
+  std::string outFileName{argv[2]};
+
+  // branches to use
+  edm::Wrapper<HBHERecHitCollection>* wcpu = nullptr;
+  edm::Wrapper<HBHERecHitCollection>* wgpu = nullptr;
+  //    edm::Wrapper<hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>> *wgpu=nullptr;
+
+  // prep output
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  CREATE_HIST_1D(hEnergyM0HBGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyM0HEGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyM0HBCPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyM0HECPU, 1000, 0, 100);
+
+  CREATE_HIST_1D(hEnergyHBGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyHBCPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyHEGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hEnergyHECPU, 1000, 0, 100);
+
+  CREATE_HIST_1D(hChi2HBGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hChi2HBCPU, 1000, 0, 100);
+  CREATE_HIST_1D(hChi2HEGPU, 1000, 0, 100);
+  CREATE_HIST_1D(hChi2HECPU, 1000, 0, 100);
+
+  CREATE_HIST_2D(hEnergyHBGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hEnergyHEGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hChi2HBGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hChi2HEGPUvsCPU, 1000, 0, 100);
+
+  CREATE_HIST_2D(hEnergyM0HBGPUvsCPU, 1000, 0, 100);
+  CREATE_HIST_2D(hEnergyM0HEGPUvsCPU, 1000, 0, 100);
+
+  // prep input
+  TFile rfin{inFileName.c_str()};
+  TTree* rt = (TTree*)rfin.Get("Events");
+  rt->SetBranchAddress("HBHERecHitsSorted_hcalCPURecHitsProducer_recHitsLegacyHBHE_RECO.", &wgpu);
+  //    rt->SetBranchAddress("hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalRecHitCollection_hcalCPURecHitsProducer_recHitsM0LabelOut_RECO.", &wgpu);
+  rt->SetBranchAddress("HBHERecHitsSorted_hbheprereco__RECO.", &wcpu);
+
+  // accumulate
+  auto const nentries = rt->GetEntries();
+  std::cout << ">>> nentries = " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+
+    auto const& gpuProduct = wgpu->bareProduct();
+    auto const& cpuProduct = wcpu->bareProduct();
+
+    auto const ncpu = cpuProduct.size();
+    auto const ngpu = gpuProduct.size();
+    //        auto const ngpu = gpuProduct.energy.size();
+
+    if (ngpu != ncpu) {
+      std::cerr << "*** mismatch in number of rec hits for event " << ie << std::endl
+                << ">>> ngpu = " << ngpu << std::endl
+                << ">>> ncpu = " << ncpu << std::endl;
+    }
+
+    for (uint32_t ich = 0; ich < ncpu; ich++) {
+      auto const& cpurh = cpuProduct[ich];
+      auto const& did = cpurh.id();
+      auto iter2gpu = gpuProduct.find(did);
+      //            auto iter2idgpu = std::find(
+      //                gpuProduct.did.begin(), gpuProduct.did.end(), did.rawId());
+
+      if (iter2gpu == gpuProduct.end()) {
+        std::cerr << "missing " << did << std::endl;
+        continue;
+      }
+
+      assert(iter2gpu->id().rawId() == did.rawId());
+
+      auto const gpu_energy_m0 = iter2gpu->eraw();
+      auto const cpu_energy_m0 = cpurh.eraw();
+      auto const gpu_energy = iter2gpu->energy();
+      auto const cpu_energy = cpurh.energy();
+      auto const gpu_chi2 = iter2gpu->chi2();
+      auto const cpu_chi2 = cpurh.chi2();
+
+      if (did.subdetId() == HcalBarrel) {
+        hEnergyM0HBGPU->Fill(gpu_energy_m0);
+        hEnergyM0HBCPU->Fill(cpu_energy_m0);
+        hEnergyM0HBGPUvsCPU->Fill(cpu_energy_m0, gpu_energy_m0);
+
+        hEnergyHBGPU->Fill(gpu_energy);
+        hEnergyHBCPU->Fill(cpu_energy);
+        hEnergyHBGPUvsCPU->Fill(cpu_energy, gpu_energy);
+        hChi2HBGPU->Fill(gpu_chi2);
+        hChi2HBCPU->Fill(cpu_chi2);
+        hChi2HBGPUvsCPU->Fill(cpu_chi2, gpu_chi2);
+      } else if (did.subdetId() == HcalEndcap) {
+        hEnergyM0HEGPU->Fill(gpu_energy_m0);
+        hEnergyM0HECPU->Fill(cpu_energy_m0);
+        hEnergyM0HEGPUvsCPU->Fill(cpu_energy_m0, gpu_energy_m0);
+
+        hEnergyHEGPU->Fill(gpu_energy);
+        hEnergyHECPU->Fill(cpu_energy);
+        hEnergyHEGPUvsCPU->Fill(cpu_energy, gpu_energy);
+
+        hChi2HEGPU->Fill(gpu_chi2);
+        hChi2HECPU->Fill(cpu_chi2);
+        hChi2HEGPUvsCPU->Fill(cpu_chi2, gpu_chi2);
+      }
+    }
+  }
+
+  {
+    TCanvas c{"plots", "plots", 4200, 6200};
+    c.Divide(4, 3);
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hEnergyM0HBCPU->SetLineColor(kBlack);
+      hEnergyM0HBCPU->SetLineWidth(1.);
+      hEnergyM0HBCPU->Draw("");
+      hEnergyM0HBGPU->SetLineColor(kBlue);
+      hEnergyM0HBGPU->SetLineWidth(1.);
+      hEnergyM0HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyM0HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(2);
+    {
+      gPad->SetLogz();
+      hEnergyM0HBGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyM0HBGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyM0HBGPUvsCPU->Draw("colz");
+    }
+    c.cd(3);
+    {
+      gPad->SetLogy();
+      hEnergyM0HECPU->SetLineColor(kBlack);
+      hEnergyM0HECPU->SetLineWidth(1.);
+      hEnergyM0HECPU->Draw("");
+      hEnergyM0HEGPU->SetLineColor(kBlue);
+      hEnergyM0HEGPU->SetLineWidth(1.);
+      hEnergyM0HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyM0HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(4);
+    {
+      gPad->SetLogz();
+      hEnergyM0HEGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyM0HEGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyM0HEGPUvsCPU->Draw("colz");
+    }
+    c.cd(5);
+    {
+      gPad->SetLogy();
+      hEnergyHBCPU->SetLineColor(kBlack);
+      hEnergyHBCPU->SetLineWidth(1.);
+      hEnergyHBCPU->Draw("");
+      hEnergyHBGPU->SetLineColor(kBlue);
+      hEnergyHBGPU->SetLineWidth(1.);
+      hEnergyHBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyHBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(6);
+    {
+      gPad->SetLogz();
+      hEnergyHBGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyHBGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyHBGPUvsCPU->Draw("colz");
+    }
+    c.cd(7);
+    {
+      gPad->SetLogy();
+      hEnergyHECPU->SetLineColor(kBlack);
+      hEnergyHECPU->SetLineWidth(1.);
+      hEnergyHECPU->Draw("");
+      hEnergyHEGPU->SetLineColor(kBlue);
+      hEnergyHEGPU->SetLineWidth(1.);
+      hEnergyHEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergyHEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(8);
+    {
+      gPad->SetLogz();
+      hEnergyHEGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hEnergyHEGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hEnergyHEGPUvsCPU->Draw("colz");
+    }
+    c.cd(9);
+    {
+      gPad->SetLogy();
+      hChi2HBCPU->SetLineColor(kBlack);
+      hChi2HBCPU->SetLineWidth(1.);
+      hChi2HBCPU->Draw("");
+      hChi2HBGPU->SetLineColor(kBlue);
+      hChi2HBGPU->SetLineWidth(1.);
+      hChi2HBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hChi2HBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(10);
+    {
+      gPad->SetLogz();
+      hChi2HBGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hChi2HBGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hChi2HBGPUvsCPU->Draw("colz");
+    }
+    c.cd(11);
+    {
+      gPad->SetLogy();
+      hChi2HECPU->SetLineColor(kBlack);
+      hChi2HECPU->SetLineWidth(1.);
+      hChi2HECPU->Draw("");
+      hChi2HEGPU->SetLineColor(kBlue);
+      hChi2HEGPU->SetLineWidth(1.);
+      hChi2HEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hChi2HEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(12);
+    {
+      gPad->SetLogz();
+      hChi2HEGPUvsCPU->GetXaxis()->SetTitle("cpu");
+      hChi2HEGPUvsCPU->GetYaxis()->SetTitle("gpu");
+      hChi2HEGPUvsCPU->Draw("colz");
+    }
+    c.SaveAs("plots.pdf");
+  }
+
+  rfin.Close();
+  rfout.Write();
+  rfout.Close();
+}
diff --git a/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py
new file mode 100644
index 0000000000000..d938653d5a15e
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py
@@ -0,0 +1,65 @@
+import FWCore.ParameterSet.Config as cms
+
+# Run 3 HCAL workflow on GPU
+
+# EventSetup modules used by HBHERecHitProducerGPU
+from RecoLocalCalo.HcalRecProducers.hcalGainsGPUESProducer_cfi import hcalGainsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalGainWidthsGPUESProducer_cfi import hcalGainWidthsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalLUTCorrsGPUESProducer_cfi import hcalLUTCorrsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalsGPUESProducer_cfi import hcalConvertedPedestalsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalsGPUESProducer_cfi import hcalConvertedEffectivePedestalsGPUESProducer
+hcalConvertedEffectivePedestalsGPUESProducer.label0 = "withTopoEff"
+
+from RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalWidthsGPUESProducer_cfi import hcalConvertedPedestalWidthsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalWidthsGPUESProducer_cfi import hcalConvertedEffectivePedestalWidthsGPUESProducer
+hcalConvertedEffectivePedestalWidthsGPUESProducer.label0 = "withTopoEff"
+hcalConvertedEffectivePedestalWidthsGPUESProducer.label1 = "withTopoEff"
+
+from RecoLocalCalo.HcalRecProducers.hcalQIECodersGPUESProducer_cfi import hcalQIECodersGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalRecoParamsWithPulseShapesGPUESProducer_cfi import hcalRecoParamsWithPulseShapesGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalRespCorrsGPUESProducer_cfi import hcalRespCorrsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalTimeCorrsGPUESProducer_cfi import hcalTimeCorrsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalQIETypesGPUESProducer_cfi import hcalQIETypesGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalSiPMParametersGPUESProducer_cfi import hcalSiPMParametersGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalSiPMCharacteristicsGPUESProducer_cfi import hcalSiPMCharacteristicsGPUESProducer
+from RecoLocalCalo.HcalRecProducers.hcalMahiPulseOffsetsGPUESProducer_cfi import hcalMahiPulseOffsetsGPUESProducer
+
+# convert the HBHE digis into SoA format, and copy them from CPU to GPU
+from EventFilter.HcalRawToDigi.hcalDigisProducerGPU_cfi import hcalDigisProducerGPU as _hcalDigisProducerGPU
+hcalDigisGPU = _hcalDigisProducerGPU.clone(
+    digisLabelF01HE = "",
+    digisLabelF5HB = "",
+    digisLabelF3HB = ""
+)
+
+# run the HCAL local reconstruction (MAHI) on GPU
+from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPU_cfi import hbheRecHitProducerGPU as _hbheRecHitProducerGPU
+hbheRecHitProducerGPU = _hbheRecHitProducerGPU.clone(
+    digisLabelF01HE = "hcalDigisGPU",
+    digisLabelF5HB = "hcalDigisGPU",
+    digisLabelF3HB = "hcalDigisGPU",
+    recHitsLabelM0HBHE = ""
+)
+
+# Tasks and Sequences
+hbheRecHitProducerGPUTask = cms.Task(
+    hcalGainsGPUESProducer,
+    hcalGainWidthsGPUESProducer,
+    hcalLUTCorrsGPUESProducer,
+    hcalConvertedPedestalsGPUESProducer,
+    hcalConvertedEffectivePedestalsGPUESProducer,
+    hcalConvertedPedestalWidthsGPUESProducer,
+    hcalConvertedEffectivePedestalWidthsGPUESProducer,
+    hcalQIECodersGPUESProducer,
+    hcalRecoParamsWithPulseShapesGPUESProducer,
+    hcalRespCorrsGPUESProducer,
+    hcalTimeCorrsGPUESProducer,
+    hcalQIETypesGPUESProducer,
+    hcalSiPMParametersGPUESProducer,
+    hcalSiPMCharacteristicsGPUESProducer,
+    hcalMahiPulseOffsetsGPUESProducer,
+    hcalDigisGPU,
+    hbheRecHitProducerGPU
+)
+
+hbheRecHitProducerGPUSequence = cms.Sequence(hbheRecHitProducerGPUTask)
diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
new file mode 100644
index 0000000000000..807c42b057fa3
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
@@ -0,0 +1,115 @@
+#ifndef RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
+#define RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
+
+#include <functional>
+#include <optional>
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+#include "CalibCalorimetry/HcalAlgos/interface/HcalTimeSlew.h"
+#include "CondFormats/DataRecord/interface/HcalCombinedRecordsGPU.h"
+#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+#include "Geometry/CaloTopology/interface/HcalTopology.h"
+#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+
+namespace hcal {
+  namespace reconstruction {
+
+    struct ConditionsProducts {
+      HcalGainWidthsGPU::Product const& gainWidths;
+      HcalGainsGPU::Product const& gains;
+      HcalLUTCorrsGPU::Product const& lutCorrs;
+      HcalConvertedPedestalWidthsGPU::Product const& pedestalWidths;
+      HcalConvertedEffectivePedestalWidthsGPU::Product const& effectivePedestalWidths;
+      HcalConvertedPedestalsGPU::Product const& pedestals;
+      HcalQIECodersGPU::Product const& qieCoders;
+      HcalRecoParamsWithPulseShapesGPU::Product const& recoParams;
+      HcalRespCorrsGPU::Product const& respCorrs;
+      HcalTimeCorrsGPU::Product const& timeCorrs;
+      HcalQIETypesGPU::Product const& qieTypes;
+      HcalSiPMParametersGPU::Product const& sipmParameters;
+      HcalSiPMCharacteristicsGPU::Product const& sipmCharacteristics;
+      HcalConvertedPedestalsGPU::Product const* convertedEffectivePedestals;
+      HcalTopology const* topology;
+      HcalDDDRecConstants const* recConstants;
+      uint32_t offsetForHashes;
+      HcalMahiPulseOffsetsGPU::Product const& pulseOffsets;
+      std::vector<int, cms::cuda::HostAllocator<int>> const& pulseOffsetsHost;
+    };
+
+    struct ConfigParameters {
+      uint32_t maxChannels;
+      uint32_t maxTimeSamples;
+      uint32_t kprep1dChannelsPerBlock;
+      int sipmQTSShift;
+      int sipmQNTStoSum;
+      int firstSampleShift;
+      bool useEffectivePedestals;
+
+      float meanTime;
+      float timeSigmaSiPM, timeSigmaHPD;
+      float ts4Thresh;
+
+      std::array<uint32_t, 3> kernelMinimizeThreads;
+
+      // FIXME:
+      //   - add "getters" to HcalTimeSlew calib formats
+      //   - add ES Producer to consume what is produced above not to replicate.
+      //   which ones to use is hardcoded, therefore no need to send those to the device
+      bool applyTimeSlew;
+      float tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew;
+    };
+
+    struct OutputDataGPU {
+      RecHitCollection<::calo::common::DevStoragePolicy> recHits;
+
+      void allocate(ConfigParameters const& config, cudaStream_t cudaStream) {
+        recHits.energy = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.chi2 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.energyM0 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.timeM0 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
+        recHits.did = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannels, cudaStream);
+      }
+    };
+
+    struct ScratchDataGPU {
+      cms::cuda::device::unique_ptr<float[]> amplitudes, noiseTerms, pulseMatrices, pulseMatricesM, pulseMatricesP;
+      cms::cuda::device::unique_ptr<int8_t[]> soiSamples;
+    };
+
+    struct InputDataGPU {
+      DigiCollection<Flavor1, ::calo::common::DevStoragePolicy> const& f01HEDigis;
+      DigiCollection<Flavor5, ::calo::common::DevStoragePolicy> const& f5HBDigis;
+      DigiCollection<Flavor3, ::calo::common::DevStoragePolicy> const& f3HBDigis;
+    };
+
+  }  // namespace reconstruction
+}  // namespace hcal
+
+#endif  // RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h
diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
new file mode 100644
index 0000000000000..af5398e49fa8f
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
@@ -0,0 +1,246 @@
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "SimpleAlgoGPU.h"
+
+class HBHERecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HBHERecHitProducerGPU(edm::ParameterSet const&);
+  ~HBHERecHitProducerGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  using IProductTypef01 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor1, calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef01> digisTokenF01HE_;
+
+  using IProductTypef5 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor5, calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef5> digisTokenF5HB_;
+
+  using IProductTypef3 = cms::cuda::Product<hcal::DigiCollection<hcal::Flavor3, calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<IProductTypef3> digisTokenF3HB_;
+
+  using RecHitType = hcal::RecHitCollection<calo::common::DevStoragePolicy>;
+  using OProductType = cms::cuda::Product<RecHitType>;
+  edm::EDPutTokenT<OProductType> rechitsM0Token_;
+
+  hcal::reconstruction::ConfigParameters configParameters_;
+  hcal::reconstruction::OutputDataGPU outputGPU_;
+  cms::cuda::ContextState cudaState_;
+};
+
+HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps)
+    : digisTokenF01HE_{consumes<IProductTypef01>(ps.getParameter<edm::InputTag>("digisLabelF01HE"))},
+      digisTokenF5HB_{consumes<IProductTypef5>(ps.getParameter<edm::InputTag>("digisLabelF5HB"))},
+      digisTokenF3HB_{consumes<IProductTypef3>(ps.getParameter<edm::InputTag>("digisLabelF3HB"))},
+      rechitsM0Token_{produces<OProductType>(ps.getParameter<std::string>("recHitsLabelM0HBHE"))} {
+  configParameters_.maxChannels = ps.getParameter<uint32_t>("maxChannels");
+  configParameters_.maxTimeSamples = ps.getParameter<uint32_t>("maxTimeSamples");
+  configParameters_.kprep1dChannelsPerBlock = ps.getParameter<uint32_t>("kprep1dChannelsPerBlock");
+  configParameters_.sipmQTSShift = ps.getParameter<int>("sipmQTSShift");
+  configParameters_.sipmQNTStoSum = ps.getParameter<int>("sipmQNTStoSum");
+  configParameters_.firstSampleShift = ps.getParameter<int>("firstSampleShift");
+  configParameters_.useEffectivePedestals = ps.getParameter<bool>("useEffectivePedestals");
+
+  configParameters_.meanTime = ps.getParameter<double>("meanTime");
+  configParameters_.timeSigmaSiPM = ps.getParameter<double>("timeSigmaSiPM");
+  configParameters_.timeSigmaHPD = ps.getParameter<double>("timeSigmaHPD");
+  configParameters_.ts4Thresh = ps.getParameter<double>("ts4Thresh");
+
+  configParameters_.applyTimeSlew = ps.getParameter<bool>("applyTimeSlew");
+  auto const tzeroValues = ps.getParameter<std::vector<double>>("tzeroTimeSlewParameters");
+  auto const slopeValues = ps.getParameter<std::vector<double>>("slopeTimeSlewParameters");
+  auto const tmaxValues = ps.getParameter<std::vector<double>>("tmaxTimeSlewParameters");
+
+  configParameters_.tzeroTimeSlew = tzeroValues[HcalTimeSlew::Medium];
+  configParameters_.slopeTimeSlew = slopeValues[HcalTimeSlew::Medium];
+  configParameters_.tmaxTimeSlew = tmaxValues[HcalTimeSlew::Medium];
+
+  auto threadsMinimize = ps.getParameter<std::vector<uint32_t>>("kernelMinimizeThreads");
+  configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0];
+  configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
+  configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
+}
+
+HBHERecHitProducerGPU::~HBHERecHitProducerGPU() {}
+
+void HBHERecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& cdesc) {
+  edm::ParameterSetDescription desc;
+  desc.add<uint32_t>("maxChannels", 10000u);
+  desc.add<uint32_t>("maxTimeSamples", 10);
+  desc.add<uint32_t>("kprep1dChannelsPerBlock", 32);
+  desc.add<edm::InputTag>("digisLabelF01HE", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"});
+  desc.add<edm::InputTag>("digisLabelF5HB", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"});
+  desc.add<edm::InputTag>("digisLabelF3HB", edm::InputTag{"hcalRawToDigiGPU", "f3HBDigisGPU"});
+  desc.add<std::string>("recHitsLabelM0HBHE", "recHitsM0HBHE");
+  desc.add<int>("sipmQTSShift", 0);
+  desc.add<int>("sipmQNTStoSum", 3);
+  desc.add<int>("firstSampleShift", 0);
+  desc.add<bool>("useEffectivePedestals", true);
+
+  desc.add<double>("meanTime", 0.f);
+  desc.add<double>("timeSigmaSiPM", 2.5f);
+  desc.add<double>("timeSigmaHPD", 5.0f);
+  desc.add<double>("ts4Thresh", 0.0);
+
+  desc.add<bool>("applyTimeSlew", true);
+  desc.add<std::vector<double>>("tzeroTimeSlewParameters", {23.960177, 11.977461, 9.109694});
+  desc.add<std::vector<double>>("slopeTimeSlewParameters", {-3.178648, -1.5610227, -1.075824});
+  desc.add<std::vector<double>>("tmaxTimeSlewParameters", {16.00, 10.00, 6.25});
+  desc.add<std::vector<uint32_t>>("kernelMinimizeThreads", {16, 1, 1});
+
+  cdesc.addWithDefaultLabel(desc);
+}
+
+void HBHERecHitProducerGPU::acquire(edm::Event const& event,
+                                    edm::EventSetup const& setup,
+                                    edm::WaitingTaskWithArenaHolder holder) {
+#ifdef HCAL_MAHI_CPUDEBUG
+  auto start = std::chrono::high_resolution_clock::now();
+#endif
+
+  // input + raii
+  auto const& f01HEProduct = event.get(digisTokenF01HE_);
+  auto const& f5HBProduct = event.get(digisTokenF5HB_);
+  auto const& f3HBProduct = event.get(digisTokenF3HB_);
+  cms::cuda::ScopedContextAcquire ctx{f01HEProduct, std::move(holder), cudaState_};
+  auto const& f01HEDigis = ctx.get(f01HEProduct);
+  auto const& f5HBDigis = ctx.get(f5HBProduct);
+  auto const& f3HBDigis = ctx.get(f3HBProduct);
+
+  hcal::reconstruction::InputDataGPU inputGPU{f01HEDigis, f5HBDigis, f3HBDigis};
+
+  // conditions
+  edm::ESHandle<HcalRecoParamsWithPulseShapesGPU> recoParamsHandle;
+  setup.get<HcalRecoParamsRcd>().get(recoParamsHandle);
+  auto const& recoParamsProduct = recoParamsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalGainWidthsGPU> gainWidthsHandle;
+  setup.get<HcalGainWidthsRcd>().get(gainWidthsHandle);
+  auto const& gainWidthsProduct = gainWidthsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalGainsGPU> gainsHandle;
+  setup.get<HcalGainsRcd>().get(gainsHandle);
+  auto const& gainsProduct = gainsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalLUTCorrsGPU> lutCorrsHandle;
+  setup.get<HcalLUTCorrsRcd>().get(lutCorrsHandle);
+  auto const& lutCorrsProduct = lutCorrsHandle->getProduct(ctx.stream());
+
+  // use only 1 depending on useEffectivePedestals
+  edm::ESHandle<HcalConvertedPedestalWidthsGPU> pedestalWidthsHandle;
+  edm::ESHandle<HcalConvertedEffectivePedestalWidthsGPU> effectivePedestalWidthsHandle;
+  setup.get<HcalConvertedPedestalWidthsRcd>().get(effectivePedestalWidthsHandle);
+  setup.get<HcalConvertedPedestalWidthsRcd>().get(pedestalWidthsHandle);
+  auto const& pedestalWidthsProduct = pedestalWidthsHandle->getProduct(ctx.stream());
+  auto const& effectivePedestalWidthsProduct = effectivePedestalWidthsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalConvertedPedestalsGPU> pedestalsHandle;
+  setup.get<HcalConvertedPedestalsRcd>().get(pedestalsHandle);
+  auto const& pedestalsProduct = pedestalsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalConvertedEffectivePedestalsGPU> effectivePedestalsHandle;
+  if (configParameters_.useEffectivePedestals)
+    setup.get<HcalConvertedPedestalsRcd>().get(effectivePedestalsHandle);
+  auto const* effectivePedestalsProduct =
+      configParameters_.useEffectivePedestals ? &effectivePedestalsHandle->getProduct(ctx.stream()) : nullptr;
+
+  edm::ESHandle<HcalQIECodersGPU> qieCodersHandle;
+  setup.get<HcalQIEDataRcd>().get(qieCodersHandle);
+  auto const& qieCodersProduct = qieCodersHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalRespCorrsGPU> respCorrsHandle;
+  setup.get<HcalRespCorrsRcd>().get(respCorrsHandle);
+  auto const& respCorrsProduct = respCorrsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalTimeCorrsGPU> timeCorrsHandle;
+  setup.get<HcalTimeCorrsRcd>().get(timeCorrsHandle);
+  auto const& timeCorrsProduct = timeCorrsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalQIETypesGPU> qieTypesHandle;
+  setup.get<HcalQIETypesRcd>().get(qieTypesHandle);
+  auto const& qieTypesProduct = qieTypesHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalTopology> topologyHandle;
+  setup.get<HcalRecNumberingRecord>().get(topologyHandle);
+  edm::ESHandle<HcalDDDRecConstants> recConstantsHandle;
+  setup.get<HcalRecNumberingRecord>().get(recConstantsHandle);
+
+  edm::ESHandle<HcalSiPMParametersGPU> sipmParametersHandle;
+  setup.get<HcalSiPMParametersRcd>().get(sipmParametersHandle);
+  auto const& sipmParametersProduct = sipmParametersHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalSiPMCharacteristicsGPU> sipmCharacteristicsHandle;
+  setup.get<HcalSiPMCharacteristicsRcd>().get(sipmCharacteristicsHandle);
+  auto const& sipmCharacteristicsProduct = sipmCharacteristicsHandle->getProduct(ctx.stream());
+
+  edm::ESHandle<HcalMahiPulseOffsetsGPU> pulseOffsetsHandle;
+  setup.get<JobConfigurationGPURecord>().get(pulseOffsetsHandle);
+  auto const& pulseOffsetsProduct = pulseOffsetsHandle->getProduct(ctx.stream());
+
+  // bundle up conditions
+  hcal::reconstruction::ConditionsProducts conditions{gainWidthsProduct,
+                                                      gainsProduct,
+                                                      lutCorrsProduct,
+                                                      pedestalWidthsProduct,
+                                                      effectivePedestalWidthsProduct,
+                                                      pedestalsProduct,
+                                                      qieCodersProduct,
+                                                      recoParamsProduct,
+                                                      respCorrsProduct,
+                                                      timeCorrsProduct,
+                                                      qieTypesProduct,
+                                                      sipmParametersProduct,
+                                                      sipmCharacteristicsProduct,
+                                                      effectivePedestalsProduct,
+                                                      topologyHandle.product(),
+                                                      recConstantsHandle.product(),
+                                                      pedestalsHandle->offsetForHashes(),
+                                                      pulseOffsetsProduct,
+                                                      pulseOffsetsHandle->getValues()};
+
+  // scratch mem on device
+  hcal::reconstruction::ScratchDataGPU scratchGPU = {
+      cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
+                                             ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
+                                             ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(
+          configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
+          ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(
+          configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
+          ctx.stream()),
+      cms::cuda::make_device_unique<float[]>(
+          configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
+          ctx.stream()),
+      cms::cuda::make_device_unique<int8_t[]>(configParameters_.maxChannels, ctx.stream()),
+  };
+
+  // output dev mem
+  outputGPU_.allocate(configParameters_, ctx.stream());
+
+  hcal::reconstruction::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU, configParameters_, ctx.stream());
+
+#ifdef HCAL_MAHI_CPUDEBUG
+  auto end = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+  std::cout << "acquire  duration = " << duration << std::endl;
+#endif
+}
+
+void HBHERecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+  ctx.emplace(event, rechitsM0Token_, std::move(outputGPU_.recHits));
+}
+
+DEFINE_FWK_MODULE(HBHERecHitProducerGPU);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
new file mode 100644
index 0000000000000..ba3c9de696c47
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc
@@ -0,0 +1,307 @@
+// -*- C++ -*-
+//
+// Package:    ComparisonPlots/HCALGPUAnalyzer
+// Class:      HCALGPUAnalyzer
+//
+/**\class HCALGPUAnalyzer HCALGPUAnalyzer.cc ComparisonPlots/HCALGPUAnalyzer/plugins/HCALGPUAnalyzer.cc
+
+ Description: [one line class summary]
+
+ Implementation:
+     [Notes on implementation]
+*/
+//
+// Original Author:  Mariarosaria D'Alfonso
+//         Created:  Mon, 17 Dec 2018 16:22:58 GMT
+//
+//
+
+// system include files
+#include <memory>
+#include <string>
+#include <map>
+#include <iostream>
+using namespace std;
+
+// user include files
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/one/EDAnalyzer.h"
+
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "CommonTools/UtilAlgos/interface/TFileService.h"
+
+#include "DataFormats/HcalRecHit/interface/HBHERecHit.h"
+#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
+#include "DataFormats/HcalDetId/interface/HcalDetId.h"
+
+#include "SimDataFormats/CaloHit/interface/PCaloHit.h"
+#include "SimDataFormats/CaloHit/interface/PCaloHitContainer.h"
+
+#include "SimCalorimetry/HcalSimAlgos/interface/HcalSimParameterMap.h"
+
+#include "TH2F.h"
+
+//
+// class declaration
+//
+
+class HCALGPUAnalyzer : public edm::one::EDAnalyzer<edm::one::SharedResources> {
+public:
+  explicit HCALGPUAnalyzer(const edm::ParameterSet &);
+  ~HCALGPUAnalyzer() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  void beginJob() override;
+  void analyze(const edm::Event &, const edm::EventSetup &) override;
+  void endJob() override;
+
+  // ----------member data ---------------------------
+  //  void ClearVariables();
+
+  // some variables for storing information
+  double Method0Energy, Method0EnergyGPU;
+  double RecHitEnergy, RecHitEnergyGPU;
+  double RecHitTime, RecHitTimeGPU;
+  double iEta, iEtaGPU;
+  double iPhi, iPhiGPU;
+  int depth, depthGPU;
+
+  TH2F *hEnergy_2dMahi;
+  TH2F *hEnergy_2dM0;
+  TH2F *hTime_2dMahi;
+
+  TH2F *Unmatched;
+  TH2F *Matched;
+  TH1F *hEnergy_cpu;
+  TH1F *hEnergy_gpu;
+  TH1F *hEnergy_cpugpu;
+  TH1F *hEnergy_cpugpu_rel;
+  TH1F *hEnergyM0_cpu;
+  TH1F *hEnergyM0_gpu;
+  TH1F *hTime_cpu;
+  TH1F *hTime_gpu;
+
+  // create the output file
+  edm::Service<TFileService> FileService;
+  // create the token to retrieve hit information
+  edm::EDGetTokenT<HBHERecHitCollection> hRhToken;
+  edm::EDGetTokenT<HBHERecHitCollection> hRhTokenGPU;
+};
+
+//
+// constants, enums and typedefs
+//
+
+//
+// static data member definitions
+//
+
+//
+// constructors and destructor
+//
+HCALGPUAnalyzer::HCALGPUAnalyzer(const edm::ParameterSet &iConfig) {
+  usesResource("TFileService");
+
+  hRhToken = consumes<HBHERecHitCollection>(iConfig.getUntrackedParameter<string>("HBHERecHits", "hbheprereco"));
+  hRhTokenGPU = consumes<HBHERecHitCollection>(
+      iConfig.getUntrackedParameter<string>("HBHERecHits", "hcalCPURecHitsProducer:recHitsLegacyHBHE"));
+
+  //
+
+  hEnergy_2dM0 = FileService->make<TH2F>("hEnergy_2dM0", "hEnergy_2dM0", 1000, 0., 100., 1000, 0., 100.);
+  hEnergy_2dM0->GetXaxis()->SetTitle("Cpu M0 Energy");
+  hEnergy_2dM0->GetYaxis()->SetTitle("GPU M0 Energy");
+
+  hEnergy_2dMahi = FileService->make<TH2F>("hEnergy_2dMahi", "hEnergy_2dMahi", 1000, 0., 100., 1000, 0., 100.);
+  hEnergy_2dMahi->GetXaxis()->SetTitle("CPU Energy");
+  hEnergy_2dMahi->GetYaxis()->SetTitle("GPU Energy");
+
+  hTime_2dMahi = FileService->make<TH2F>("hTime_2dMahi", "hTime_2dMahi", 250, -12.5, 12.5, 250, -12.5, 12.5);
+  hTime_2dMahi->GetXaxis()->SetTitle("Mahi Time CPU");
+  hTime_2dMahi->GetYaxis()->SetTitle("Mahi Time GPU");
+
+  //
+
+  hEnergyM0_cpu = FileService->make<TH1F>("hEnergyM0_cpu", "hEnergyM0_cpu", 100, 0., 100.);
+  hEnergyM0_cpu->GetXaxis()->SetTitle("CPU Energy");
+
+  hEnergy_cpu = FileService->make<TH1F>("hEnergy_cpu", "hEnergy_cpu", 50, 0., 50.);
+  hEnergy_cpu->GetXaxis()->SetTitle("CPU Energy");
+
+  hEnergy_gpu = FileService->make<TH1F>("hEnergy_gpu", "hEnergy_gpu", 50, 0., 50.);
+  hEnergy_gpu->GetXaxis()->SetTitle("GPU Energy");
+
+  //
+
+  hEnergy_cpugpu = FileService->make<TH1F>("hEnergy_cpugpu", "hEnergy_cpugpu", 500, -2.5, 2.5);
+  hEnergy_cpugpu->GetXaxis()->SetTitle("GPU Energy - CPU Energy [GeV]");
+  hEnergy_cpugpu->GetYaxis()->SetTitle("# RecHits");
+
+  hEnergy_cpugpu_rel =
+      FileService->make<TH1F>("hEnergy_cpugpu_rel", "hEnergy_cpugpu_rel ( E > 0.005 GeV)", 500, -2.5, 2.5);
+  hEnergy_cpugpu_rel->GetXaxis()->SetTitle("(GPU Energy - CPU Energy) / CPU energy");
+  hEnergy_cpugpu_rel->GetYaxis()->SetTitle("# RecHits");
+
+  //
+
+  hTime_cpu = FileService->make<TH1F>("hTime_cpu", "hTime_cpu", 50, -25., 25.);
+  hTime_cpu->GetXaxis()->SetTitle("CPU Time");
+
+  hTime_gpu = FileService->make<TH1F>("hTime_gpu", "hTime_gpu", 50, -25., 25.);
+  hTime_gpu->GetXaxis()->SetTitle("GPU Time");
+
+  Unmatched = FileService->make<TH2F>("Unmatched", "Unmatched (eta,phi)", 100, -50., 50., 85, 0., 85.);
+  Matched = FileService->make<TH2F>("Matched", "Matched (eta,phi)", 100, -50., 50., 85, 0., 85.);
+
+  //now do what ever initialization is needed
+}
+
+//
+// member functions
+//
+
+// ------------ method called for each event  ------------
+void HCALGPUAnalyzer::analyze(const edm::Event &iEvent, const edm::EventSetup &iSetup) {
+  using namespace edm;
+
+  // Read events
+  Handle<HBHERecHitCollection> hRecHits;
+  iEvent.getByToken(hRhToken, hRecHits);
+
+  Handle<HBHERecHitCollection> hRecHitsGPU;
+  iEvent.getByToken(hRhTokenGPU, hRecHitsGPU);
+
+  // Loop over all rechits in one event
+  for (int i = 0; i < (int)hRecHits->size(); i++) {
+    // get ID information for the reconstructed hit
+    HcalDetId detID_rh = (*hRecHits)[i].id().rawId();
+
+    // ID information can get us detector coordinates
+    depth = (*hRecHits)[i].id().depth();
+    iEta = detID_rh.ieta();
+    iPhi = detID_rh.iphi();
+
+    // get some variables
+    Method0Energy = (*hRecHits)[i].eraw();
+    RecHitEnergy = (*hRecHits)[i].energy();
+    RecHitTime = (*hRecHits)[i].time();
+
+    hEnergy_cpu->Fill(RecHitEnergy);
+    hTime_cpu->Fill(RecHitTime);
+
+    /*
+     cout << "Run " << i << ": ";
+     cout << "Method0Energy: " << Method0Energy;
+     cout << "RecHitEnergy: " << RecHitEnergy;
+     cout << "depth: " << depth;
+     cout << "iEta: " << iEta;
+     cout << "iPhi: " << iPhi;
+     cout << "RecHitTime" << RecHitTime;
+     */
+  }
+
+  for (int i = 0; i < (int)hRecHitsGPU->size(); i++) {
+    // get ID information for the reconstructed hit
+    HcalDetId detID_rh = (*hRecHitsGPU)[i].id().rawId();
+
+    // ID information can get us detector coordinates
+    depthGPU = (*hRecHitsGPU)[i].id().depth();
+    iEtaGPU = detID_rh.ieta();
+    iPhiGPU = detID_rh.iphi();
+
+    // get some variables
+    Method0EnergyGPU = (*hRecHitsGPU)[i].eraw();
+    RecHitEnergyGPU = (*hRecHitsGPU)[i].energy();
+    RecHitTimeGPU = (*hRecHitsGPU)[i].time();
+
+    hEnergy_gpu->Fill(RecHitEnergyGPU);
+    hTime_gpu->Fill(RecHitTimeGPU);
+
+    /*
+     cout << "Run " << i << ": ";
+     cout << "Method0Energy: " << Method0EnergyGPU;
+     cout << "RecHitEnergy: " << RecHitEnergyGPU;
+     cout << "depth: " << depthGPU;
+     cout << "iEta: " << iEtaGPU;
+     cout << "iPhi: " << iPhiGPU;
+     cout << "RecHitTime" << RecHitTimeGPU;
+     */
+  }
+
+  // Loop over all rechits in one event
+  for (int i = 0; i < (int)hRecHits->size(); i++) {
+    HcalDetId detID_rh = (*hRecHits)[i].id().rawId();
+
+    bool unmatched = true;
+    //     cout << "--------------------------------------------------------" << endl;
+
+    for (int j = 0; j < (int)hRecHitsGPU->size(); j++) {
+      HcalDetId detID_gpu = (*hRecHitsGPU)[j].id().rawId();
+
+      if ((detID_rh == detID_gpu)) {
+        /*
+	 cout << "Mtime(cpu)" << (*hRecHits)[i].time() << endl; 
+	 cout << "     Mtime(gpu)" << (*hRecHitsGPU)[j].time() << endl;
+
+	 cout << "M0E(cpu)" << (*hRecHits)[i].eraw() << endl; 
+	 cout << "     M0E(gpu)" << (*hRecHitsGPU)[j].eraw() << endl;
+	 */
+
+        auto relValue = ((*hRecHitsGPU)[j].energy() - (*hRecHits)[i].energy()) / (*hRecHits)[i].energy();
+
+        hEnergy_2dM0->Fill((*hRecHits)[i].eraw(), (*hRecHitsGPU)[j].eraw());
+        hEnergy_2dMahi->Fill((*hRecHits)[i].energy(), (*hRecHitsGPU)[j].energy());
+        hEnergy_cpugpu->Fill((*hRecHitsGPU)[j].energy() - (*hRecHits)[i].energy());
+        if ((*hRecHits)[i].energy() > 0.005)
+          hEnergy_cpugpu_rel->Fill(relValue);
+        hTime_2dMahi->Fill((*hRecHits)[i].time(), (*hRecHitsGPU)[j].time());
+
+        /*
+	 if((relValue < - 0.9) and ((*hRecHits)[i].energy()>0.005)) {
+	   cout << "----------------------------------"<< endl;
+	   cout << " detID = " << detID_rh.rawId() << endl;
+	   cout << "ME(cpu)" << (*hRecHits)[i].energy() << endl; 
+	   cout << "     ME(gpu)" << (*hRecHitsGPU)[j].energy() << endl;
+	 }
+	 */
+
+        Matched->Fill(detID_rh.ieta(), detID_rh.iphi());
+
+        unmatched = false;
+      }
+    }
+
+    ///
+
+    if (unmatched) {
+      Unmatched->Fill(detID_rh.ieta(), detID_rh.iphi());
+      //       cout << "   recHit not matched ="  << detID_rh << "  E(raw)=" << (*hRecHits)[i].eraw() << " E=" << (*hRecHits)[i].energy() << endl;
+    }
+  }
+}
+
+// ------------ method called once each job just before starting event loop  ------------
+void HCALGPUAnalyzer::beginJob() {}
+
+// ------------ method called once each job just after ending the event loop  ------------
+void HCALGPUAnalyzer::endJob() {}
+
+// ------------ method fills 'descriptions' with the allowed parameters for the module  ------------
+void HCALGPUAnalyzer::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  //The following says we do not know what parameters are allowed so do no validation
+  // Please change this to state exactly what you do use, even if it is no parameters
+  edm::ParameterSetDescription desc;
+  desc.setUnknown();
+  descriptions.addDefault(desc);
+}
+
+//define this as a plug-in
+DEFINE_FWK_MODULE(HCALGPUAnalyzer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
new file mode 100644
index 0000000000000..714ec8b7de5af
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc
@@ -0,0 +1,106 @@
+#include <iostream>
+#include <string>
+
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+class HcalCPURecHitsProducer : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit HcalCPURecHitsProducer(edm::ParameterSet const& ps);
+  ~HcalCPURecHitsProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+  using IProductType = cms::cuda::Product<hcal::RecHitCollection<calo::common::DevStoragePolicy>>;
+  edm::EDGetTokenT<IProductType> recHitsM0TokenIn_;
+  using OProductType = hcal::RecHitCollection<calo::common::VecStoragePolicy<calo::common::CUDAHostAllocatorAlias>>;
+  edm::EDPutTokenT<OProductType> recHitsM0TokenOut_;
+  edm::EDPutTokenT<HBHERecHitCollection> recHitsLegacyTokenOut_;
+
+  // to pass from acquire to produce
+  OProductType tmpRecHits_;
+};
+
+void HcalCPURecHitsProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("recHitsM0LabelIn", edm::InputTag{"hbheRecHitProducerGPU", "recHitsM0HBHE"});
+  desc.add<std::string>("recHitsM0LabelOut", "recHitsM0HBHE");
+  desc.add<std::string>("recHitsLegacyLabelOut", "recHitsLegacyHBHE");
+
+  confDesc.addWithDefaultLabel(desc);
+}
+
+HcalCPURecHitsProducer::HcalCPURecHitsProducer(const edm::ParameterSet& ps)
+    : recHitsM0TokenIn_{consumes<IProductType>(ps.getParameter<edm::InputTag>("recHitsM0LabelIn"))},
+      recHitsM0TokenOut_{produces<OProductType>(ps.getParameter<std::string>("recHitsM0LabelOut"))},
+      recHitsLegacyTokenOut_{produces<HBHERecHitCollection>(ps.getParameter<std::string>("recHitsLegacyLabelOut"))} {}
+
+HcalCPURecHitsProducer::~HcalCPURecHitsProducer() {}
+
+void HcalCPURecHitsProducer::acquire(edm::Event const& event,
+                                     edm::EventSetup const& setup,
+                                     edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve data/ctx
+  auto const& recHitsProduct = event.get(recHitsM0TokenIn_);
+  cms::cuda::ScopedContextAcquire ctx{recHitsProduct, std::move(taskHolder)};
+  auto const& recHits = ctx.get(recHitsProduct);
+
+  // resize tmp buffers
+  tmpRecHits_.resize(recHits.size);
+
+#ifdef HCAL_MAHI_CPUDEBUG
+  std::cout << "num rec Hits = " << recHits.size << std::endl;
+#endif
+
+  auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
+    using vector_type = typename std::remove_reference<decltype(dest)>::type;
+    using src_data_type = typename std::remove_pointer<decltype(src)>::type;
+    using type = typename vector_type::value_type;
+    static_assert(std::is_same<src_data_type, type>::value && "Dest and Src data types do not match");
+    cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
+  };
+
+  lambdaToTransfer(tmpRecHits_.energy, recHits.energy.get());
+  lambdaToTransfer(tmpRecHits_.chi2, recHits.chi2.get());
+  lambdaToTransfer(tmpRecHits_.energyM0, recHits.energyM0.get());
+  lambdaToTransfer(tmpRecHits_.timeM0, recHits.timeM0.get());
+  lambdaToTransfer(tmpRecHits_.did, recHits.did.get());
+}
+
+void HcalCPURecHitsProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // populate the legacy collection
+  auto recHitsLegacy = std::make_unique<HBHERecHitCollection>();
+  // did not set size with ctor as there is no setter for did
+  recHitsLegacy->reserve(tmpRecHits_.did.size());
+  for (uint32_t i = 0; i < tmpRecHits_.did.size(); i++) {
+    recHitsLegacy->emplace_back(HcalDetId{tmpRecHits_.did[i]},
+                                tmpRecHits_.energy[i],
+                                0  // timeRising
+    );
+
+    // update newly pushed guy
+    (*recHitsLegacy)[i].setChiSquared(tmpRecHits_.chi2[i]);
+    (*recHitsLegacy)[i].setRawEnergy(tmpRecHits_.energyM0[i]);
+  }
+
+  // put a legacy format
+  event.put(recHitsLegacyTokenOut_, std::move(recHitsLegacy));
+
+  // put a new format
+  event.emplace(recHitsM0TokenOut_, std::move(tmpRecHits_));
+}
+
+DEFINE_FWK_MODULE(HcalCPURecHitsProducer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
new file mode 100644
index 0000000000000..2fc6cc0d19002
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc
@@ -0,0 +1,120 @@
+#include "CondFormats/DataRecord/interface/HcalCombinedRecordsGPU.h"
+#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalGainsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalPedestalWidthsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalPedestalsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h"
+#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h"
+#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h"
+#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidths.h"
+#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalGains.h"
+#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestalWidths.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestalWidthsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestals.h"
+#include "CondFormats/HcalObjects/interface/HcalPedestalsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalQIEData.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypes.h"
+#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParams.h"
+#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristics.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParameters.h"
+#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrs.h"
+#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h"
+
+using HcalRecoParamsGPUESProducer = ConvertingESProducerT<HcalRecoParamsRcd, HcalRecoParamsGPU, HcalRecoParams>;
+
+using HcalRecoParamsWithPulseShapesGPUESProducer =
+    ConvertingESProducerT<HcalRecoParamsRcd, HcalRecoParamsWithPulseShapesGPU, HcalRecoParams>;
+
+using HcalPedestalsGPUESProducer = ConvertingESProducerT<HcalPedestalsRcd, HcalPedestalsGPU, HcalPedestals>;
+
+using HcalGainsGPUESProducer = ConvertingESProducerT<HcalGainsRcd, HcalGainsGPU, HcalGains>;
+
+using HcalLUTCorrsGPUESProducer = ConvertingESProducerT<HcalLUTCorrsRcd, HcalLUTCorrsGPU, HcalLUTCorrs>;
+
+using HcalRespCorrsGPUESProducer = ConvertingESProducerT<HcalRespCorrsRcd, HcalRespCorrsGPU, HcalRespCorrs>;
+
+using HcalTimeCorrsGPUESProducer = ConvertingESProducerT<HcalTimeCorrsRcd, HcalTimeCorrsGPU, HcalTimeCorrs>;
+
+using HcalPedestalWidthsGPUESProducer =
+    ConvertingESProducerT<HcalPedestalWidthsRcd, HcalPedestalWidthsGPU, HcalPedestalWidths>;
+
+using HcalGainWidthsGPUESProducer = ConvertingESProducerT<HcalGainWidthsRcd, HcalGainWidthsGPU, HcalGainWidths>;
+
+using HcalQIECodersGPUESProducer = ConvertingESProducerT<HcalQIEDataRcd, HcalQIECodersGPU, HcalQIEData>;
+
+using HcalQIETypesGPUESProducer = ConvertingESProducerT<HcalQIETypesRcd, HcalQIETypesGPU, HcalQIETypes>;
+
+using HcalSiPMParametersGPUESProducer =
+    ConvertingESProducerT<HcalSiPMParametersRcd, HcalSiPMParametersGPU, HcalSiPMParameters>;
+
+using HcalSiPMCharacteristicsGPUESProducer =
+    ConvertingESProducerT<HcalSiPMCharacteristicsRcd, HcalSiPMCharacteristicsGPU, HcalSiPMCharacteristics>;
+
+using HcalConvertedPedestalsGPUESProducer = ConvertingESProducerWithDependenciesT<HcalConvertedPedestalsRcd,
+                                                                                  HcalConvertedPedestalsGPU,
+                                                                                  HcalPedestals,
+                                                                                  HcalQIEData,
+                                                                                  HcalQIETypes>;
+
+using HcalConvertedEffectivePedestalsGPUESProducer =
+    ConvertingESProducerWithDependenciesT<HcalConvertedPedestalsRcd,
+                                          HcalConvertedEffectivePedestalsGPU,
+                                          HcalPedestals,
+                                          HcalQIEData,
+                                          HcalQIETypes>;
+
+using HcalConvertedPedestalWidthsGPUESProducer = ConvertingESProducerWithDependenciesT<HcalConvertedPedestalWidthsRcd,
+                                                                                       HcalConvertedPedestalWidthsGPU,
+                                                                                       HcalPedestals,
+                                                                                       HcalPedestalWidths,
+                                                                                       HcalQIEData,
+                                                                                       HcalQIETypes>;
+
+using HcalConvertedEffectivePedestalWidthsGPUESProducer =
+    ConvertingESProducerWithDependenciesT<HcalConvertedPedestalWidthsRcd,
+                                          HcalConvertedEffectivePedestalWidthsGPU,
+                                          HcalPedestals,
+                                          HcalPedestalWidths,
+                                          HcalQIEData,
+                                          HcalQIETypes>;
+
+DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsWithPulseShapesGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalPedestalsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalGainsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalLUTCorrsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalRespCorrsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalTimeCorrsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalPedestalWidthsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalGainWidthsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalQIECodersGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalQIETypesGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMParametersGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMCharacteristicsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedEffectivePedestalsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalWidthsGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedEffectivePedestalWidthsGPUESProducer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
new file mode 100644
index 0000000000000..0862e0a861d5d
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc
@@ -0,0 +1,58 @@
+#include <array>
+#include <iostream>
+#include <tuple>
+#include <utility>
+
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/ESProductHost.h"
+#include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/EventSetupRecordIntervalFinder.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/Framework/interface/SourceFactory.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h"
+
+class HcalMahiPulseOffsetsGPUESProducer : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder {
+public:
+  HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const&);
+  ~HcalMahiPulseOffsetsGPUESProducer() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+  std::unique_ptr<HcalMahiPulseOffsetsGPU> produce(JobConfigurationGPURecord const&);
+
+protected:
+  void setIntervalFor(const edm::eventsetup::EventSetupRecordKey&,
+                      const edm::IOVSyncValue&,
+                      edm::ValidityInterval&) override;
+
+private:
+  edm::ParameterSet const& pset_;
+};
+
+HcalMahiPulseOffsetsGPUESProducer::HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const& pset) : pset_{pset} {
+  setWhatProduced(this);
+  findingRecord<JobConfigurationGPURecord>();
+}
+
+void HcalMahiPulseOffsetsGPUESProducer::setIntervalFor(const edm::eventsetup::EventSetupRecordKey& iKey,
+                                                       const edm::IOVSyncValue& iTime,
+                                                       edm::ValidityInterval& oInterval) {
+  oInterval = edm::ValidityInterval(edm::IOVSyncValue::beginOfTime(), edm::IOVSyncValue::endOfTime());
+}
+
+void HcalMahiPulseOffsetsGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& desc) {
+  edm::ParameterSetDescription d;
+  d.add<std::vector<int>>("pulseOffsets", {-3, -2, -1, 0, 1, 2, 3, 4});
+  desc.addWithDefaultLabel(d);
+}
+
+std::unique_ptr<HcalMahiPulseOffsetsGPU> HcalMahiPulseOffsetsGPUESProducer::produce(JobConfigurationGPURecord const&) {
+  return std::make_unique<HcalMahiPulseOffsetsGPU>(pset_);
+}
+
+DEFINE_FWK_EVENTSETUP_SOURCE(HcalMahiPulseOffsetsGPUESProducer);
diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
new file mode 100644
index 0000000000000..ade221b2c4870
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h
@@ -0,0 +1,220 @@
+#ifndef RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
+#define RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
+
+#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
+
+#include "DeclsForKernels.h"
+
+namespace hcal {
+  namespace reconstruction {
+
+    // this is from HcalTimeSlew.
+    // HcalTimeSlew are values that come in from ESProducer that takes them
+    // from a python config. see DeclsForKernels for more explanation
+    __forceinline__ __device__ float compute_time_slew_delay(float const fC,
+                                                             float const tzero,
+                                                             float const slope,
+                                                             float const tmax) {
+      auto const rawDelay = tzero + slope * std::log(fC);
+      return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);
+    }
+
+    // HcalQIEShapes are hardcoded in HcalQIEData.cc basically
+    // + some logic to generate 128 and 256 value arrays...
+    __constant__ float const qie8shape[129] = {
+        -1,   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   16,
+        18,   20,   22,   24,   26,   28,   31,   34,   37,   40,   44,   48,   52,   57,   62,   57,   62,
+        67,   72,   77,   82,   87,   92,   97,   102,  107,  112,  117,  122,  127,  132,  142,  152,  162,
+        172,  182,  192,  202,  217,  232,  247,  262,  282,  302,  322,  347,  372,  347,  372,  397,  422,
+        447,  472,  497,  522,  547,  572,  597,  622,  647,  672,  697,  722,  772,  822,  872,  922,  972,
+        1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
+        2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
+        5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
+
+    __constant__ float const qie11shape[257] = {
+        -0.5,    0.5,     1.5,     2.5,     3.5,     4.5,     5.5,     6.5,     7.5,     8.5,     9.5,     10.5,
+        11.5,    12.5,    13.5,    14.5,    15.5,    17.5,    19.5,    21.5,    23.5,    25.5,    27.5,    29.5,
+        31.5,    33.5,    35.5,    37.5,    39.5,    41.5,    43.5,    45.5,    47.5,    49.5,    51.5,    53.5,
+        55.5,    59.5,    63.5,    67.5,    71.5,    75.5,    79.5,    83.5,    87.5,    91.5,    95.5,    99.5,
+        103.5,   107.5,   111.5,   115.5,   119.5,   123.5,   127.5,   131.5,   135.5,   139.5,   147.5,   155.5,
+        163.5,   171.5,   179.5,   187.5,   171.5,   179.5,   187.5,   195.5,   203.5,   211.5,   219.5,   227.5,
+        235.5,   243.5,   251.5,   259.5,   267.5,   275.5,   283.5,   291.5,   299.5,   315.5,   331.5,   347.5,
+        363.5,   379.5,   395.5,   411.5,   427.5,   443.5,   459.5,   475.5,   491.5,   507.5,   523.5,   539.5,
+        555.5,   571.5,   587.5,   603.5,   619.5,   651.5,   683.5,   715.5,   747.5,   779.5,   811.5,   843.5,
+        875.5,   907.5,   939.5,   971.5,   1003.5,  1035.5,  1067.5,  1099.5,  1131.5,  1163.5,  1195.5,  1227.5,
+        1259.5,  1291.5,  1355.5,  1419.5,  1483.5,  1547.5,  1611.5,  1675.5,  1547.5,  1611.5,  1675.5,  1739.5,
+        1803.5,  1867.5,  1931.5,  1995.5,  2059.5,  2123.5,  2187.5,  2251.5,  2315.5,  2379.5,  2443.5,  2507.5,
+        2571.5,  2699.5,  2827.5,  2955.5,  3083.5,  3211.5,  3339.5,  3467.5,  3595.5,  3723.5,  3851.5,  3979.5,
+        4107.5,  4235.5,  4363.5,  4491.5,  4619.5,  4747.5,  4875.5,  5003.5,  5131.5,  5387.5,  5643.5,  5899.5,
+        6155.5,  6411.5,  6667.5,  6923.5,  7179.5,  7435.5,  7691.5,  7947.5,  8203.5,  8459.5,  8715.5,  8971.5,
+        9227.5,  9483.5,  9739.5,  9995.5,  10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
+        12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
+        18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
+        28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
+        41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
+        65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
+        96523.5, 100620,  104716,  108812,  112908};
+
+    // Conditions are transferred once per IOV
+    // Access is performed based on the det id which is converted to a linear index
+    // 2 funcs below are taken from HcalTopology (reimplemented here).
+    // Inputs are constants that are also taken from HcalTopology
+    // but passed to the kernel as arguments using the HclaTopology itself
+    constexpr int32_t IPHI_MAX = 72;
+
+    __forceinline__ __device__ uint32_t did2linearIndexHB(
+        uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
+      HcalDetId did{didraw};
+      uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
+      return did.ieta() > 0 ? value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() - firstHBRing)
+                            : value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
+    }
+
+    __forceinline__ __device__ uint32_t did2linearIndexHE(uint32_t const didraw,
+                                                          int const maxDepthHE,
+                                                          int const maxPhiHE,
+                                                          int const firstHERing,
+                                                          int const lastHERing,
+                                                          int const nEtaHE) {
+      HcalDetId did{didraw};
+      uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);
+      return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)
+                            : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);
+    }
+
+    __forceinline__ __device__ uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
+      return capid * 4 + range;
+    }
+
+    __forceinline__ __device__ float compute_reco_correction_factor(float const par1,
+                                                                    float const par2,
+                                                                    float const par3,
+                                                                    float const x) {
+      return par3 * x * x + par2 * x + par1;
+    }
+
+    // compute the charge using the adc, qie type and the appropriate qie shape array
+    __forceinline__ __device__ float compute_coder_charge(
+        int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
+      auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;
+      auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;
+      auto const nbins = qieType == 0 ? 32 : 64;
+      auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])
+                                                   : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);
+      auto const index = get_qiecoder_index(capid, range);
+      return (center - qieOffsets[index]) / qieSlopes[index];
+    }
+
+    // this is from
+    //  https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc#L140
+
+    __forceinline__ __device__ float compute_diff_charge_gain(int const qieType,
+                                                              uint8_t adc,
+                                                              uint8_t const capid,
+                                                              float const* qieOffsets,
+                                                              float const* qieSlopes,
+                                                              bool const isqie11) {
+      constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
+      constexpr uint32_t mantissaMaskQIE11 = 0x3f;
+      auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;
+      auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
+      auto const mantissa = adc & mantissaMask;
+
+      if (mantissa == 0u || mantissa == mantissaMask - 1u)
+        return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
+      else if (mantissa == 1u || mantissa == mantissaMask)
+        return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
+      else {
+        auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
+        auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
+        auto const upgain = qup - q;
+        auto const downgain = q - qdown;
+        auto const averagegain = (qup - qdown) / 2.f;
+        if (std::abs(upgain - downgain) < 0.01f * averagegain)
+          return averagegain;
+        else {
+          auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
+          auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
+          auto const upgain2 = q2up - qup;
+          auto const downgain2 = qdown - q2down;
+          if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))
+            return upgain;
+          else
+            return downgain;
+        }
+      }
+    }
+
+    // TODO: remove what's not needed
+    // originally from from RecoLocalCalo/HcalRecAlgos/src/PulseShapeFunctor.cc
+    __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time,
+                                                               int const sample,
+                                                               int const shift,
+                                                               float const* acc25nsVec,
+                                                               float const* diff25nsItvlVec,
+                                                               float const* accVarLenIdxMinusOneVec,
+                                                               float const* diffVarItvlIdxMinusOneVec,
+                                                               float const* accVarLenIdxZeroVec,
+                                                               float const* diffVarItvlIdxZeroVec) {
+      // constants
+      constexpr float slew = 0.f;
+      constexpr auto ns_per_bx = hcal::constants::nsPerBX;
+
+      // FIXME: clean up all the rounding... this is coming from original cpu version
+      float const i_start_float = -hcal::constants::iniTimeShift - pulse_time - slew > 0.f
+                                      ? 0.f
+                                      : std::abs(-hcal::constants::iniTimeShift - pulse_time - slew) + 1.f;
+      int i_start = static_cast<int>(i_start_float);
+      float offset_start = static_cast<float>(i_start) - hcal::constants::iniTimeShift - pulse_time - slew;
+      // FIXME: do we need a check for nan???
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0)
+        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
+#endif
+
+      // boundary
+      if (offset_start == 1.0f) {
+        offset_start = 0.f;
+        i_start -= 1;
+      }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0)
+        printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start);
+#endif
+
+      int const bin_start = static_cast<int>(offset_start);
+      auto const bin_start_up = static_cast<float>(bin_start) + 0.5f;
+      int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
+      int const its_start = i_start / ns_per_bx;
+      int const distTo25ns_start = hcal::constants::nsPerBX - 1 - i_start % ns_per_bx;
+      auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (shift == 0) {
+        printf("bin_start = %d bin_0_start = %d its_start = %d distTo25ns_start = %d factor = %f\n",
+               bin_start,
+               bin_0_start,
+               its_start,
+               distTo25ns_start,
+               factor);
+      }
+#endif
+
+      auto const sample_over10ts = sample + shift;
+      float value = 0.0f;
+      if (sample_over10ts == its_start) {
+        value = bin_0_start == -1
+                    ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
+                    : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
+      } else if (sample_over10ts > its_start) {
+        int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
+        value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
+      }
+      return value;
+    }
+
+  }  // namespace reconstruction
+}  // namespace hcal
+
+#endif  // RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h
diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
new file mode 100644
index 0000000000000..a1fc79b41eca6
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu
@@ -0,0 +1,1223 @@
+#include <Eigen/Dense>
+
+#include "DataFormats/CaloRecHit/interface/MultifitComputations.h"
+
+// needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
+#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h"
+// TODO reuse some of the HCAL constats from
+//#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
+// ?
+
+#include "SimpleAlgoGPU.h"
+#include "KernelHelpers.h"
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#define DETID_TO_DEBUG 1125647428
+#endif
+
+namespace hcal {
+  namespace mahi {
+
+    // TODO: provide constants from configuration
+    // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py
+    constexpr int nMaxItersMin = 50;
+    constexpr int nMaxItersNNLS = 500;
+    constexpr double nnlsThresh = 1e-11;
+    constexpr float deltaChi2Threashold = 1e-3;
+
+    // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc
+    __forceinline__ __device__ float get_raw_charge(double const charge,
+                                                    double const pedestal,
+                                                    float const* shrChargeMinusPedestal,
+                                                    float const* parLin1Values,
+                                                    float const* parLin2Values,
+                                                    float const* parLin3Values,
+                                                    int32_t const nsamplesForCompute,
+                                                    int32_t const soi,
+                                                    int const sipmQTSShift,
+                                                    int const sipmQNTStoSum,
+                                                    int const sipmType,
+                                                    float const fcByPE,
+                                                    bool const isqie11) {
+      float rawCharge;
+
+      if (!isqie11)
+        rawCharge = charge;
+      else {
+        auto const parLin1 = parLin1Values[sipmType - 1];
+        auto const parLin2 = parLin2Values[sipmType - 1];
+        auto const parLin3 = parLin3Values[sipmType - 1];
+
+        int const first = std::max(soi + sipmQTSShift, 0);
+        int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
+        float sipmq = 0.0f;
+        for (auto ts = first; ts < last; ts++)
+          sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts];
+        auto const effectivePixelsFired = sipmq / fcByPE;
+        auto const factor =
+            hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
+        rawCharge = (charge - pedestal) * factor + pedestal;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
+#endif
+      }
+      return rawCharge;
+    }
+
+    // Assume: same number of samples for HB and HE
+    // TODO: add/validate restrict (will increase #registers in use by the kernel)
+    __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes,
+                                                      float* noiseTerms,
+                                                      float* outputEnergy,
+                                                      float* outputChi2,
+                                                      uint16_t const* dataf01HE,
+                                                      uint16_t const* dataf5HB,
+                                                      uint16_t const* dataf3HB,
+                                                      uint32_t const* idsf01HE,
+                                                      uint32_t const* idsf5HB,
+                                                      uint32_t const* idsf3HB,
+                                                      uint32_t const stridef01HE,
+                                                      uint32_t const stridef5HB,
+                                                      uint32_t const stridef3HB,
+                                                      uint32_t const nchannelsf01HE,
+                                                      uint32_t const nchannelsf5HB,
+                                                      uint8_t const* npresamplesf5HB,
+                                                      int8_t* soiSamples,
+                                                      float* method0Energy,
+                                                      float* method0Time,
+                                                      uint32_t* outputdid,
+                                                      uint32_t const nchannels,
+                                                      uint32_t const* recoParam1Values,
+                                                      uint32_t const* recoParam2Values,
+                                                      float const* qieCoderOffsets,
+                                                      float const* qieCoderSlopes,
+                                                      int const* qieTypes,
+                                                      float const* pedestalWidths,
+                                                      float const* effectivePedestalWidths,
+                                                      float const* pedestals,
+                                                      float const* effectivePedestals,
+                                                      bool const useEffectivePedestals,
+                                                      int const* sipmTypeValues,
+                                                      float const* fcByPEValues,
+                                                      float const* parLin1Values,
+                                                      float const* parLin2Values,
+                                                      float const* parLin3Values,
+                                                      float const* gainValues,
+                                                      float const* respCorrectionValues,
+                                                      int const maxDepthHB,
+                                                      int const maxDepthHE,
+                                                      int const maxPhiHE,
+                                                      int const firstHBRing,
+                                                      int const lastHBRing,
+                                                      int const firstHERing,
+                                                      int const lastHERing,
+                                                      int const nEtaHB,
+                                                      int const nEtaHE,
+                                                      int const sipmQTSShift,
+                                                      int const sipmQNTStoSum,
+                                                      int const firstSampleShift,
+                                                      uint32_t const offsetForHashes,
+                                                      float const ts4Thresh,
+                                                      int const startingSample) {
+      // indices + runtime constants
+      auto const sample = threadIdx.x + startingSample;
+      auto const sampleWithinWindow = threadIdx.x;
+      int32_t const nsamplesForCompute = blockDim.x;
+      auto const lch = threadIdx.y;
+      auto const gch = lch + blockDim.y * blockIdx.x;
+      auto const nchannels_per_block = blockDim.y;
+      auto const linearThPerBlock = threadIdx.x + threadIdx.y * blockDim.x;
+
+      // remove
+      if (gch >= nchannels)
+        return;
+
+      // initialize all output buffers
+      if (sampleWithinWindow == 0) {
+        outputdid[gch] = 0;
+        method0Energy[gch] = 0;
+        method0Time[gch] = 0;
+        outputEnergy[gch] = 0;
+        outputChi2[gch] = 0;
+      }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_SINGLECHANNEL
+      if (gch > 0)
+        return;
+#endif
+#endif
+
+      // configure shared mem
+      extern __shared__ char smem[];
+      float* shrEnergyM0PerTS = reinterpret_cast<float*>(smem);
+      float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
+      float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
+      float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
+      unsigned long long int* shrMethod0EnergySamplePair =
+          reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
+      if (sampleWithinWindow == 0) {
+        shrMethod0EnergyAccum[lch] = 0;
+        shrMethod0EnergySamplePair[lch] = __float_as_uint(std::numeric_limits<float>::min());
+        shrEnergyM0TotalAccum[lch] = 0;
+      }
+
+      // offset output
+      auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
+      auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
+      auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
+
+      // get event input quantities
+      auto const stride = gch < nchannelsf01HE ? stridef01HE : (gch < nchannelsf015 ? stridef5HB : stridef3HB);
+      auto const nsamples = gch < nchannelsf01HE ? compute_nsamples<Flavor1>(stride)
+                                                 : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride)
+                                                                        : compute_nsamples<Flavor3>(stride));
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      assert(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
+#endif
+
+      auto const id = gch < nchannelsf01HE
+                          ? idsf01HE[gch]
+                          : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
+      auto const did = HcalDetId{id};
+      auto const adc =
+          gch < nchannelsf01HE
+              ? adc_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
+              : (gch < nchannelsf015 ? adc_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
+                                     : adc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
+      auto const capid =
+          gch < nchannelsf01HE
+              ? capid_for_sample<Flavor1>(dataf01HE + stride * gch, sample)
+              : (gch < nchannelsf015 ? capid_for_sample<Flavor5>(dataf5HB + stride * (gch - nchannelsf01HE), sample)
+                                     : capid_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
+      if (id != DETID_TO_DEBUG)
+        return;
+#endif
+#endif
+
+      // compute hash for this did
+      auto const hashedId =
+          did.subdetId() == HcalBarrel
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
+                    offsetForHashes;
+
+      // conditions based on the hash
+      // FIXME: remove hardcoded values
+      auto const qieType = qieTypes[hashedId] > 0 ? 1 : 0;  // 2 types at this point
+      auto const* qieOffsets = qieCoderOffsets + hashedId * HcalQIECodersGPU::numValuesPerChannel;
+      auto const* qieSlopes = qieCoderSlopes + hashedId * HcalQIECodersGPU::numValuesPerChannel;
+      auto const* pedestalsForChannel = pedestals + hashedId * 4;
+      auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
+                                                 ? effectivePedestalWidths + hashedId * 4
+                                                 : pedestalWidths + hashedId * 4;
+      auto const* gains = gainValues + hashedId * 4;
+      auto const gain = gains[capid];
+      auto const gain0 = gains[0];
+      auto const respCorrection = respCorrectionValues[hashedId];
+      auto const pedestal = pedestalsForChannel[capid];
+      auto const pedestalWidth = pedestalWidthsForChannel[capid];
+      // if needed, only use effective pedestals for f01
+      auto const pedestalToUseForMethod0 = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
+                                               ? effectivePedestals[hashedId * 4 + capid]
+                                               : pedestal;
+      auto const sipmType = sipmTypeValues[hashedId];
+      auto const fcByPE = fcByPEValues[hashedId];
+      auto const recoParam1 = recoParam1Values[hashedId];
+      auto const recoParam2 = recoParam2Values[hashedId];
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
+             qieType,
+             qieOffsets[0],
+             qieOffsets[1],
+             qieSlopes[0],
+             qieSlopes[1]);
+#endif
+
+      // compute charge
+      auto const charge = hcal::reconstruction::compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
+
+      shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
+      if (gch < nchannelsf01HE) {
+        // NOTE: assume that soi is high only for a single guy!
+        //   which must be the case. cpu version does not check for that
+        //   if that is not the case, we will see that with cuda mmecheck
+        auto const soibit = soibit_for_sample<Flavor1>(dataf01HE + stride * gch, sample);
+        if (soibit == 1)
+          soiSamples[gch] = sampleWithinWindow;
+      } else if (gch >= nchannelsf015) {
+        auto const soibit = soibit_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample);
+        if (soibit == 1)
+          soiSamples[gch] = sampleWithinWindow;
+      }
+      __syncthreads();
+      int32_t const soi = gch < nchannelsf01HE
+                              ? soiSamples[gch]
+                              : (gch < nchannelsf015 ? npresamplesf5HB[gch - nchannelsf01HE] : soiSamples[gch]);
+      //int32_t const soi = gch >= nchannelsf01HE
+      //    ? npresamplesf5HB[gch - nchannelsf01HE]
+      //    : soiSamples[gch];
+      // this is here just to make things uniform...
+      if (gch >= nchannelsf01HE && gch < nchannelsf015 && sampleWithinWindow == 0)
+        soiSamples[gch] = npresamplesf5HB[gch - nchannelsf01HE];
+
+      //
+      // compute various quantities (raw charge and tdc stuff)
+      // NOTE: this branch will be divergent only for a single warp that
+      // sits on the boundary when flavor 01 channels end and flavor 5 start
+      //
+      float const rawCharge = get_raw_charge(charge,
+                                             pedestal,
+                                             shrChargeMinusPedestal,
+                                             parLin1Values,
+                                             parLin2Values,
+                                             parLin3Values,
+                                             nsamplesForCompute,
+                                             soi,
+                                             sipmQTSShift,
+                                             sipmQNTStoSum,
+                                             sipmType,
+                                             fcByPE,
+                                             gch < nchannelsf01HE || gch >= nchannelsf015);
+
+      auto const dfc = hcal::reconstruction::compute_diff_charge_gain(
+          qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015);
+
+#ifdef COMPUTE_TDC_TIME
+      float tdcTime;
+      if (gch >= nchannelsf01HE && gch < nchannelsf015) {
+        tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC;
+      } else {
+        if (gch < nchannelsf01HE)
+          tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(dataf01HE + stride * gch, sample));
+        else if (gch >= nchannelsf015)
+          tdcTime =
+              HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor3>(dataf3HB + stride * (gch - nchannelsf015), sample));
+      }
+#endif  // COMPUTE_TDC_TIME
+
+      // compute method 0 quantities
+      // TODO: need to apply containment
+      // TODO: need to apply time slew
+      // TODO: for < run 3, apply HBM legacy energy correction
+      auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
+      auto const startSampleTmp = soi + firstSampleShift;
+      auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
+      auto const endSample =
+          startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
+      // NOTE: gain is a small number < 10^-3, multiply it last
+      auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
+      auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
+      // store to shared mem
+      shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
+      atomicAdd(&shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0);
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      printf(
+          "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
+          "   charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
+          "   gain = %f respCorrection = %f energym0_per_ts = %f\n",
+          id,
+          sample,
+          gch,
+          hashedId,
+          adc,
+          capid,
+          charge,
+          rawCharge,
+          dfc,
+          pedestalToUseForMethod0,
+          gain,
+          respCorrection,
+          energym0_per_ts);
+      printf(
+          "startSample = %d endSample = %d param1 = %u param2 = %u\n", startSample, endSample, recoParam1, recoParam2);
+#endif
+
+      if (sampleWithinWindow >= startSample && sampleWithinWindow < endSample) {
+        atomicAdd(&shrMethod0EnergyAccum[lch], energym0_per_ts);
+        // pack sample, energy as 64 bit value
+        unsigned long long int old = shrMethod0EnergySamplePair[lch], assumed;
+        unsigned long long int val =
+            (static_cast<unsigned long long int>(sampleWithinWindow) << 32) + __float_as_uint(energym0_per_ts);
+        do {
+          assumed = old;
+          // decode energy, sample values
+          //int const current_sample = (assumed >> 32) & 0xffffffff;
+          float const current_energy = __uint_as_float(assumed & 0xffffffff);
+          if (energym0_per_ts > current_energy)
+            old = atomicCAS(&shrMethod0EnergySamplePair[lch], assumed, val);
+          else
+            break;
+        } while (assumed != old);
+      }
+      __syncthreads();
+
+      // NOTE: must take soi, as values for that thread are used...
+      if (sampleWithinWindow == soi) {
+        auto const method0_energy = shrMethod0EnergyAccum[lch];
+        auto const val = shrMethod0EnergySamplePair[lch];
+        int const max_sample = (val >> 32) & 0xffffffff;
+        float const max_energy = __uint_as_float(val & 0xffffffff);
+        float const max_energy_1 =
+            max_sample < nsamplesForCompute - 1 ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1] : 0.f;
+        float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
+        auto const sum = max_energy + max_energy_1;
+        // FIXME: for full comparison with cpu method 0  timing,
+        // need to correct by slew
+        // requires an accumulator -> more shared mem -> omit here unless
+        // really needed
+        float const time =
+            max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
+
+        // store method0 quantities to global mem
+        outputdid[gch] = id;
+        method0Energy[gch] = method0_energy;
+        method0Time[gch] = time;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n", shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, ts4Thresh);
+#endif
+
+        // check as in cpu version if mahi is not needed
+        // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
+        // are basically equal and generate -0.00000...
+        // needs to be treated properly
+        if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 > ts4Thresh)) {
+          // do not need to run mahi minimization
+          //outputEnergy[gch] = 0; energy already inited to 0
+          outputChi2[gch] = -9999.f;
+        }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
+               method0_energy,
+               max_sample,
+               max_energy,
+               time);
+#endif
+      }
+
+      //
+      // preparations for mahi fit
+      //
+      auto const amplitude = rawCharge - pedestalToUseForMethod0;
+      auto const noiseADC = (1. / std::sqrt(12)) * dfc;
+      auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
+      auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      printf(
+          "charrge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f noisPhoto(%d) = "
+          "%f\n",
+          sample,
+          rawCharge,
+          sample,
+          pedestalToUseForMethod0,
+          sample,
+          dfc,
+          sample,
+          pedestalWidth,
+          sample,
+          noiseADC,
+          sample,
+          noisePhotoSq);
+#endif
+
+      // store to global memory
+      amplitudesForChannel[sampleWithinWindow] = amplitude;
+      noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
+    }
+
+    // TODO: need to add an array of offsets for pulses (a la activeBXs...)
+    // Assume for now 8 pulses
+    __global__ void kernel_prep_pulseMatrices_sameNumberOfSamples(float* pulseMatrices,
+                                                                  float* pulseMatricesM,
+                                                                  float* pulseMatricesP,
+                                                                  int const* pulseOffsets,
+                                                                  float const* amplitudes,
+                                                                  uint32_t const* idsf01HE,
+                                                                  uint32_t const* idsf5HB,
+                                                                  uint32_t const* idsf3HB,
+                                                                  uint32_t const nchannelsf01HE,
+                                                                  uint32_t const nchannelsf5HB,
+                                                                  uint32_t const nchannelsTotal,
+                                                                  int8_t const* soiSamples,
+                                                                  uint32_t const* recoPulseShapeIds,
+                                                                  float const* acc25nsVecValues,
+                                                                  float const* diff25nsItvlVecValues,
+                                                                  float const* accVarLenIdxMinusOneVecValues,
+                                                                  float const* diffVarItvlIdxMinusOneVecValues,
+                                                                  float const* accVarLenIdxZeroVecValues,
+                                                                  float const* diffVarItvlIdxZeroVecValues,
+                                                                  float const meanTime,
+                                                                  float const timeSigmaSiPM,
+                                                                  float const timeSigmaHPD,
+                                                                  int const maxDepthHB,
+                                                                  int const maxDepthHE,
+                                                                  int const maxPhiHE,
+                                                                  int const firstHBRing,
+                                                                  int const lastHBRing,
+                                                                  int const firstHERing,
+                                                                  int const lastHERing,
+                                                                  int const nEtaHB,
+                                                                  int const nEtaHE,
+                                                                  uint32_t const offsetForHashes,
+                                                                  bool const applyTimeSlew,
+                                                                  float const tzeroTimeSlew,
+                                                                  float const slopeTimeSlew,
+                                                                  float const tmaxTimeSlew) {
+      // indices
+      auto const ipulse = threadIdx.y;
+      auto const npulses = blockDim.y;
+      auto const sample = threadIdx.x;
+      auto const nsamples = blockDim.x;
+      auto const lch = threadIdx.z;
+      auto const gch = lch + blockIdx.x * blockDim.z;
+      auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
+
+      if (gch >= nchannelsTotal)
+        return;
+
+      // conditions
+      auto const id = gch < nchannelsf01HE
+                          ? idsf01HE[gch]
+                          : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
+      //auto const id = gch >= nchannelsf01HE
+      //    ? idsf5HB[gch - nchannelsf01HE]
+      //    : idsf01HE[gch];
+      auto const deltaT = gch >= nchannelsf01HE && gch < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
+      auto const did = DetId{id};
+      auto const hashedId =
+          did.subdetId() == HcalBarrel
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
+                    offsetForHashes;
+      auto const recoPulseShapeId = recoPulseShapeIds[hashedId];
+      auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
+      auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin;
+      auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
+      auto const* diffVarItvlIdxMinusOneVec =
+          diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
+      auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
+      auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX;
+
+      // offset output arrays
+      auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch;
+      auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * gch;
+      auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * gch;
+
+      // amplitude per ipulse
+      int const soi = soiSamples[gch];
+      int const pulseOffset = pulseOffsets[ipulse];
+      auto const amplitude = amplitudes[gch * nsamples + pulseOffset + soi];
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
+      if (id != DETID_TO_DEBUG)
+        return;
+#endif
+#endif
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (sample == 0 && ipulse == 0) {
+        for (int i = 0; i < 8; i++)
+          printf("amplitude(%d) = %f\n", i, amplitudes[gch * nsamples + i]);
+        printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
+        for (int i = 0; i < 256; i++) {
+          printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
+        }
+        printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
+        for (int i = 0; i < 25; i++) {
+          printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
+                 i,
+                 accVarLenIdxZeroVec[i],
+                 i,
+                 accVarLenIdxMinusOneVec[i]);
+        }
+        printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
+        for (int i = 0; i < 25; i++) {
+          printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
+                 i,
+                 diffVarItvlIdxZeroVec[i],
+                 i,
+                 diffVarItvlIdxMinusOneVec[i]);
+        }
+      }
+#endif
+
+      auto t0 = meanTime;
+      if (applyTimeSlew) {
+        if (amplitude <= 1.0f)
+          t0 += hcal::reconstruction::compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
+        else
+          t0 += hcal::reconstruction::compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
+      }
+      auto const t0m = -deltaT + t0;
+      auto const t0p = deltaT + t0;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      if (sample == 0 && ipulse == 0) {
+        printf("time values: %f %f %f\n", t0, t0m, t0p);
+      }
+
+      if (sample == 0 && ipulse == 0) {
+        for (int i = 0; i < hcal::constants::maxSamples; i++) {
+          auto const value = hcal::reconstruction::compute_pulse_shape_value(t0,
+                                                                             i,
+                                                                             0,
+                                                                             acc25nsVec,
+                                                                             diff25nsItvlVec,
+                                                                             accVarLenIdxMinusOneVec,
+                                                                             diffVarItvlIdxMinusOneVec,
+                                                                             accVarLenIdxZeroVec,
+                                                                             diffVarItvlIdxZeroVec);
+          printf("pulse(%d) = %f\n", i, value);
+        }
+        printf("\n");
+        for (int i = 0; i < hcal::constants::maxSamples; i++) {
+          auto const value = hcal::reconstruction::compute_pulse_shape_value(t0p,
+                                                                             i,
+                                                                             0,
+                                                                             acc25nsVec,
+                                                                             diff25nsItvlVec,
+                                                                             accVarLenIdxMinusOneVec,
+                                                                             diffVarItvlIdxMinusOneVec,
+                                                                             accVarLenIdxZeroVec,
+                                                                             diffVarItvlIdxZeroVec);
+          printf("pulseP(%d) = %f\n", i, value);
+        }
+        printf("\n");
+        for (int i = 0; i < hcal::constants::maxSamples; i++) {
+          auto const value = hcal::reconstruction::compute_pulse_shape_value(t0m,
+                                                                             i,
+                                                                             0,
+                                                                             acc25nsVec,
+                                                                             diff25nsItvlVec,
+                                                                             accVarLenIdxMinusOneVec,
+                                                                             diffVarItvlIdxMinusOneVec,
+                                                                             accVarLenIdxZeroVec,
+                                                                             diffVarItvlIdxZeroVec);
+          printf("pulseM(%d) = %f\n", i, value);
+        }
+      }
+#endif
+
+      // FIXME: shift should be treated properly,
+      // here assume 8 time slices and 8 samples
+      auto const shift = 4 - soi;  // as in cpu version!
+
+      // auto const offset = ipulse - soi;
+      // auto const idx = sample - offset;
+      int32_t const idx = sample - pulseOffset;
+      auto const value = idx >= 0 && idx < nsamples
+                             ? hcal::reconstruction::compute_pulse_shape_value(t0,
+                                                                               idx,
+                                                                               shift,
+                                                                               acc25nsVec,
+                                                                               diff25nsItvlVec,
+                                                                               accVarLenIdxMinusOneVec,
+                                                                               diffVarItvlIdxMinusOneVec,
+                                                                               accVarLenIdxZeroVec,
+                                                                               diffVarItvlIdxZeroVec)
+                             : 0;
+      auto const value_t0m = idx >= 0 && idx < nsamples
+                                 ? hcal::reconstruction::compute_pulse_shape_value(t0m,
+                                                                                   idx,
+                                                                                   shift,
+                                                                                   acc25nsVec,
+                                                                                   diff25nsItvlVec,
+                                                                                   accVarLenIdxMinusOneVec,
+                                                                                   diffVarItvlIdxMinusOneVec,
+                                                                                   accVarLenIdxZeroVec,
+                                                                                   diffVarItvlIdxZeroVec)
+                                 : 0;
+      auto const value_t0p = idx >= 0 && idx < nsamples
+                                 ? hcal::reconstruction::compute_pulse_shape_value(t0p,
+                                                                                   idx,
+                                                                                   shift,
+                                                                                   acc25nsVec,
+                                                                                   diff25nsItvlVec,
+                                                                                   accVarLenIdxMinusOneVec,
+                                                                                   diffVarItvlIdxMinusOneVec,
+                                                                                   accVarLenIdxZeroVec,
+                                                                                   diffVarItvlIdxZeroVec)
+                                 : 0;
+
+      // store to global
+      if (amplitude > 0.f) {
+        pulseMatrix[ipulse * nsamples + sample] = value;
+        pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
+        pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
+      } else {
+        pulseMatrix[ipulse * nsamples + sample] = 0.f;
+        pulseMatrixM[ipulse * nsamples + sample] = 0.f;
+        pulseMatrixP[ipulse * nsamples + sample] = 0.f;
+      }
+    }
+
+    template <int NSAMPLES, int NPULSES>
+    __forceinline__ __device__ void update_covariance(
+        calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
+        calo::multifit::MapSymM<float, NSAMPLES>& covarianceMatrix,
+        Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
+        Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
+        Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
+#pragma unroll
+      for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
+        auto const resultAmplitude = resultAmplitudesVector(ipulse);
+        if (resultAmplitude == 0)
+          continue;
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("pulse cov array for ibx = %d\n", ipulse);
+#endif
+
+        // preload a column
+        float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
+#pragma unroll
+        for (int counter = 0; counter < NSAMPLES; counter++) {
+          pmcol[counter] = __ldg(&pulseMatrix.coeffRef(counter, ipulse));
+          pmpcol[counter] = __ldg(&pulseMatrixP.coeffRef(counter, ipulse));
+          pmmcol[counter] = __ldg(&pulseMatrixM.coeffRef(counter, ipulse));
+        }
+
+        auto const ampl2 = resultAmplitude * resultAmplitude;
+#pragma unroll
+        for (int col = 0; col < NSAMPLES; col++) {
+          auto const valueP_col = pmpcol[col];
+          auto const valueM_col = pmmcol[col];
+          auto const value_col = pmcol[col];
+          auto const tmppcol = valueP_col - value_col;
+          auto const tmpmcol = valueM_col - value_col;
+
+          // diagonal
+          auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
+          covarianceMatrix(col, col) += ampl2 * tmp_value;
+
+// FIXME: understand if this actually gets unrolled
+#pragma unroll
+          for (int row = col + 1; row < NSAMPLES; row++) {
+            float const valueP_row = pmpcol[row];  //pulseMatrixP(j, ipulseReal);
+            float const value_row = pmcol[row];    //pulseMatrix(j, ipulseReal);
+            float const valueM_row = pmmcol[row];  //pulseMatrixM(j, ipulseReal);
+
+            float tmpprow = valueP_row - value_row;
+            float tmpmrow = valueM_row - value_row;
+
+            auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
+
+            covarianceMatrix(row, col) += ampl2 * covValue;
+          }
+        }
+      }
+    }
+
+    template <int NSAMPLES, int NPULSES>
+    __global__ void kernel_minimize(float* outputEnergy,
+                                    float* outputChi2,
+                                    float const* __restrict__ inputAmplitudes,
+                                    float const* __restrict__ pulseMatrices,
+                                    float const* __restrict__ pulseMatricesM,
+                                    float const* __restrict__ pulseMatricesP,
+                                    int const* __restrict__ pulseOffsetValues,
+                                    float const* __restrict__ noiseTerms,
+                                    int8_t const* __restrict__ soiSamples,
+                                    float const* __restrict__ pedestalWidths,
+                                    float const* __restrict__ effectivePedestalWidths,
+                                    bool const useEffectivePedestals,
+                                    uint32_t const* __restrict__ idsf01HE,
+                                    uint32_t const* __restrict__ idsf5HB,
+                                    uint32_t const* __restrict__ idsf3HB,
+                                    float const* __restrict__ gainValues,
+                                    float const* __restrict__ respCorrectionValues,
+                                    uint32_t const nchannelsf01HE,
+                                    uint32_t const nchannelsf5HB,
+                                    uint32_t const nchannelsTotal,
+                                    uint32_t const offsetForHashes,
+                                    int const maxDepthHB,
+                                    int const maxDepthHE,
+                                    int const maxPhiHE,
+                                    int const firstHBRing,
+                                    int const lastHBRing,
+                                    int const firstHERing,
+                                    int const lastHERing,
+                                    int const nEtaHB,
+                                    int const nEtaHE) {
+      // can be relaxed if needed - minor updates are needed in that case!
+      static_assert(NPULSES == NSAMPLES);
+
+      // indices
+      auto const gch = threadIdx.x + blockIdx.x * blockDim.x;
+      auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB;
+      if (gch >= nchannelsTotal)
+        return;
+
+      // if chi2 is set to -9999 do not run minimization
+      if (outputChi2[gch] == -9999.f)
+        return;
+
+      // configure shared mem
+      extern __shared__ char shrmem[];
+      float* shrMatrixLFnnlsStorage =
+          reinterpret_cast<float*>(shrmem) + calo::multifit::MapSymM<float, NPULSES>::total * threadIdx.x;
+      float* shrAtAStorage = reinterpret_cast<float*>(shrmem) +
+                             calo::multifit::MapSymM<float, NPULSES>::total * (threadIdx.x + blockDim.x);
+
+      // conditions for pedestal widths
+      auto const id = gch < nchannelsf01HE
+                          ? idsf01HE[gch]
+                          : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]);
+      auto const did = DetId{id};
+      auto const hashedId =
+          did.subdetId() == HcalBarrel
+              ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB)
+              : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) +
+                    offsetForHashes;
+
+      auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015)
+                                                 ? effectivePedestalWidths + hashedId * 4
+                                                 : pedestalWidths + hashedId * 4;
+      auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
+                                                 pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
+                                                 pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
+                                                 pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
+      auto const* gains = gainValues + hashedId * 4;
+      // FIXME on cpu ts 0 capid was used - does it make any difference
+      auto const gain = gains[0];
+      auto const respCorrection = respCorrectionValues[hashedId];
+
+#ifdef HCAL_MAHI_GPUDEBUG
+#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
+      if (id != DETID_TO_DEBUG)
+        return;
+#endif
+#endif
+
+      /*
+      // TODO: provide this properly
+      int const soi = soiSamples[gch];
+      */
+      calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
+#pragma unroll
+      for (int i = 0; i < NPULSES; ++i)
+        pulseOffsets(i) = i;
+      //        pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0];
+
+      // output amplitudes/weights
+      calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector = calo::multifit::ColumnVector<NPULSES>::Zero();
+
+      // map views
+      Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES};
+      Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
+      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{pulseMatricesM +
+                                                                                              gch * NSAMPLES * NPULSES};
+      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{pulseMatricesP +
+                                                                                              gch * NSAMPLES * NPULSES};
+      Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{pulseMatrices +
+                                                                                             gch * NSAMPLES * NPULSES};
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      for (int i = 0; i < NSAMPLES; i++)
+        printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
+      for (int i = 0; i < NSAMPLES; i++) {
+        for (int j = 0; j < NPULSES; j++)
+          printf("%f ", glbPulseMatrixView(i, j));
+        printf("\n");
+      }
+      printf("\n");
+      for (int i = 0; i < NSAMPLES; i++) {
+        for (int j = 0; j < NPULSES; j++)
+          printf("%f ", glbPulseMatrixMView(i, j));
+        printf("\n");
+      }
+      printf("\n");
+      for (int i = 0; i < NSAMPLES; i++) {
+        for (int j = 0; j < NPULSES; j++)
+          printf("%f ", glbPulseMatrixPView(i, j));
+        printf("\n");
+      }
+#endif
+
+      int npassive = 0;
+      float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
+      for (int iter = 1; iter < nMaxItersMin; iter++) {
+        //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
+        // NOTE: only works when NSAMPLES == NPULSES
+        // if does not hold -> slightly rearrange shared mem to still reuse
+        // shared memory
+        float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
+        calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
+#pragma unroll
+        for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::total; counter++)
+          covarianceMatrixStorage[counter] = averagePedestalWidth2;
+#pragma unroll
+        for (int counter = 0; counter < calo::multifit::MapSymM<float, NSAMPLES>::stride; counter++)
+          covarianceMatrix(counter, counter) += __ldg(&noiseTermsView.coeffRef(counter));
+
+        // update covariance matrix
+        update_covariance(
+            resultAmplitudesVector, covarianceMatrix, glbPulseMatrixView, glbPulseMatrixMView, glbPulseMatrixPView);
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("covariance matrix\n");
+        for (int i = 0; i < 8; i++) {
+          for (int j = 0; j < 8; j++)
+            printf("%f ", covarianceMatrix(i, j));
+          printf("\n");
+        }
+#endif
+
+        // compute Cholesky Decomposition L matrix
+        //matrixDecomposition.compute(covarianceMatrix);
+        //auto const& matrixL = matrixDecomposition.matrixL();
+        float matrixLStorage[calo::multifit::MapSymM<float, NSAMPLES>::total];
+        calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
+        calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
+
+        //
+        // replace eigen
+        //
+        //auto const& A = matrixDecomposition
+        //    .matrixL()
+        //    .solve(pulseMatrixView);
+        calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
+        calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
+
+        //
+        // remove eigen
+        //
+        //auto const& b = matrixL
+        //   .solve(inputAmplitudesView);
+        //
+        float reg_b[NSAMPLES];
+        calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
+
+        // TODO: we do not really need to change these matrcies
+        // will be fixed in the optimized version
+        //ColMajorMatrix<NPULSES, NPULSES> AtA = A.transpose() * A;
+        //ColumnVector<NPULSES> Atb = A.transpose() * b;
+        //ColMajorMatrix<NPULSES, NPULSES> AtA;
+        //float AtAStorage[MapSymM<float, NPULSES>::total];
+        calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
+        calo::multifit::ColumnVector<NPULSES> Atb;
+#pragma unroll
+        for (int icol = 0; icol < NPULSES; icol++) {
+          float reg_ai[NSAMPLES];
+
+// load column icol
+#pragma unroll
+          for (int counter = 0; counter < NSAMPLES; counter++)
+            reg_ai[counter] = A(counter, icol);
+
+          // compute diagonal
+          float sum = 0.f;
+#pragma unroll
+          for (int counter = 0; counter < NSAMPLES; counter++)
+            sum += reg_ai[counter] * reg_ai[counter];
+
+          // store
+          AtA(icol, icol) = sum;
+
+// go thru the other columns
+#pragma unroll
+          for (int j = icol + 1; j < NPULSES; j++) {
+            // load column j
+            float reg_aj[NSAMPLES];
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              reg_aj[counter] = A(counter, j);
+
+            // accum
+            float sum = 0.f;
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum += reg_aj[counter] * reg_ai[counter];
+
+            // store
+            //AtA(icol, j) = sum;
+            AtA(j, icol) = sum;
+          }
+
+          // Atb accum
+          float sum_atb = 0;
+#pragma unroll
+          for (int counter = 0; counter < NSAMPLES; counter++)
+            sum_atb += reg_ai[counter] * reg_b[counter];
+
+          // store atb
+          Atb(icol) = sum_atb;
+        }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("AtA\n");
+        for (int i = 0; i < 8; i++) {
+          for (int j = 0; j < 8; j++)
+            printf("%f ", AtA(i, j));
+          printf("\n");
+        }
+        printf("Atb\n");
+        for (int i = 0; i < 8; i++)
+          printf("%f ", Atb(i));
+        printf("\n");
+        printf("result Amplitudes before nnls\n");
+        for (int i = 0; i < 8; i++)
+          printf("%f ", resultAmplitudesVector(i));
+        printf("\n");
+#endif
+
+        // for fnnls
+        calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
+
+        // run fast nnls
+        calo::multifit::fnnls(
+            AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, nnlsThresh, nMaxItersNNLS, 10, 10);
+
+#ifdef HCAL_MAHI_GPUDEBUG
+        printf("result Amplitudes\n");
+        for (int i = 0; i < 8; i++)
+          printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
+#endif
+
+        calo::multifit::calculateChiSq(matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
+
+        auto const deltaChi2 = std::abs(chi2 - previous_chi2);
+        if (chi2 == chi2_2itersback && chi2 < previous_chi2)
+          break;
+
+        // update
+        chi2_2itersback = previous_chi2;
+        previous_chi2 = chi2;
+
+        // exit condition
+        if (deltaChi2 < deltaChi2Threashold)
+          break;
+      }
+
+#ifdef HCAL_MAHI_GPUDEBUG
+      for (int i = 0; i < NPULSES; i++)
+        printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n", i, pulseOffsets(i), i, resultAmplitudesVector(i));
+      printf("chi2 = %f\n", chi2);
+#endif
+
+      outputChi2[gch] = chi2;
+      auto const idx_for_energy = std::abs(pulseOffsetValues[0]);
+      outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
+      /*
+      #pragma unroll
+      for (int i=0; i<NPULSES; i++)
+          if (pulseOffsets[i] == soi)
+              // NOTE: gain is a number < 10^-3/4, multiply first to avoid stab issues
+              outputEnergy[gch] = (gain*resultAmplitudesVector(i))*respCorrection;
+      */
+    }
+
+  }  // namespace mahi
+}  // namespace hcal
+
+namespace hcal {
+  namespace reconstruction {
+
+    void entryPoint(InputDataGPU const& inputGPU,
+                    OutputDataGPU& outputGPU,
+                    ConditionsProducts const& conditions,
+                    ScratchDataGPU& scratch,
+                    ConfigParameters const& configParameters,
+                    cudaStream_t cudaStream) {
+      auto const totalChannels = inputGPU.f01HEDigis.size + inputGPU.f5HBDigis.size + inputGPU.f3HBDigis.size;
+
+      // FIXME: may be move this assignment to emphasize this more clearly
+      // FIXME: number of channels for output might change given that
+      //   some channesl might be filtered out
+      outputGPU.recHits.size = totalChannels;
+
+      // TODO: this can be lifted by implementing a separate kernel
+      // similar to the default one, but properly handling the diff in #sample
+      // or modifying existing one
+      auto const f01nsamples = compute_nsamples<Flavor1>(inputGPU.f01HEDigis.stride);
+      auto const f5nsamples = compute_nsamples<Flavor5>(inputGPU.f5HBDigis.stride);
+      auto const f3nsamples = compute_nsamples<Flavor3>(inputGPU.f3HBDigis.stride);
+      int constexpr windowSize = 8;
+      int const startingSample = f01nsamples - windowSize;
+      assert(startingSample == 0 || startingSample == 2);
+      if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride > 0)
+        assert(f01nsamples == f5nsamples);
+      if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f3HBDigis.stride > 0)
+        assert(f01nsamples == f3nsamples);
+
+      dim3 threadsPerBlock{windowSize, configParameters.kprep1dChannelsPerBlock};
+      int blocks = static_cast<uint32_t>(threadsPerBlock.y) > totalChannels
+                       ? 1
+                       : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y;
+      int nbytesShared =
+          ((2 * windowSize + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock;
+      hcal::mahi::kernel_prep1d_sameNumberOfSamples<<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
+          scratch.amplitudes.get(),
+          scratch.noiseTerms.get(),
+          outputGPU.recHits.energy.get(),
+          outputGPU.recHits.chi2.get(),
+          inputGPU.f01HEDigis.data.get(),
+          inputGPU.f5HBDigis.data.get(),
+          inputGPU.f3HBDigis.data.get(),
+          inputGPU.f01HEDigis.ids.get(),
+          inputGPU.f5HBDigis.ids.get(),
+          inputGPU.f3HBDigis.ids.get(),
+          inputGPU.f01HEDigis.stride,
+          inputGPU.f5HBDigis.stride,
+          inputGPU.f3HBDigis.stride,
+          inputGPU.f01HEDigis.size,
+          inputGPU.f5HBDigis.size,
+          inputGPU.f5HBDigis.npresamples.get(),
+          scratch.soiSamples.get(),
+          outputGPU.recHits.energyM0.get(),
+          outputGPU.recHits.timeM0.get(),
+          outputGPU.recHits.did.get(),
+          totalChannels,
+          conditions.recoParams.param1,
+          conditions.recoParams.param2,
+          conditions.qieCoders.offsets,
+          conditions.qieCoders.slopes,
+          conditions.qieTypes.values,
+          conditions.pedestalWidths.values,
+          conditions.effectivePedestalWidths.values,
+          conditions.pedestals.values,
+          conditions.convertedEffectivePedestals ? conditions.convertedEffectivePedestals->values
+                                                 : conditions.pedestals.values,
+          configParameters.useEffectivePedestals,
+          conditions.sipmParameters.type,
+          conditions.sipmParameters.fcByPE,
+          conditions.sipmCharacteristics.parLin1,
+          conditions.sipmCharacteristics.parLin2,
+          conditions.sipmCharacteristics.parLin3,
+          conditions.gains.values,
+          conditions.respCorrs.values,
+          conditions.topology->maxDepthHB(),
+          conditions.topology->maxDepthHE(),
+          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
+                                                                               : hcal::reconstruction::IPHI_MAX,
+          conditions.topology->firstHBRing(),
+          conditions.topology->lastHBRing(),
+          conditions.topology->firstHERing(),
+          conditions.topology->lastHERing(),
+          conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
+          conditions.topology->firstHERing() > conditions.topology->lastHERing()
+              ? 0
+              : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
+          configParameters.sipmQTSShift,
+          configParameters.sipmQNTStoSum,
+          configParameters.firstSampleShift,
+          conditions.offsetForHashes,
+          configParameters.ts4Thresh,
+          startingSample);
+      cudaCheck(cudaGetLastError());
+
+      // 1024 is the max threads per block for gtx1080
+      // FIXME: take this from cuda service or something like that
+      uint32_t const channelsPerBlock = 1024 / (windowSize * conditions.pulseOffsetsHost.size());
+      dim3 threadsPerBlock2{windowSize, static_cast<uint32_t>(conditions.pulseOffsetsHost.size()), channelsPerBlock};
+      int blocks2 =
+          threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z;
+
+#ifdef HCAL_MAHI_CPUDEBUG
+      std::cout << "threads: " << threadsPerBlock2.x << " " << threadsPerBlock2.y << "  " << threadsPerBlock2.z
+                << std::endl;
+      std::cout << "blocks: " << blocks2 << std::endl;
+#endif
+
+      hcal::mahi::kernel_prep_pulseMatrices_sameNumberOfSamples<<<blocks2, threadsPerBlock2, 0, cudaStream>>>(
+          scratch.pulseMatrices.get(),
+          scratch.pulseMatricesM.get(),
+          scratch.pulseMatricesP.get(),
+          conditions.pulseOffsets.values,
+          scratch.amplitudes.get(),
+          inputGPU.f01HEDigis.ids.get(),
+          inputGPU.f5HBDigis.ids.get(),
+          inputGPU.f3HBDigis.ids.get(),
+          inputGPU.f01HEDigis.size,
+          inputGPU.f5HBDigis.size,
+          totalChannels,
+          scratch.soiSamples.get(),
+          conditions.recoParams.ids,
+          conditions.recoParams.acc25nsVec,
+          conditions.recoParams.diff25nsItvlVec,
+          conditions.recoParams.accVarLenIdxMinusOneVec,
+          conditions.recoParams.diffVarItvlIdxMinusOneVec,
+          conditions.recoParams.accVarLenIdxZEROVec,
+          conditions.recoParams.diffVarItvlIdxZEROVec,
+          configParameters.meanTime,
+          configParameters.timeSigmaSiPM,
+          configParameters.timeSigmaHPD,
+          conditions.topology->maxDepthHB(),
+          conditions.topology->maxDepthHE(),
+          conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
+                                                                               : hcal::reconstruction::IPHI_MAX,
+          conditions.topology->firstHBRing(),
+          conditions.topology->lastHBRing(),
+          conditions.topology->firstHERing(),
+          conditions.topology->lastHERing(),
+          conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
+          conditions.topology->firstHERing() > conditions.topology->lastHERing()
+              ? 0
+              : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1),
+          conditions.offsetForHashes,
+          configParameters.applyTimeSlew,
+          configParameters.tzeroTimeSlew,
+          configParameters.slopeTimeSlew,
+          configParameters.tmaxTimeSlew);
+      cudaCheck(cudaGetLastError());
+
+      // number of samples is checked in above assert
+      if (conditions.pulseOffsetsHost.size() == 8u) {
+        // FIXME: provide constants from configuration
+        uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
+        uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;
+        auto const nbytesShared = 2 * threadsPerBlock * calo::multifit::MapSymM<float, 8>::total * sizeof(float);
+        hcal::mahi::kernel_minimize<8, 8><<<blocks, threadsPerBlock, nbytesShared, cudaStream>>>(
+            outputGPU.recHits.energy.get(),
+            outputGPU.recHits.chi2.get(),
+            scratch.amplitudes.get(),
+            scratch.pulseMatrices.get(),
+            scratch.pulseMatricesM.get(),
+            scratch.pulseMatricesP.get(),
+            conditions.pulseOffsets.values,
+            scratch.noiseTerms.get(),
+            scratch.soiSamples.get(),
+            conditions.pedestalWidths.values,
+            conditions.effectivePedestalWidths.values,
+            configParameters.useEffectivePedestals,
+            inputGPU.f01HEDigis.ids.get(),
+            inputGPU.f5HBDigis.ids.get(),
+            inputGPU.f3HBDigis.ids.get(),
+            conditions.gains.values,
+            conditions.respCorrs.values,
+            inputGPU.f01HEDigis.size,
+            inputGPU.f5HBDigis.size,
+            totalChannels,
+            conditions.offsetForHashes,
+            conditions.topology->maxDepthHB(),
+            conditions.topology->maxDepthHE(),
+            conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1)
+                                                                                 : hcal::reconstruction::IPHI_MAX,
+            conditions.topology->firstHBRing(),
+            conditions.topology->lastHBRing(),
+            conditions.topology->firstHERing(),
+            conditions.topology->lastHERing(),
+            conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1,
+            conditions.topology->firstHERing() > conditions.topology->lastHERing()
+                ? 0
+                : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1));
+      } else {
+        throw cms::Exception("Invalid MahiGPU configuration")
+            << "Currently support only 8 pulses and 8 time samples and provided: " << f01nsamples << " samples and "
+            << conditions.pulseOffsetsHost.size() << " pulses" << std::endl;
+      }
+    }
+
+  }  // namespace reconstruction
+}  // namespace hcal
diff --git a/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h b/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h
new file mode 100644
index 0000000000000..c0bb499b517a7
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h
@@ -0,0 +1,19 @@
+#ifndef RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h
+#define RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h
+
+#include "DeclsForKernels.h"
+
+namespace hcal {
+  namespace reconstruction {
+
+    void entryPoint(InputDataGPU const&,
+                    OutputDataGPU&,
+                    ConditionsProducts const&,
+                    ScratchDataGPU&,
+                    ConfigParameters const&,
+                    cudaStream_t);
+
+  }
+}  // namespace hcal
+
+#endif  // RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h
diff --git a/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py
new file mode 100644
index 0000000000000..2b97efc2f2d8c
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py
@@ -0,0 +1,28 @@
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process("PLOT")
+
+process.load("FWCore.MessageService.MessageLogger_cfi")
+process.options = cms.untracked.PSet(
+    wantSummary = cms.untracked.bool(False)
+)
+
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load("Configuration.StandardSequences.FrontierConditions_GlobalTag_cff")
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:run2_hlt_relval', '')
+
+process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(-1) )
+process.MessageLogger.cerr.FwkReport.reportEvery = 500
+
+process.source = cms.Source("PoolSource",
+    fileNames = cms.untracked.vstring('file:GPUvsCPU_HCAL_rechits.root')
+)
+
+process.comparisonPlots = cms.EDAnalyzer('HCALGPUAnalyzer')
+
+process.TFileService = cms.Service('TFileService',
+    fileName = cms.string('GPUvsCPU_HCAL_plots.root')
+)
+
+process.path = cms.Path(process.comparisonPlots)
diff --git a/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py
new file mode 100644
index 0000000000000..32d4104a842ef
--- /dev/null
+++ b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py
@@ -0,0 +1,152 @@
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.StandardSequences.Eras import eras
+#from Configuration.ProcessModifiers.gpu_cff import gpu
+
+process = cms.Process('RECOgpu', eras.Run2_2018)
+
+# import of standard configurations
+process.load('Configuration.StandardSequences.Services_cff')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:run2_hlt_relval', '')
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(1000)
+)
+
+#-----------------------------------------
+# INPUT
+#-----------------------------------------
+
+process.source = cms.Source("PoolSource",
+  fileNames = cms.untracked.vstring('/store/data/Run2018D/EphemeralHLTPhysics1/RAW/v1/000/323/775/00000/A27DFA33-8FCB-BE42-A2D2-1A396EEE2B6E.root')
+)
+
+process.hltGetRaw = cms.EDAnalyzer( "HLTGetRaw",
+    RawDataCollection = cms.InputTag( "rawDataCollector" )
+)
+
+process.input = cms.Path( process.hltGetRaw )
+
+#-----------------------------------------
+# CMSSW/Hcal non-DQM Related Module import
+#-----------------------------------------
+
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff")
+process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi")
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi")
+
+process.hcalDigis.InputLabel = cms.InputTag("rawDataCollector")
+
+#-----------------------------------------
+# CMSSW/Hcal GPU related files
+#-----------------------------------------
+
+process.load("RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPUTask_cff")
+process.load("RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi")
+process.hcalCPURecHitsProducer.recHitsM0LabelIn = cms.InputTag("hbheRecHitProducerGPU","")
+process.hcalCPURecHitsProducer.recHitsM0LabelOut = cms.string("")
+
+#-----------------------------------------
+# Temporary customization (things not implemented on the GPU)
+#-----------------------------------------
+
+## the one below is taken directly from the DB, regard M0
+#process.hbheprereco.algorithm.correctForPhaseContainment = cms.bool(False)
+
+## do always 8 pulse
+process.hbheprereco.algorithm.chiSqSwitch = cms.double(-1)
+
+## to match hard coded setting (will be fixed on CPU)
+process.hbheprereco.algorithm.nMaxItersMin = cms.int32(50)
+
+#-----------------------------------------
+# Final Custmization for Run3
+#-----------------------------------------
+
+# we will not run arrival Time at HLT
+process.hbheprereco.algorithm.calculateArrivalTime = cms.bool(False)
+
+## we do not need this
+process.hbheprereco.algorithm.applyLegacyHBMCorrection = cms.bool(False)
+
+# we only run Mahi at HLT
+process.hbheprereco.algorithm.useM3 = cms.bool(False)
+
+# we will not have the HPD noise flags in Run3, as will be all siPM
+process.hbheprereco.setLegacyFlagsQIE8 = cms.bool(False)
+process.hbheprereco.setNegativeFlagsQIE8 = cms.bool(False)
+process.hbheprereco.setNoiseFlagsQIE8 = cms.bool(False)
+process.hbheprereco.setPulseShapeFlagsQIE8 = cms.bool(False)
+
+# for testing M0 only
+##process.hbheprereco.algorithm.useMahi = cms.bool(False)
+
+#-----------------------------------------
+# OUTPUT
+#-----------------------------------------
+
+#process.out = cms.OutputModule("AsciiOutputModule",
+#    outputCommands = cms.untracked.vstring(
+#        'keep *_*_*_*', 
+#    ),
+#    verbosity = cms.untracked.uint32(0)
+#)
+
+process.out = cms.OutputModule("PoolOutputModule",
+    fileName = cms.untracked.string("GPUvsCPU_HCAL_rechits.root")
+)
+
+#---------------
+
+process.finalize = cms.EndPath(process.out)
+
+process.bunchSpacing = cms.Path(
+    process.bunchSpacingProducer
+)
+
+#-----------------------------------------
+# gpu test
+#-----------------------------------------
+
+process.digiPathCPU = cms.Path(
+    process.hcalDigis 
+)
+
+process.recoPathCPU = cms.Path(
+     process.hbheprereco
+)
+
+#---------------
+
+## hcalCPUDigisProducer <-- this convert the GPU digi on cpu (for dqm)
+process.recoPathGPU = cms.Path(
+    process.hbheRecHitProducerGPUSequence
+    * process.hcalCPURecHitsProducer
+)
+
+#---------------
+
+process.schedule = cms.Schedule(
+    process.input,
+    process.digiPathCPU,
+    process.recoPathCPU,
+    process.recoPathGPU,
+    process.finalize
+)
+
+process.options = cms.untracked.PSet(
+    numberOfThreads = cms.untracked.uint32(8),
+    numberOfStreams = cms.untracked.uint32(8),
+    SkipEvent = cms.untracked.vstring('ProductNotFound'),
+    wantSummary = cms.untracked.bool(True)
+)
+
+# report CUDAService messages
+process.MessageLogger.cerr.FwkReport.reportEvery = 100
+process.MessageLogger.categories.append("CUDAService")