diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml
index da28405324833..31e115bdc7880 100644
--- a/EventFilter/EcalRawToDigi/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/BuildFile.xml
@@ -8,7 +8,6 @@
 <use name="DataFormats/EcalDetId"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="DataFormats/EcalRawData"/>
-<use name="DataFormats/EcalRecHit"/>
 <use name="DataFormats/FEDRawData"/>
 <use name="FWCore/Framework"/>
 <use name="FWCore/MessageLogger"/>
@@ -18,8 +17,6 @@
 <use name="Geometry/Records"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
-<use name="RecoLocalCalo/EcalRecAlgos"/>
-<use name="RecoLocalCalo/EcalRecProducers"/>
 <use name="Utilities/StorageFactory"/>
 <export>
   <lib name="1"/>
diff --git a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
index fa6e9f5d5a161..38a9ebdf18cb8 100644
--- a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
+++ b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
@@ -1,14 +1,11 @@
-#ifndef EcalRegionCabling_H
-#define EcalRegionCabling_H
+#ifndef EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
+#define EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
 
-#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h"
-#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h"
-
-#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "DataFormats/FEDRawData/interface/FEDNumbering.h"
-
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h"
+#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h"
 
 class EcalRegionCabling {
 public:
@@ -73,4 +70,4 @@ class EcalRegionCabling {
   const ESElectronicsMapper* es_mapping_;
 };
 
-#endif
+#endif  // EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
index 3bd25122ff029..df29d960b57e9 100644
--- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
@@ -3,7 +3,6 @@
 <use name="EventFilter/EcalRawToDigi"/>
 <use name="CUDADataFormats/EcalDigi" />
 <use name="DataFormats/Candidate"/>
-<use name="DataFormats/EcalRecHit"/>
 <use name="DataFormats/L1GlobalMuonTrigger"/>
 <use name="DataFormats/L1Trigger"/>
 <use name="CondFormats/L1TObjects"/>
@@ -14,10 +13,6 @@
 <use name="FWCore/Framework"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
-<use name="RecoEcal/EgammaCoreTools"/>
-<use name="TrackingTools/Records"/>
-<use name="TrackingTools/GeomPropagators"/>
-<use name="TrackingTools/TrajectoryState"/>
-<library file="*.cc" name="EventFilterEcalRawToDigiPlugins">
+<library file="*.cc *.cu" name="EventFilterEcalRawToDigiPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..9eab30a33fb3c
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h
@@ -0,0 +1,108 @@
+#ifndef EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
+#define EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
+
+#include <vector>
+
+#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h"
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+namespace ecal {
+  namespace raw {
+
+    constexpr auto empty_event_size = EMPTYEVENTSIZE;
+    constexpr uint32_t nfeds_max = 54;
+    constexpr uint32_t nbytes_per_fed_max = 10 * 1024;
+
+    struct InputDataCPU {
+      std::vector<unsigned char, cms::cuda::HostAllocator<unsigned char>> data;
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> offsets;
+      std::vector<int, cms::cuda::HostAllocator<int>> feds;
+
+      void allocate() {
+        // 2KB per FED resize
+        data.resize(nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max);
+        offsets.resize(nfeds_max, 0);
+        feds.resize(nfeds_max, 0);
+      }
+    };
+
+    struct ConfigurationParameters {
+      uint32_t maxChannels;
+    };
+
+    struct OutputDataCPU {
+      // [0] - eb, [1] - ee
+      std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t>> nchannels;
+
+      void allocate() { nchannels.resize(2); }
+    };
+
+    struct OutputDataGPU {
+      uint16_t *samplesEB = nullptr, *samplesEE = nullptr;
+      uint32_t *idsEB = nullptr, *idsEE = nullptr;
+
+      // FIXME: we should separate max channels parameter for eb and ee
+      // FIXME: replace hardcoded values
+      void allocate(ConfigurationParameters const &config) {
+        cudaCheck(cudaMalloc((void **)&samplesEB, config.maxChannels * sizeof(uint16_t) * 10));
+        cudaCheck(cudaMalloc((void **)&samplesEE, config.maxChannels * sizeof(uint16_t) * 10));
+        cudaCheck(cudaMalloc((void **)&idsEB, config.maxChannels * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void **)&idsEE, config.maxChannels * sizeof(uint32_t)));
+      }
+
+      void deallocate(ConfigurationParameters const &config) {
+        if (samplesEB) {
+          cudaCheck(cudaFree(samplesEB));
+          cudaCheck(cudaFree(samplesEE));
+          cudaCheck(cudaFree(idsEB));
+          cudaCheck(cudaFree(idsEE));
+        }
+      }
+    };
+
+    struct ScratchDataGPU {
+      // [0] = EB
+      // [1] = EE
+      uint32_t *pChannelsCounter = nullptr;
+
+      void allocate(ConfigurationParameters const &config) {
+        cudaCheck(cudaMalloc((void **)&pChannelsCounter, sizeof(uint32_t) * 2));
+      }
+
+      void deallocate(ConfigurationParameters const &config) {
+        if (pChannelsCounter) {
+          cudaCheck(cudaFree(pChannelsCounter));
+        }
+      }
+    };
+
+    struct InputDataGPU {
+      unsigned char *data = nullptr;
+      uint32_t *offsets = nullptr;
+      int *feds = nullptr;
+
+      void allocate() {
+        cudaCheck(cudaMalloc((void **)&data, sizeof(unsigned char) * nbytes_per_fed_max * nfeds_max));
+        cudaCheck(cudaMalloc((void **)&offsets, sizeof(uint32_t) * nfeds_max));
+        cudaCheck(cudaMalloc((void **)&feds, sizeof(int) * nfeds_max));
+      }
+
+      void deallocate() {
+        if (data) {
+          cudaCheck(cudaFree(data));
+          cudaCheck(cudaFree(offsets));
+          cudaCheck(cudaFree(feds));
+        }
+      }
+    };
+
+    struct ConditionsProducts {
+      ElectronicsMappingGPU::Product const &eMappingProduct;
+    };
+
+  }  // namespace raw
+}  // namespace ecal
+
+#endif  // EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
index a046ce92fe336..fa53efe6286f1 100644
--- a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
+++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
@@ -6,9 +6,7 @@
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
 #include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
-#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
@@ -18,6 +16,9 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
+#include "DeclsForKernels.h"
+#include "UnpackGPU.h"
+
 class EcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
   explicit EcalCPUDigisProducer(edm::ParameterSet const& ps);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
index 18dc2307e9bfc..e2a127e950252 100644
--- a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
@@ -1,29 +1,20 @@
 #include <iostream>
 
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-// algorithm specific
-
-#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
-
 #include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
-
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
 #include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
-#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
+#include "DeclsForKernels.h"
+#include "UnpackGPU.h"
 
 class EcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu
new file mode 100644
index 0000000000000..79f916d510e72
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu
@@ -0,0 +1,337 @@
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h"
+
+#include "UnpackGPU.h"
+
+namespace ecal {
+  namespace raw {
+
+    __forceinline__ __device__ void print_raw_buffer(uint8_t const* const buffer,
+                                                     uint32_t const nbytes,
+                                                     uint32_t const nbytes_per_row = 20) {
+      for (uint32_t i = 0; i < nbytes; i++) {
+        if (i % nbytes_per_row == 0 && i > 0)
+          printf("\n");
+        printf("%02X ", buffer[i]);
+      }
+    }
+
+    __forceinline__ __device__ void print_first3bits(uint64_t const* buffer, uint32_t size) {
+      for (uint32_t i = 0; i < size; ++i) {
+        uint8_t const b61 = (buffer[i] >> 61) & 0x1;
+        uint8_t const b62 = (buffer[i] >> 62) & 0x1;
+        uint8_t const b63 = (buffer[i] >> 63) & 0x1;
+        printf("[word: %u] %u%u%u\n", i, b63, b62, b61);
+      }
+    }
+
+    __forceinline__ __device__ bool is_barrel(uint8_t dccid) {
+      return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && dccid <= ElectronicsIdGPU::MAX_DCCID_EBP;
+    }
+
+    __forceinline__ __device__ uint8_t fed2dcc(int fed) { return static_cast<uint8_t>(fed - 600); }
+
+    __forceinline__ __device__ int zside_for_eb(ElectronicsIdGPU const& eid) {
+      int dcc = eid.dccId();
+      return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) ? -1 : 1;
+    }
+
+    __forceinline__ __device__ bool is_synced_towerblock(uint16_t const dccbx,
+                                                         uint16_t const bx,
+                                                         uint16_t const dccl1,
+                                                         uint16_t const l1) {
+      bool const bxsync = (bx == 0 && dccbx == 3564) || (bx == dccbx && dccbx != 3564);
+      bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff));
+      return bxsync && l1sync;
+    }
+
+    __forceinline__ __device__ bool right_tower_for_eb(int tower) {
+      // for EB, two types of tower (LVRB top/bottom)
+      if ((tower > 12 && tower < 21) || (tower > 28 && tower < 37) || (tower > 44 && tower < 53) ||
+          (tower > 60 && tower < 69))
+        return true;
+      else
+        return false;
+    }
+
+    __forceinline__ __device__ uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) {
+      // as in Geometry/EcalMaping/.../EcalElectronicsMapping
+      auto const dcc = eid.dccId();
+      auto const tower = eid.towerId();
+      auto const strip = eid.stripId();
+      auto const xtal = eid.xtalId();
+
+      int smid = 0;
+      int iphi = 0;
+      bool EBPlus = (zside_for_eb(eid) > 0);
+      bool EBMinus = !EBPlus;
+
+      if (zside_for_eb(eid) < 0) {
+        smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM;
+        iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi;
+        iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi);
+      } else {
+        smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP;
+        iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi;
+        iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1);
+      }
+
+      bool RightTower = right_tower_for_eb(tower);
+      int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1;
+      if (RightTower) {
+        ieta += (strip - 1);
+        if (strip % 2 == 1) {
+          if (EBMinus)
+            iphi += (xtal - 1) + 1;
+          else
+            iphi += (4 - (xtal - 1)) + 1;
+        } else {
+          if (EBMinus)
+            iphi += (4 - (xtal - 1)) + 1;
+          else
+            iphi += (xtal - 1) + 1;
+        }
+      } else {
+        ieta += 4 - (strip - 1);
+        if (strip % 2 == 1) {
+          if (EBMinus)
+            iphi += (4 - (xtal - 1)) + 1;
+          else
+            iphi += (xtal - 1) + 1;
+        } else {
+          if (EBMinus)
+            iphi += (xtal - 1) + 1;
+          else
+            iphi += (4 - (xtal - 1)) + 1;
+        }
+      }
+
+      if (zside_for_eb(eid) < 0)
+        ieta = -ieta;
+
+      DetId did{DetId::Ecal, EcalBarrel};
+      return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF);
+    }
+
+    __forceinline__ __device__ int adc(uint16_t sample) { return sample & 0xfff; }
+
+    __forceinline__ __device__ int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }
+
+    template <int NTHREADS>
+    __global__ void kernel_unpack_test(unsigned char const* __restrict__ data,
+                                       uint32_t const* __restrict__ offsets,
+                                       int const* __restrict__ feds,
+                                       uint16_t* samplesEB,
+                                       uint16_t* samplesEE,
+                                       uint32_t* idsEB,
+                                       uint32_t* idsEE,
+                                       uint32_t* pChannelsCounterEBEE,
+                                       uint32_t const* eid2did,
+                                       uint32_t const nbytesTotal) {
+      // indices
+      auto const ifed = blockIdx.x;
+
+      // FIXME: use only the very first fed
+      //if (ifed!=10) return;
+
+      // offset in bytes
+      auto const offset = offsets[ifed];
+      // fed id
+      auto const fed = feds[ifed];
+      auto const isBarrel = is_barrel(static_cast<uint8_t>(fed - 600));
+      // size
+      auto const size = ifed == gridDim.x - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset;
+      auto* samples = isBarrel ? samplesEB : samplesEE;
+      auto* ids = isBarrel ? idsEB : idsEE;
+      auto* pChannelsCounter = isBarrel ? &pChannelsCounterEBEE[0] : &pChannelsCounterEBEE[1];
+
+      // FIXME: debugging
+      //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
+
+      // offset to the right raw buffer
+      uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
+
+      // dump first 3 bits for each 64-bit word
+      //print_first3bits(buffer, size / 8);
+
+      //
+      // fed header
+      //
+      auto const fed_header = buffer[0];
+      uint32_t bx = (fed_header >> 20) & 0xfff;
+      uint32_t lv1 = (fed_header >> 32) & 0xffffff;
+
+      // 9 for fed + dcc header
+      // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block
+      // 6 for SR block size
+
+      // dcc header w2
+      auto const w2 = buffer[2];
+      uint8_t const fov = (w2 >> 48) & 0xf;
+
+      //
+      // print Tower block headers
+      //
+      uint8_t ntccblockwords = isBarrel ? 18 : 36;
+      auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6;
+      auto const* trailer = buffer + (size / 8 - 1);
+      auto const* current_tower_block = tower_blocks_start;
+      while (current_tower_block != trailer) {
+        auto const w = *current_tower_block;
+        uint8_t ttid = w & 0xff;
+        uint16_t bxlocal = (w >> 16) & 0xfff;
+        uint16_t lv1local = (w >> 32) & 0xfff;
+        uint16_t block_length = (w >> 48) & 0x1ff;
+
+        uint16_t const dccbx = bx & 0xfff;
+        uint16_t const dccl1 = lv1 & 0xfff;
+        // fov>=1 is required to support simulated data for which bx==bxlocal==0
+        if (fov >= 1 && !is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) {
+          current_tower_block += block_length;
+          continue;
+        }
+
+        // go through all the channels
+        // get the next channel coordinates
+        uint32_t nchannels = (block_length - 1) / 3;
+
+        // 1 threads per channel in this block
+        for (uint32_t ich = 0; ich < nchannels; ich += NTHREADS) {
+          auto const i_to_access = ich + threadIdx.x;
+          // threads outside of the range -> leave the loop
+          if (i_to_access >= nchannels)
+            break;
+
+          // inc the channel's counter and get the pos where to store
+          auto const wdata = current_tower_block[1 + i_to_access * 3];
+          uint8_t const stripid = wdata & 0x7;
+          uint8_t const xtalid = (wdata >> 4) & 0x7;
+          ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid};
+          auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()];
+          // FIXME: what kind of channels are these guys
+          if (didraw == 0)
+            continue;
+
+          // get samples
+          uint16_t sampleValues[10];
+          sampleValues[0] = (wdata >> 16) & 0x3fff;
+          sampleValues[1] = (wdata >> 32) & 0x3fff;
+          sampleValues[2] = (wdata >> 48) & 0x3fff;
+          auto const wdata1 = current_tower_block[2 + i_to_access * 3];
+          sampleValues[3] = wdata1 & 0x3fff;
+          sampleValues[4] = (wdata1 >> 16) & 0x3fff;
+          sampleValues[5] = (wdata1 >> 32) & 0x3fff;
+          sampleValues[6] = (wdata1 >> 48) & 0x3fff;
+          auto const wdata2 = current_tower_block[3 + i_to_access * 3];
+          sampleValues[7] = wdata2 & 0x3fff;
+          sampleValues[8] = (wdata2 >> 16) & 0x3fff;
+          sampleValues[9] = (wdata2 >> 32) & 0x3fff;
+          //printf("stripid = %u xtalid = %u\n", stripid, xtalid);
+
+          // check gain
+          bool isSaturation = true;
+          short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1};
+          for (uint32_t si = 0; si < 10; si++) {
+            if (gainId(sampleValues[si]) == 0) {
+              firstGainZeroSampID = si;
+              firstGainZeroSampADC = adc(sampleValues[si]);
+              break;
+            }
+          }
+          if (firstGainZeroSampID != -1) {
+            unsigned int plateauEnd = std::min(10u, (unsigned int)(firstGainZeroSampID + 5));
+            for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) {
+              if (gainId(sampleValues[s]) == 0 && adc(sampleValues[s]) == firstGainZeroSampADC) {
+                ;
+              } else {
+                isSaturation = false;
+                break;
+              }  //it's not saturation
+            }
+            // get rid of channels which are stuck in gain0
+            if (firstGainZeroSampID < 3) {
+              isSaturation = false;
+            }
+            if (!isSaturation)
+              continue;
+          } else {  // there is no zero gainId sample
+            // gain switch check
+            short numGain = 1;
+            bool gainSwitchError = false;
+            for (unsigned int si = 1; si < 10; si++) {
+              if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5)
+                gainSwitchError = true;
+              if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si]))
+                numGain++;
+              else
+                numGain = 1;
+            }
+            if (gainSwitchError)
+              continue;
+          }
+
+          auto const pos = atomicAdd(pChannelsCounter, 1);
+
+          // store to global
+          ids[pos] = didraw;
+          samples[pos * 10] = sampleValues[0];
+          samples[pos * 10 + 1] = sampleValues[1];
+          samples[pos * 10 + 2] = sampleValues[2];
+          samples[pos * 10 + 3] = sampleValues[3];
+          samples[pos * 10 + 4] = sampleValues[4];
+          samples[pos * 10 + 5] = sampleValues[5];
+          samples[pos * 10 + 6] = sampleValues[6];
+          samples[pos * 10 + 7] = sampleValues[7];
+          samples[pos * 10 + 8] = sampleValues[8];
+          samples[pos * 10 + 9] = sampleValues[9];
+        }
+
+        current_tower_block += block_length;
+      }
+    }
+
+    void entryPoint(InputDataCPU const& inputCPU,
+                    InputDataGPU& inputGPU,
+                    OutputDataGPU& outputGPU,
+                    ScratchDataGPU& scratchGPU,
+                    OutputDataCPU& outputCPU,
+                    ConditionsProducts const& conditions,
+                    cudaStream_t cudaStream,
+                    uint32_t const nfedsWithData,
+                    uint32_t const nbytesTotal) {
+      // transfer
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
+      cudaCheck(cudaMemcpyAsync(inputGPU.offsets,
+                                inputCPU.offsets.data(),
+                                nfedsWithData * sizeof(uint32_t),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
+      cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounter,
+                                0,
+                                sizeof(uint32_t) * 2,  // EB + EE
+                                cudaStream));
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+
+      kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(inputGPU.data,
+                                                                   inputGPU.offsets,
+                                                                   inputGPU.feds,
+                                                                   outputGPU.samplesEB,
+                                                                   outputGPU.samplesEE,
+                                                                   outputGPU.idsEB,
+                                                                   outputGPU.idsEE,
+                                                                   scratchGPU.pChannelsCounter,
+                                                                   conditions.eMappingProduct.eid2did,
+                                                                   nbytesTotal);
+      cudaCheck(cudaGetLastError());
+
+      // transfer the counters for how many eb and ee channels we got
+      cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(),
+                                scratchGPU.pChannelsCounter,
+                                sizeof(uint32_t) * 2,
+                                cudaMemcpyDeviceToHost,
+                                cudaStream));
+    }
+
+  }  // namespace raw
+}  // namespace ecal
diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h
new file mode 100644
index 0000000000000..d98906e7e24a7
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h
@@ -0,0 +1,23 @@
+#ifndef EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
+#define EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
+
+#include "DeclsForKernels.h"
+
+namespace ecal {
+  namespace raw {
+
+    // FIXME: bundle up uint32_t values
+    void entryPoint(InputDataCPU const&,
+                    InputDataGPU&,
+                    OutputDataGPU&,
+                    ScratchDataGPU&,
+                    OutputDataCPU&,
+                    ConditionsProducts const&,
+                    cudaStream_t,
+                    uint32_t const,
+                    uint32_t const);
+
+  }  // namespace raw
+}  // namespace ecal
+
+#endif  // EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu
new file mode 100644
index 0000000000000..36601a0fb9230
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu
@@ -0,0 +1,505 @@
+#include <iostream>
+#include <limits>
+
+#include <cuda.h>
+
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "AmplitudeComputationCommonKernels.h"
+#include "KernelHelpers.h"
+
+namespace ecal {
+  namespace multifit {
+
+    ///
+    /// assume kernel launch configuration is
+    /// (MAXSAMPLES * nchannels, blocks)
+    ///
+    __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+                                                  uint16_t const* digis_in_eb,
+                                                  uint32_t const* dids_eb,
+                                                  uint16_t const* digis_in_ee,
+                                                  uint32_t const* dids_ee,
+                                                  SampleVector* amplitudes,
+                                                  SampleVector* amplitudesForMinimization,
+                                                  SampleGainVector* gainsNoise,
+                                                  float const* mean_x1,
+                                                  float const* mean_x12,
+                                                  float const* rms_x12,
+                                                  float const* mean_x6,
+                                                  float const* gain6Over1,
+                                                  float const* gain12Over6,
+                                                  bool* hasSwitchToGain6,
+                                                  bool* hasSwitchToGain1,
+                                                  bool* isSaturated,
+                                                  ::ecal::reco::StorageScalarType* energies,
+                                                  ::ecal::reco::StorageScalarType* chi2,
+                                                  ::ecal::reco::StorageScalarType* g_pedestal,
+                                                  uint32_t* dids_out,
+                                                  uint32_t* flags,
+                                                  char* acState,
+                                                  BXVectorType* bxs,
+                                                  uint32_t const offsetForHashes,
+                                                  uint32_t const offsetForInputs,
+                                                  bool const gainSwitchUseMaxSampleEB,
+                                                  bool const gainSwitchUseMaxSampleEE,
+                                                  int const nchannels) {
+      constexpr bool dynamicPedestal = false;  //---- default to false, ok
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+      constexpr int sample_max = 5;
+      constexpr int full_pulse_max = 9;
+      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+      int const nchannels_per_block = blockDim.x / nsamples;
+      int const ch = tx / nsamples;
+      // for accessing input arrays
+      int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx;
+      // eb is first and then ee
+      auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb;
+      auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      int const sample = threadIdx.x % nsamples;
+
+      if (ch < nchannels) {
+        // array of 10 x channels per block
+        // TODO: any other way of doing simple reduction
+        // assume bool is 1 byte, should be quite safe
+        extern __shared__ char shared_mem[];
+        bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
+        bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples;
+        bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples;
+        bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples;
+        bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples;
+        char* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples;
+
+        //
+        // indices
+        //
+        auto const did = DetId{dids[inputCh]};
+        auto const isBarrel = did.subdetId() == EcalBarrel;
+        // TODO offset for ee, 0 for eb
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+        //
+        // pulse shape template
+        /*
+        for (int isample=sample; isample<EcalPulseShape::TEMPLATESAMPLES; 
+            isample+=nsamples)
+            shapes_out[ch](isample + 7) = shapes_in[hashedId].pdfval[isample];
+            */
+
+        // will be used in the future for setting state
+        auto const rmsForChecking = rms_x12[hashedId];
+
+        //
+        // amplitudes
+        //
+        int const adc = ecal::mgpa::adc(digis_in[inputTx]);
+        int const gainId = ecal::mgpa::gainId(digis_in[inputTx]);
+        SampleVector::Scalar amplitude = 0.;
+        SampleVector::Scalar pedestal = 0.;
+        SampleVector::Scalar gainratio = 0.;
+
+        // store into shared mem for initialization
+        shr_hasSwitchToGain6[threadIdx.x] = gainId == EcalMgpaBitwiseGain6;
+        shr_hasSwitchToGain1[threadIdx.x] = gainId == EcalMgpaBitwiseGain1;
+        shr_hasSwitchToGain0_tmp[threadIdx.x] = gainId == EcalMgpaBitwiseGain0;
+        shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x];
+        shr_counts[threadIdx.x] = 0;
+        __syncthreads();
+
+        // non-divergent branch (except for the last 4 threads)
+        if (threadIdx.x <= blockDim.x - 5) {
+#pragma unroll
+          for (int i = 0; i < 5; i++)
+            shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i];
+        }
+        shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5;
+
+        //
+        // unrolled reductions
+        // TODO
+        //
+        if (sample < 5) {
+          shr_hasSwitchToGain6[threadIdx.x] =
+              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5];
+          shr_hasSwitchToGain1[threadIdx.x] =
+              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5];
+
+          // duplication of hasSwitchToGain0 in order not to
+          // introduce another syncthreads
+          shr_hasSwitchToGain0_tmp[threadIdx.x] =
+              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5];
+        }
+        __syncthreads();
+
+        if (sample < 2) {
+          // note, both threads per channel take value [3] twice to avoid another if
+          shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] ||
+                                              shr_hasSwitchToGain6[threadIdx.x + 2] ||
+                                              shr_hasSwitchToGain6[threadIdx.x + 3];
+          shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] ||
+                                              shr_hasSwitchToGain1[threadIdx.x + 2] ||
+                                              shr_hasSwitchToGain1[threadIdx.x + 3];
+
+          shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] ||
+                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 2] ||
+                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 3];
+
+          // sample < 2 -> first 2 threads of each channel will be used here
+          // => 0 -> will compare 3 and 4 and put into 0
+          // => 1 -> will compare 4 and 5 and put into 1
+          shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4];
+        }
+        __syncthreads();
+
+        bool check_hasSwitchToGain0 = false;
+
+        if (sample == 0) {
+          shr_hasSwitchToGain6[threadIdx.x] =
+              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1];
+          shr_hasSwitchToGain1[threadIdx.x] =
+              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1];
+          shr_hasSwitchToGain0_tmp[threadIdx.x] =
+              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1];
+
+          hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
+          hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
+
+          // set only for the threadIdx.x corresponding to sample==0
+          check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
+
+          shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1];
+          isSaturated[ch] = shr_isSaturated[threadIdx.x + 3];
+        }
+
+        // TODO: w/o this sync, there is a race
+        // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample
+        // check if we can remove it
+        __syncthreads();
+
+        // TODO: divergent branch
+        if (gainId == 0 || gainId == 3) {
+          pedestal = mean_x1[hashedId];
+          gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
+          gainsNoise[ch](sample) = 2;
+        } else if (gainId == 1) {
+          pedestal = mean_x12[hashedId];
+          gainratio = 1.;
+          gainsNoise[ch](sample) = 0;
+        } else if (gainId == 2) {
+          pedestal = mean_x6[hashedId];
+          gainratio = gain12Over6[hashedId];
+          gainsNoise[ch](sample) = 1;
+        }
+
+        // TODO: compile time constant -> branch should be non-divergent
+        if (dynamicPedestal)
+          amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
+        else
+          amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
+        amplitudes[ch][sample] = amplitude;
+
+#ifdef ECAL_RECO_CUDA_DEBUG
+        printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio);
+        if (adc == 0)
+          printf("adc is zero\n");
+#endif
+
+        //
+        // initialization
+        //
+        amplitudesForMinimization[ch](sample) = 0;
+        bxs[ch](sample) = sample - 5;
+
+        // select the thread for the max sample
+        //---> hardcoded above to be 5th sample, ok
+        if (sample == sample_max) {
+          //
+          // initialization
+          //
+          acState[ch] = static_cast<char>(MinimizationState::NotFinished);
+          energies[ch] = 0;
+          chi2[ch] = 0;
+          g_pedestal[ch] = 0;
+          uint32_t flag = 0;
+          dids_out[ch] = did.rawId();
+
+          // start of this channel in shared mem
+          int const chStart = threadIdx.x - sample_max;
+          // thread for the max sample in shared mem
+          int const threadMax = threadIdx.x;
+          auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
+
+          // this flag setting is applied to all of the cases
+          if (shr_hasSwitchToGain6[chStart])
+            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
+          if (shr_hasSwitchToGain1[chStart])
+            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
+
+          // this corresponds to cpu branching on lastSampleBeforeSaturation
+          // likely false
+          if (check_hasSwitchToGain0) {
+            // assign for the case some sample having gainId == 0
+            //energies[ch] = amplitudes[ch][sample_max];
+            energies[ch] = amplitude;
+
+            // check if samples before sample_max have true
+            bool saturated_before_max = false;
+#pragma unroll
+            for (char ii = 0; ii < 5; ii++)
+              saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
+
+            // if saturation is in the max sample and not in the first 5
+            if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
+              energies[ch] = 49140;  // 4095 * 12
+                                     //---- AM FIXME : no pedestal subtraction???
+                                     //It should be "(4095. - pedestal) * gainratio"
+
+            // set state flag to terminate further processing of this channel
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
+            flags[ch] = flag;
+            return;
+          }
+
+          // according to cpu version
+          //            auto max_amplitude = amplitudes[ch][sample_max];
+          auto const max_amplitude = amplitude;
+          // according to cpu version
+          auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7];
+          // note, no syncing as the same thread will be accessing here
+          bool hasGainSwitch =
+              shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
+
+          // pedestal is final unconditionally
+          g_pedestal[ch] = pedestal;
+          if (hasGainSwitch && gainSwitchUseMaxSample) {
+            // thread for sample=0 will access the right guys
+            energies[ch] = max_amplitude / shape_value;
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flags[ch] = flag;
+            return;
+          }
+
+          // this happens cause sometimes rms_x12 is 0...
+          // needs to be checkec why this is the case
+          // general case here is that noisecov is a Zero matrix
+          if (rmsForChecking == 0) {
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flags[ch] = flag;
+            return;
+          }
+
+          // for the case when no shortcuts were taken
+          flags[ch] = flag;
+        }
+      }
+    }
+
+    ///
+    /// assume kernel launch configuration is
+    /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
+    ///
+    __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
+                                   uint32_t const* dids_eb,
+                                   uint32_t const* dids_ee,
+                                   float const* rms_x12,
+                                   float const* rms_x6,
+                                   float const* rms_x1,
+                                   float const* gain12Over6,
+                                   float const* gain6Over1,
+                                   double const* G12SamplesCorrelationEB,
+                                   double const* G6SamplesCorrelationEB,
+                                   double const* G1SamplesCorrelationEB,
+                                   double const* G12SamplesCorrelationEE,
+                                   double const* G6SamplesCorrelationEE,
+                                   double const* G1SamplesCorrelationEE,
+                                   SampleMatrix* noisecov,
+                                   PulseMatrixType* pulse_matrix,
+                                   EcalPulseShape const* pulse_shape,
+                                   bool const* hasSwitchToGain6,
+                                   bool const* hasSwitchToGain1,
+                                   bool const* isSaturated,
+                                   uint32_t const offsetForHashes,
+                                   uint32_t const offsetForInputs) {
+      int const ch = blockIdx.x;
+      int const tx = threadIdx.x;
+      int const ty = threadIdx.y;
+      constexpr float addPedestalUncertainty = 0.f;
+      constexpr bool dynamicPedestal = false;
+      constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
+
+      // to access input arrays (ids and digis only)
+      int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+      bool tmp0 = hasSwitchToGain6[ch];
+      bool tmp1 = hasSwitchToGain1[ch];
+      auto const did = DetId{dids[inputCh]};
+      auto const isBarrel = did.subdetId() == EcalBarrel;
+      auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                     : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+      auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
+      auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
+      auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
+      bool tmp2 = isSaturated[ch];
+      bool hasGainSwitch = tmp0 || tmp1 || tmp2;
+      auto const vidx = ecal::abs(ty - tx);
+
+      // non-divergent branch for all threads per block
+      if (hasGainSwitch) {
+        // TODO: did not include simplified noise model
+        float noise_value = 0;
+
+        // non-divergent branch - all threads per block
+        // TODO: all of these constants indicate that
+        // that these parts could be splitted into completely different
+        // kernels and run one of them only depending on the config
+        if (simplifiedNoiseModelForGainSwitch) {
+          int isample_max = 5;  // according to cpu defs
+          int gainidx = gainNoise[ch][isample_max];
+
+          // non-divergent branches
+          if (gainidx == 0)
+            //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx);
+            noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx];
+          if (gainidx == 1)
+            //                noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch]
+            //                    *noisecorrs[1](ty, tx);
+            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+                          G6SamplesCorrelation[vidx];
+          if (gainidx == 2)
+            //                noise_value = gain12Over6[ch]*gain12Over6[ch]
+            //                    * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch]
+            //                    * noisecorrs[2](ty, tx);
+            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] *
+                          rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx];
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f)
+            noise_value += addPedestalUncertainty * addPedestalUncertainty;
+        } else {
+          int gainidx = 0;
+          char mask = gainidx;
+          int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          //            noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch]
+          //                *pedestal*noisecorrs[0](ty, tx);
+          noise_value +=
+              /* gainratio is 1*/ rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += /* gainratio is 1 */
+                addPedestalUncertainty * addPedestalUncertainty * pedestal;
+          }
+
+          //
+          gainidx = 1;
+          mask = gainidx;
+          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          //            noise_value += gain12Over6[ch]*gain12Over6[ch]
+          //                *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx);
+          noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+                         pedestal * G6SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty *
+                           addPedestalUncertainty * pedestal;
+          }
+
+          //
+          gainidx = 2;
+          mask = gainidx;
+          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
+          //            noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch]
+          //                *pedestal*noisecorrs[2](ty, tx);
+          noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal;
+          }
+        }
+
+        noisecov[ch](ty, tx) = noise_value;
+      } else {
+        auto rms = rms_x12[hashedId];
+        float noise_value = rms * rms * G12SamplesCorrelation[vidx];
+        if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+          //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
+          noise_value += addPedestalUncertainty * addPedestalUncertainty;
+        }
+        noisecov[ch](ty, tx) = noise_value;
+      }
+
+      // pulse matrix
+      //    int const bx = tx - 5; // -5 -4 -3 ... 3 4
+      //    int bx = (*bxs)(tx);
+      //    int const offset = 7 - 3 - bx;
+      int const posToAccess = 9 - tx + ty;  // see cpu for reference
+      float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0;
+      pulse_matrix[ch](ty, tx) = value;
+    }
+
+    __global__ void kernel_permute_results(SampleVector* amplitudes,
+                                           BXVectorType const* activeBXs,
+                                           ::ecal::reco::StorageScalarType* energies,
+                                           char const* acState,
+                                           int const nchannels) {
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+      int const ch = tx / nsamples;
+      int const iii = tx % nsamples;  // this is to address activeBXs
+
+      if (ch >= nchannels)
+        return;
+
+      // channels that have amplitude precomputed do not need results to be permuted
+      auto const state = static_cast<MinimizationState>(acState[ch]);
+      if (state == MinimizationState::Precomputed)
+        return;
+
+      // configure shared memory and cp into it
+      extern __shared__ char smem[];
+      SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(smem);
+      values[threadIdx.x] = amplitudes[ch](iii);
+      __syncthreads();
+
+      // get the sample for this bx
+      auto const sample = static_cast<int>(activeBXs[ch](iii)) + 5;
+
+      // store back to global
+      amplitudes[ch](sample) = values[threadIdx.x];
+
+      // store sample 5 separately
+      // only for the case when minimization was performed
+      // not for cases with precomputed amplitudes
+      if (sample == 5)
+        energies[ch] = values[threadIdx.x];
+    }
+
+///
+/// Build an Ecal RecHit.
+/// TODO: Use SoA data structures on the host directly
+/// the reason for removing this from minimize kernel is to isolate the minimize +
+/// again, building an aos rec hit involves strides... -> bad memory access pattern
+///
+#ifdef RUN_BUILD_AOS_RECHIT
+    __global__ void kernel_build_rechit(
+        float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels) {
+      int idx = threadIdx.x + blockDim.x * blockIdx.x;
+      if (idx < nchannels) {
+        rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], 0, 0, chi2s[idx], 0};
+      }
+    }
+#endif
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h
new file mode 100644
index 0000000000000..3e813b529a2ad
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h
@@ -0,0 +1,98 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+class EcalPulseShape;
+// this flag setting is applied to all of the cases
+class EcalPulseCovariance;
+class EcalUncalibratedRecHit;
+
+namespace ecal {
+  namespace multifit {
+
+    ///
+    /// assume kernel launch configuration is
+    /// (MAXSAMPLES * nchannels, blocks)
+    /// TODO: is there a point to split this kernel further to separate reductions
+    ///
+    __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+                                                  uint16_t const* digis_in_eb,
+                                                  uint32_t const* dids_eb,
+                                                  uint16_t const* digis_in_ee,
+                                                  uint32_t const* dids_ee,
+                                                  SampleVector* amplitudes,
+                                                  SampleVector* amplitudesForMinimization,
+                                                  SampleGainVector* gainsNoise,
+                                                  float const* mean_x1,
+                                                  float const* mean_x12,
+                                                  float const* rms_x12,
+                                                  float const* mean_x6,
+                                                  float const* gain6Over1,
+                                                  float const* gain12Over6,
+                                                  bool* hasSwitchToGain6,
+                                                  bool* hasSwitchToGain1,
+                                                  bool* isSaturated,
+                                                  ::ecal::reco::StorageScalarType* energies,
+                                                  ::ecal::reco::StorageScalarType* chi2,
+                                                  ::ecal::reco::StorageScalarType* pedestal,
+                                                  uint32_t* dids_out,
+                                                  uint32_t* flags,
+                                                  char* acState,
+                                                  BXVectorType* bxs,
+                                                  uint32_t const offsetForHashes,
+                                                  uint32_t const offsetForInputs,
+                                                  bool const gainSwitchUseMaxSampleEB,
+                                                  bool const gainSwitchUseMaxSampleEE,
+                                                  int const nchannels);
+
+    ///
+    /// assume kernel launch configuration is
+    /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
+    ///
+    __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
+                                   uint32_t const* dids_eb,
+                                   uint32_t const* dids_ee,
+                                   float const* rms_x12,
+                                   float const* rms_x6,
+                                   float const* rms_x1,
+                                   float const* gain12Over6,
+                                   float const* gain6Over1,
+                                   double const* G12SamplesCorrelationEB,
+                                   double const* G6SamplesCorrelationEB,
+                                   double const* G1SamplesCorrelationEB,
+                                   double const* G12SamplesCorrelationEE,
+                                   double const* G6SamplesCorrelationEE,
+                                   double const* G1SamplesCorrelationEE,
+                                   SampleMatrix* noisecov,
+                                   PulseMatrixType* pulse_matrix,
+                                   EcalPulseShape const* pulse_shape,
+                                   bool const* hasSwitchToGain6,
+                                   bool const* hasSwitchToGain1,
+                                   bool const* isSaturated,
+                                   uint32_t const offsetForHashes,
+                                   uint32_t const offsetForInputs);
+
+    __global__ void kernel_permute_results(SampleVector* amplitudes,
+                                           BXVectorType const* activeBXs,
+                                           ::ecal::reco::StorageScalarType* energies,
+                                           char const* acState,
+                                           int const nchannels);
+
+///
+/// Build an Ecal RecHit.
+/// TODO: Use SoA data structures on the host directly
+/// the reason for removing this from minimize kernel is to isolate the minimize +
+/// again, building an aos rec hit involves strides... -> bad memory access pattern
+///
+#ifdef RUN_BUILD_AOS_RECHIT
+    __global__ void kernel_build_rechit(
+        float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels);
+#endif
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
new file mode 100644
index 0000000000000..42c0b8aab85db
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
@@ -0,0 +1,400 @@
+#include <iostream>
+#include <limits>
+
+#include <cuda.h>
+
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "KernelHelpers.h"
+#include "AmplitudeComputationKernels.h"
+#include "AmplitudeComputationCommonKernels.h"
+
+namespace ecal {
+  namespace multifit {
+
+    void eigen_solve_submatrix(SampleMatrix& mat, SampleVector& invec, SampleVector& outvec, unsigned NP) {
+      using namespace Eigen;
+      switch (NP) {  // pulse matrix is always square.
+        case 10: {
+          Matrix<SampleMatrix::Scalar, 10, 10> temp = mat.topLeftCorner<10, 10>();
+          outvec.head<10>() = temp.ldlt().solve(invec.head<10>());
+          break;
+        }
+        case 9: {
+          Matrix<SampleMatrix::Scalar, 9, 9> temp = mat.topLeftCorner<9, 9>();
+          outvec.head<9>() = temp.ldlt().solve(invec.head<9>());
+          break;
+        }
+        case 8: {
+          Matrix<SampleMatrix::Scalar, 8, 8> temp = mat.topLeftCorner<8, 8>();
+          outvec.head<8>() = temp.ldlt().solve(invec.head<8>());
+          break;
+        }
+        case 7: {
+          Matrix<SampleMatrix::Scalar, 7, 7> temp = mat.topLeftCorner<7, 7>();
+          outvec.head<7>() = temp.ldlt().solve(invec.head<7>());
+          break;
+        }
+        case 6: {
+          Matrix<SampleMatrix::Scalar, 6, 6> temp = mat.topLeftCorner<6, 6>();
+          outvec.head<6>() = temp.ldlt().solve(invec.head<6>());
+          break;
+        }
+        case 5: {
+          Matrix<SampleMatrix::Scalar, 5, 5> temp = mat.topLeftCorner<5, 5>();
+          outvec.head<5>() = temp.ldlt().solve(invec.head<5>());
+          break;
+        }
+        case 4: {
+          Matrix<SampleMatrix::Scalar, 4, 4> temp = mat.topLeftCorner<4, 4>();
+          outvec.head<4>() = temp.ldlt().solve(invec.head<4>());
+          break;
+        }
+        case 3: {
+          Matrix<SampleMatrix::Scalar, 3, 3> temp = mat.topLeftCorner<3, 3>();
+          outvec.head<3>() = temp.ldlt().solve(invec.head<3>());
+          break;
+        }
+        case 2: {
+          Matrix<SampleMatrix::Scalar, 2, 2> temp = mat.topLeftCorner<2, 2>();
+          outvec.head<2>() = temp.ldlt().solve(invec.head<2>());
+          break;
+        }
+        case 1: {
+          Matrix<SampleMatrix::Scalar, 1, 1> temp = mat.topLeftCorner<1, 1>();
+          outvec.head<1>() = temp.ldlt().solve(invec.head<1>());
+          break;
+        }
+        default:
+          return;
+      }
+    }
+
+    template <typename MatrixType>
+    __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance,
+                                                      MatrixType& inverse_cov,
+                                                      SampleVector const& amplitudes) {
+      constexpr int nsamples = SampleVector::RowsAtCompileTime;
+      constexpr int npulses = BXVectorType::RowsAtCompileTime;
+
+#pragma unroll
+      for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {
+        auto const amplitude = amplitudes.coeff(ipulse);
+        if (amplitude == 0)
+          continue;
+
+        // FIXME: ipulse - 5 -> ipulse - firstOffset
+        int bx = ipulse - 5;
+        int first_sample_t = std::max(0, bx + 3);
+        int offset = -3 - bx;
+
+        auto const value_sq = amplitude * amplitude;
+
+        for (int col = first_sample_t; col < nsamples; col++) {
+          for (int row = col; row < nsamples; row++) {
+            inverse_cov(row, col) += value_sq * __ldg(&pulse_covariance.covval[row + offset][col + offset]);
+          }
+        }
+      }
+
+      return true;
+    }
+
+    ///
+    /// launch ctx parameters are (nchannels / block, blocks)
+    /// TODO: trivial impl for now, there must be a way to improve
+    ///
+    /// Conventions:
+    ///   - amplitudes -> solution vector, what we are fitting for
+    ///   - samples -> raw detector responses
+    ///   - passive constraint - satisfied constraint
+    ///   - active constraint - unsatisfied (yet) constraint
+    ///
+    __global__ void kernel_minimize(uint32_t const* dids_eb,
+                                    uint32_t const* dids_ee,
+                                    SampleMatrix const* __restrict__ noisecov,
+                                    EcalPulseCovariance const* __restrict__ pulse_covariance,
+                                    BXVectorType* bxs,
+                                    SampleVector const* __restrict__ samples,
+                                    SampleVector* amplitudes,
+                                    PulseMatrixType const* __restrict__ pulse_matrix,
+                                    ::ecal::reco::StorageScalarType* chi2s,
+                                    ::ecal::reco::StorageScalarType* energies,
+                                    char* acState,
+                                    int nchannels,
+                                    int max_iterations,
+                                    uint32_t const offsetForHashes,
+                                    uint32_t const offsetForInputs) {
+      // FIXME: ecal has 10 samples and 10 pulses....
+      // but this needs to be properly treated and renamed everywhere
+      constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
+      constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime;
+      static_assert(NSAMPLES == NPULSES);
+
+      using DataType = SampleVector::Scalar;
+
+      extern __shared__ char shrmem[];
+      DataType* shrMatrixLForFnnlsStorage =
+          reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * threadIdx.x;
+      DataType* shrAtAStorage =
+          reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
+
+      // FIXME: remove eitehr idx or ch -> they are teh same thing
+      int idx = threadIdx.x + blockDim.x * blockIdx.x;
+      auto const ch = idx;
+      if (idx < nchannels) {
+        if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
+          return;
+
+        // get the hash
+        int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+        auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+        auto const did = DetId{dids[inputCh]};
+        auto const isBarrel = did.subdetId() == EcalBarrel;
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+        // inits
+        int iter = 0;
+        int npassive = 0;
+
+        ColumnVector<NPULSES, int> pulseOffsets;
+#pragma unroll
+        for (int i = 0; i < NPULSES; ++i)
+          pulseOffsets(i) = i;
+
+        ColumnVector<NPULSES, DataType> resultAmplitudes;
+#pragma unroll
+        for (int counter = 0; counter < NPULSES; counter++)
+          resultAmplitudes(counter) = 0;
+
+        // inits
+        //SampleDecompLLT covariance_decomposition;
+        //SampleMatrix inverse_cov;
+        SampleVector::Scalar chi2 = 0, chi2_now = 0;
+
+        // loop until ocnverge
+        while (true) {
+          if (iter >= max_iterations)
+            break;
+
+          //inverse_cov = noisecov[idx];
+          //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
+          DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
+          MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
+          int counter = 0;
+#pragma unroll
+          for (int col = 0; col < NSAMPLES; col++)
+#pragma unroll
+            for (int row = col; row < NSAMPLES; row++)
+              covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col));
+
+          update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
+
+          // compute actual covariance decomposition
+          //covariance_decomposition.compute(inverse_cov);
+          //auto const& matrixL = covariance_decomposition.matrixL();
+          DataType matrixLStorage[MapSymM<DataType, NSAMPLES>::total];
+          MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
+          compute_decomposition_unrolled(matrixL, covMatrix);
+
+          // L * A = P
+          ColMajorMatrix<NSAMPLES, NPULSES> A;
+          solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
+
+          // L b = s
+          float reg_b[NSAMPLES];
+          solve_forward_subst_vector(reg_b, samples[idx], matrixL);
+
+          // FIXME: shared mem
+          //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
+          MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
+          //SampleMatrix AtA;
+          SampleVector Atb;
+#pragma unroll
+          for (int icol = 0; icol < NPULSES; icol++) {
+            float reg_ai[NSAMPLES];
+
+// load column icol
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              reg_ai[counter] = A(counter, icol);
+
+            // compute diagoanl
+            float sum = 0.f;
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum += reg_ai[counter] * reg_ai[counter];
+
+            // store
+            AtA(icol, icol) = sum;
+
+// go thru the other columns
+#pragma unroll
+            for (int j = icol + 1; j < NPULSES; j++) {
+              // load column j
+              float reg_aj[NSAMPLES];
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                reg_aj[counter] = A(counter, j);
+
+              // accum
+              float sum = 0.f;
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                sum += reg_aj[counter] * reg_ai[counter];
+
+              // store
+              //AtA(icol, j) = sum;
+              AtA(j, icol) = sum;
+            }
+
+            // Atb accum
+            float sum_atb = 0.f;
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum_atb += reg_ai[counter] * reg_b[counter];
+
+            // store atb
+            Atb(icol) = sum_atb;
+          }
+
+          // FIXME: shared mem
+          //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
+          MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
+
+          fnnls(AtA,
+                Atb,
+                //amplitudes[idx],
+                resultAmplitudes,
+                npassive,
+                pulseOffsets,
+                matrixLForFnnls,
+                1e-11,
+                500);
+
+          {
+            DataType accum[NSAMPLES];
+// load accum
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              accum[counter] = -samples[idx](counter);
+
+            // iterate
+            for (int icol = 0; icol < NPULSES; icol++) {
+              DataType pm_col[NSAMPLES];
+
+// preload a column of pulse matrix
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                pm_col[counter] = __ldg(&pulse_matrix[idx].coeffRef(counter, icol));
+
+// accum
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                accum[counter] += resultAmplitudes[icol] * pm_col[counter];
+            }
+
+            DataType reg_L[NSAMPLES];
+            DataType accumSum = 0;
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+            for (int i = 0; i < NSAMPLES; i++)
+              reg_L[i] = matrixL(i, 0);
+
+            // compute x0 and store it
+            auto x_prev = accum[0] / reg_L[0];
+            accumSum += x_prev * x_prev;
+
+// iterate
+#pragma unroll
+            for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+              for (int counter = iL; counter < NSAMPLES; counter++)
+                accum[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+              for (int counter = iL; counter < NSAMPLES; counter++)
+                reg_L[counter] = matrixL(counter, iL);
+
+              // compute the next x for M(iL, icol)
+              x_prev = accum[iL] / reg_L[iL];
+
+              // store teh result value
+              accumSum += x_prev * x_prev;
+            }
+
+            chi2_now = accumSum;
+          }
+
+          auto deltachi2 = chi2_now - chi2;
+          chi2 = chi2_now;
+
+          if (ecal::abs(deltachi2) < 1e-3)
+            break;
+
+          //---- AM: TEST
+          //---- it was 3 lines above, now here as in the CPU version
+          ++iter;
+        }
+
+        // store to global output values
+        // FIXME: amplitudes are used in global directly
+        chi2s[idx] = chi2;
+        energies[idx] = resultAmplitudes(5);
+
+#pragma unroll
+        for (int counter = 0; counter < NPULSES; counter++)
+          amplitudes[idx](counter) = resultAmplitudes(counter);
+      }
+    }
+
+    namespace v1 {
+
+      void minimization_procedure(EventInputDataGPU const& eventInputGPU,
+                                  EventOutputDataGPU& eventOutputGPU,
+                                  EventDataForScratchGPU& scratch,
+                                  ConditionsProducts const& conditions,
+                                  ConfigurationParameters const& configParameters,
+                                  cudaStream_t cudaStream) {
+        using DataType = SampleVector::Scalar;
+        unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis;
+        //    unsigned int threads_min = conf.threads.x;
+        // TODO: configure from python
+        unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
+        unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
+        uint32_t const offsetForHashes = conditions.offsetForHashes;
+        uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
+        auto const nbytesShared =
+            2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
+        kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.ids,
+            scratch.noisecov,
+            conditions.pulseCovariances.values,
+            scratch.activeBXs,
+            scratch.samples,
+            (SampleVector*)eventOutputGPU.amplitudesAll,
+            scratch.pulse_matrix,
+            eventOutputGPU.chi2,
+            eventOutputGPU.amplitude,
+            scratch.acState,
+            totalChannels,
+            50,
+            offsetForHashes,
+            offsetForInputs);
+        cudaCheck(cudaGetLastError());
+      }
+
+    }  // namespace v1
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h
new file mode 100644
index 0000000000000..b8202f75b653b
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h
@@ -0,0 +1,29 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+class EcalPulseShape;
+class EcalPulseCovariance;
+class EcalUncalibratedRecHit;
+
+namespace ecal {
+  namespace multifit {
+
+    namespace v1 {
+
+      void minimization_procedure(EventInputDataGPU const& eventInputGPU,
+                                  EventOutputDataGPU& eventOutputGPU,
+                                  EventDataForScratchGPU& scratch,
+                                  ConditionsProducts const& conditions,
+                                  ConfigurationParameters const& configParameters,
+                                  cudaStream_t cudaStream);
+
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
index 3b1d2c0cf159d..61eed4689fd20 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
@@ -1,21 +1,22 @@
 <use name="cuda"/>
 <use name="CUDADataFormats/EcalRecHitSoA"/>
-<use name="HeterogeneousCore/CUDACore"/>
-<use name="HeterogeneousCore/CUDAUtilities"/>
-<use name="CalibCalorimetry/EcalTPGTools"/>
-<use name="RecoLocalCalo/EcalDeadChannelRecoveryAlgos"/>
-<use name="Geometry/CaloGeometry"/>
-<use name="RecoLocalCalo/EcalRecProducers"/>
-<use name="FWCore/Framework"/>
-<use name="FWCore/ParameterSet"/>
 <use name="CalibCalorimetry/EcalLaserCorrection"/>
+<use name="CalibCalorimetry/EcalTPGTools"/>
+<use name="CondFormats/DataRecord"/>
+<use name="CondFormats/ESObjects"/>
+<use name="CondFormats/EcalObjects"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="DataFormats/EcalRecHit"/>
-<use name="CondFormats/EcalObjects"/>
-<use name="CondFormats/ESObjects"/>
-<use name="CondFormats/DataRecord"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/ParameterSet"/>
+<use name="Geometry/CaloGeometry"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="RecoLocalCalo/EcalDeadChannelRecoveryAlgos"/>
 <use name="RecoLocalCalo/EcalRecAlgos"/>
+<use name="RecoLocalCalo/EcalRecProducers"/>
 <use name="SimCalorimetry/EcalSimAlgos"/>
-<library file="*.cc" name="RecoLocalCaloEcalRecProducersPlugins">
+
+<library file="*.cc *.cu" name="RecoLocalCaloEcalRecProducersPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/Common.h b/RecoLocalCalo/EcalRecProducers/plugins/Common.h
new file mode 100644
index 0000000000000..4e7afbd0819db
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/Common.h
@@ -0,0 +1,46 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_Common_h
+#define RecoLocalCalo_EcalRecProducers_plugins_Common_h
+
+#include <cstdint>
+#include <cmath>
+#include <cassert>
+#include <chrono>
+#include <iostream>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// a workaround for std::abs not being a constexpr function
+namespace ecal {
+
+  template <typename T>
+  constexpr T abs(T const& value) {
+    return ::std::max(value, -value);
+  }
+
+  // temporary
+  namespace mgpa {
+
+    constexpr int adc(uint16_t sample) { return sample & 0xfff; }
+    constexpr int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }
+
+  }  // namespace mgpa
+
+}  // namespace ecal
+
+template <typename T>
+struct DurationMeasurer {
+  DurationMeasurer(std::string const& msg) : msg_{msg}, start_{std::chrono::high_resolution_clock::now()} {}
+
+  ~DurationMeasurer() {
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<T>(end - start_).count();
+    std::cout << msg_ << "\nduration = " << duration << std::endl;
+  }
+
+private:
+  std::string msg_;
+  std::chrono::high_resolution_clock::time_point start_;
+};
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_Common_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..1ea0fcc430882
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h
@@ -0,0 +1,333 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
+
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h"
+#include "CondFormats/EcalObjects/interface/EcalGainRatios.h"
+#include "CondFormats/EcalObjects/interface/EcalPedestals.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
+#include "CondFormats/EcalObjects/interface/EcalWeightSet.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
+
+#include "EigenMatrixTypes_gpu.h"
+
+struct EcalPulseShape;
+class EcalSampleMask;
+class EcalTimeBiasCorrections;
+struct EcalPulseCovariance;
+class EcalDigiCollection;
+class EcalXtalGroupId;
+class EcalSamplesCorrelation;
+class EBDigiCollection;
+class EEDigiCollection;
+
+namespace ecal {
+  namespace multifit {
+
+    enum class TimeComputationState : char { NotFinished = 0, Finished = 1 };
+    enum class MinimizationState : char {
+      NotFinished = 0,
+      Finished = 1,
+      Precomputed = 2,
+    };
+
+    //
+    struct EventInputDataGPU {
+      ecal::DigisCollection const& ebDigis;
+      ecal::DigisCollection const& eeDigis;
+    };
+
+    // parameters have a fixed type
+    // Can we go by with single precision
+    struct ConfigurationParameters {
+      using type = double;
+      // device ptrs
+      type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr;
+
+      uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE;
+      // device ptrs
+      type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr;
+
+      type timeFitLimitsFirstEB, timeFitLimitsFirstEE;
+      type timeFitLimitsSecondEB, timeFitLimitsSecondEE;
+
+      type timeConstantTermEB, timeConstantTermEE;
+
+      type timeNconstEB, timeNconstEE;
+
+      type amplitudeThreshEE, amplitudeThreshEB;
+
+      type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB;
+      type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE;
+      type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE;
+      type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB;
+
+      std::array<uint32_t, 3> kernelMinimizeThreads;
+
+      bool shouldRunTimingComputation;
+    };
+
+    struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> {
+      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        cudaCheck(cudaMalloc((void**)&amplitudesAll, size * sizeof(SampleVector)));
+        cudaCheck(cudaMalloc((void**)&amplitude, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&pedestal, size * sizeof(::ecal::reco::StorageScalarType)));
+
+        if (configParameters.shouldRunTimingComputation) {
+          cudaCheck(cudaMalloc((void**)&jitter, size * sizeof(::ecal::reco::StorageScalarType)));
+          cudaCheck(cudaMalloc((void**)&jitterError, size * sizeof(::ecal::reco::StorageScalarType)));
+        }
+
+        cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&flags, size * sizeof(uint32_t)));
+      }
+
+      void deallocate(ConfigurationParameters const& configParameters) {
+        cudaCheck(cudaFree(amplitudesAll));
+        cudaCheck(cudaFree(amplitude));
+        cudaCheck(cudaFree(chi2));
+        cudaCheck(cudaFree(pedestal));
+        if (configParameters.shouldRunTimingComputation) {
+          cudaCheck(cudaFree(jitter));
+          cudaCheck(cudaFree(jitterError));
+        }
+        cudaCheck(cudaFree(did));
+        cudaCheck(cudaFree(flags));
+      }
+    };
+
+    struct EventDataForScratchGPU {
+      SampleVector* samples = nullptr;
+      SampleGainVector* gainsNoise = nullptr;
+
+      SampleMatrix* noisecov = nullptr;
+      PulseMatrixType* pulse_matrix = nullptr;
+      BXVectorType* activeBXs = nullptr;
+      char* acState = nullptr;
+
+      bool *hasSwitchToGain6 = nullptr, *hasSwitchToGain1 = nullptr, *isSaturated = nullptr;
+
+      SampleVector::Scalar *sample_values, *sample_value_errors;
+      bool* useless_sample_values;
+      SampleVector::Scalar* chi2sNullHypot;
+      SampleVector::Scalar* sum0sNullHypot;
+      SampleVector::Scalar* sumAAsNullHypot;
+      char* pedestal_nums;
+      SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas;
+      SampleVector::Scalar *accTimeMax, *accTimeWgt;
+      SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError;
+      SampleVector::Scalar *timeMax, *timeError;
+      TimeComputationState* tcState;
+
+      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        cudaCheck(cudaMalloc((void**)&samples, size * sizeof(SampleVector)));
+        cudaCheck(cudaMalloc((void**)&gainsNoise, size * sizeof(SampleGainVector)));
+
+        cudaCheck(cudaMalloc((void**)&noisecov, size * sizeof(SampleMatrix)));
+        cudaCheck(cudaMalloc((void**)&pulse_matrix, size * sizeof(PulseMatrixType)));
+        cudaCheck(cudaMalloc((void**)&activeBXs, size * sizeof(BXVectorType)));
+        cudaCheck(cudaMalloc((void**)&acState, size * sizeof(char)));
+
+        cudaCheck(cudaMalloc((void**)&hasSwitchToGain6, size * sizeof(bool)));
+        cudaCheck(cudaMalloc((void**)&hasSwitchToGain1, size * sizeof(bool)));
+        cudaCheck(cudaMalloc((void**)&isSaturated, size * sizeof(bool)));
+
+        if (configParameters.shouldRunTimingComputation) {
+          cudaCheck(cudaMalloc((void**)&sample_values, size * sizeof(SampleVector)));
+          cudaCheck(cudaMalloc((void**)&sample_value_errors, size * sizeof(SampleVector)));
+          cudaCheck(cudaMalloc((void**)&useless_sample_values, size * sizeof(bool) * EcalDataFrame::MAXSAMPLES));
+          cudaCheck(cudaMalloc((void**)&chi2sNullHypot, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&sum0sNullHypot, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&sumAAsNullHypot, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&pedestal_nums, size * sizeof(char)));
+
+          cudaCheck(cudaMalloc((void**)&tMaxAlphaBetas, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&tMaxErrorAlphaBetas, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&accTimeMax, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&accTimeWgt, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&ampMaxAlphaBeta, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&ampMaxError, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&timeMax, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&timeError, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&tcState, size * sizeof(TimeComputationState)));
+        }
+      }
+
+      void deallocate(ConfigurationParameters const& configParameters) {
+        cudaCheck(cudaFree(samples));
+        cudaCheck(cudaFree(gainsNoise));
+
+        cudaCheck(cudaFree(noisecov));
+        cudaCheck(cudaFree(pulse_matrix));
+        cudaCheck(cudaFree(activeBXs));
+        cudaCheck(cudaFree(acState));
+
+        cudaCheck(cudaFree(hasSwitchToGain6));
+        cudaCheck(cudaFree(hasSwitchToGain1));
+        cudaCheck(cudaFree(isSaturated));
+
+        if (configParameters.shouldRunTimingComputation) {
+          cudaCheck(cudaFree(sample_values));
+          cudaCheck(cudaFree(sample_value_errors));
+          cudaCheck(cudaFree(useless_sample_values));
+          cudaCheck(cudaFree(chi2sNullHypot));
+          cudaCheck(cudaFree(sum0sNullHypot));
+          cudaCheck(cudaFree(sumAAsNullHypot));
+          cudaCheck(cudaFree(pedestal_nums));
+
+          cudaCheck(cudaFree(tMaxAlphaBetas));
+          cudaCheck(cudaFree(tMaxErrorAlphaBetas));
+          cudaCheck(cudaFree(accTimeMax));
+          cudaCheck(cudaFree(accTimeWgt));
+          cudaCheck(cudaFree(ampMaxAlphaBeta));
+          cudaCheck(cudaFree(ampMaxError));
+          cudaCheck(cudaFree(timeMax));
+          cudaCheck(cudaFree(timeError));
+          cudaCheck(cudaFree(tcState));
+        }
+      }
+    };
+
+    // const refs products to conditions
+    struct ConditionsProducts {
+      EcalPedestalsGPU::Product const& pedestals;
+      EcalGainRatiosGPU::Product const& gainRatios;
+      EcalPulseShapesGPU::Product const& pulseShapes;
+      EcalPulseCovariancesGPU::Product const& pulseCovariances;
+      EcalSamplesCorrelationGPU::Product const& samplesCorrelation;
+      EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections;
+      EcalTimeCalibConstantsGPU::Product const& timeCalibConstants;
+      EcalSampleMask const& sampleMask;
+      EcalTimeOffsetConstant const& timeOffsetConstant;
+      uint32_t offsetForHashes;
+    };
+
+    //*/
+
+    struct xyz {
+      int x, y, z;
+    };
+
+    struct conf_data {
+      xyz threads;
+      bool runV1;
+      cudaStream_t cuStream;
+    };
+
+  }  // namespace multifit
+}  // namespace ecal
+
+//
+// ECAL Rechit producer
+//
+
+namespace ecal {
+  namespace rechit {
+
+    // parameters that are read in the configuration file for rechit producer
+    struct ConfigurationParameters {
+      // device ptrs
+      int* ChannelStatusToBeExcluded = nullptr;
+      uint32_t ChannelStatusToBeExcludedSize;
+
+      bool killDeadChannels;
+
+      bool recoverEBIsolatedChannels;
+      bool recoverEEIsolatedChannels;
+      bool recoverEBVFE;
+      bool recoverEEVFE;
+      bool recoverEBFE;
+      bool recoverEEFE;
+
+      float EBLaserMIN;
+      float EELaserMIN;
+      float EBLaserMAX;
+      float EELaserMAX;
+
+      //       std::vector<std::vector<uint32_t> > v_DB_reco_flags;
+      int* expanded_v_DB_reco_flags;
+      uint32_t* expanded_Sizes_v_DB_reco_flags;
+      uint32_t* expanded_flagbit_v_DB_reco_flags;
+      uint32_t expanded_v_DB_reco_flagsSize;
+
+      uint32_t flagmask;
+
+      //
+      //       bool shouldRunTimingComputation;
+    };
+
+    struct EventOutputDataGPU final : public ::ecal::RecHit<::ecal::Tag::ptr> {
+      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        //      void allocate(uint32_t size) {
+        //---- configParameters -> needed only to decide if to save the timing information or not
+
+        cudaCheck(cudaMalloc((void**)&energy, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&time, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&flagBits, size * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&extra, size * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t)));
+      }
+
+      void deallocate(ConfigurationParameters const& configParameters) {
+        //     void deallocate() {
+        //---- configParameters -> needed only to decide if to save the timing information or not
+
+        cudaCheck(cudaFree(energy));
+        cudaCheck(cudaFree(time));
+        cudaCheck(cudaFree(chi2));
+        cudaCheck(cudaFree(flagBits));
+        cudaCheck(cudaFree(extra));
+        cudaCheck(cudaFree(did));
+      }
+    };
+
+    struct EventInputDataGPU {
+      ecal::UncalibratedRecHit<ecal::Tag::ptr> const& ebUncalibRecHits;
+      ecal::UncalibratedRecHit<ecal::Tag::ptr> const& eeUncalibRecHits;
+    };
+
+    // const refs products to conditions
+    struct ConditionsProducts {
+      EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV;
+      EcalIntercalibConstantsGPU::Product const& Intercalib;
+      EcalRechitChannelStatusGPU::Product const& ChannelStatus;
+      //
+      EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios;
+      EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef;
+      EcalLaserAlphasGPU::Product const& LaserAlphas;
+      EcalLinearCorrectionsGPU::Product const& LinearCorrections;
+      //
+      //
+      uint32_t offsetForHashes;
+    };
+
+  }  // namespace rechit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu
new file mode 100644
index 0000000000000..d816414aa71a0
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu
@@ -0,0 +1,673 @@
+#include <cuda.h>
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+
+#include "EcalRecHitBuilderKernels.h"
+#include "KernelHelpers.h"
+
+namespace ecal {
+  namespace rechit {
+
+    // uncalibrecHit flags
+    enum UncalibRecHitFlags {
+      kGood = -1,  // channel is good (mutually exclusive with other states)  setFlagBit(kGood) reset flags_ to zero
+      kPoorReco,   // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.)
+      kSaturated,  // saturated channel
+      kOutOfTime,  // channel out of time
+      kLeadingEdgeRecovered,  // saturated channel: energy estimated from the leading edge before saturation
+      kHasSwitchToGain6,      // at least one data frame is in G6
+      kHasSwitchToGain1       // at least one data frame is in G1
+    };
+
+    // recHit flags
+    enum RecHitFlags {
+      RecHitFlags_kGood = 0,  // channel ok, the energy and time measurement are reliable
+      RecHitFlags_kPoorReco,  // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2)
+      RecHitFlags_kOutOfTime,  // the energy is available from the UncalibRecHit (sync reco), but the event is out of time
+      RecHitFlags_kFaultyHardware,  // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy)
+      RecHitFlags_kNoisy,      // the channel is very noisy
+      RecHitFlags_kPoorCalib,  // the energy is available from the UncalibRecHit, but the calibration of the channel is poor
+      RecHitFlags_kSaturated,             // saturated channel (recovery not tried)
+      RecHitFlags_kLeadingEdgeRecovered,  // saturated channel: energy estimated from the leading edge before saturation
+      RecHitFlags_kNeighboursRecovered,   // saturated/isolated dead: energy estimated from neighbours
+      RecHitFlags_kTowerRecovered,        // channel in TT with no data link, info retrieved from Trigger Primitive
+      RecHitFlags_kDead,                  // channel is dead and any recovery fails
+      RecHitFlags_kKilled,                // MC only flag: the channel is killed in the real detector
+      RecHitFlags_kTPSaturated,           // the channel is in a region with saturated TP
+      RecHitFlags_kL1SpikeFlag,           // the channel is in a region with TP with sFGVB = 0
+      RecHitFlags_kWeird,                 // the signal is believed to originate from an anomalous deposit (spike)
+      RecHitFlags_kDiWeird,               // the signal is anomalous, and neighbors another anomalous signal
+      RecHitFlags_kHasSwitchToGain6,      // at least one data frame is in G6
+      RecHitFlags_kHasSwitchToGain1,      // at least one data frame is in G1
+      //
+      RecHitFlags_kUnknown  // to ease the interface with functions returning flags.
+    };
+
+    // status code
+    enum EcalChannelStatusCode_Code {
+      kOk = 0,
+      kDAC,
+      kNoLaser,
+      kNoisy,
+      kNNoisy,
+      kNNNoisy,
+      kNNNNoisy,
+      kNNNNNoisy,
+      kFixedG6,
+      kFixedG1,
+      kFixedG0,
+      kNonRespondingIsolated,
+      kDeadVFE,
+      kDeadFE,
+      kNoDataNoTP
+    };
+
+    __global__ void kernel_create_ecal_rehit(
+        // configuration
+        int const* ChannelStatusToBeExcluded,
+        uint32_t ChannelStatusToBeExcludedSize,
+        bool const killDeadChannels,
+        bool const recoverEBIsolatedChannels,
+        bool const recoverEEIsolatedChannels,
+        bool const recoverEBVFE,
+        bool const recoverEEVFE,
+        bool const recoverEBFE,
+        bool const recoverEEFE,
+        float const EBLaserMIN,
+        float const EELaserMIN,
+        float const EBLaserMAX,
+        float const EELaserMAX,
+        // for flags setting
+        int const* expanded_v_DB_reco_flags,  // FIXME AM: to be checked
+        uint32_t const* expanded_Sizes_v_DB_reco_flags,
+        uint32_t const* expanded_flagbit_v_DB_reco_flags,
+        uint32_t expanded_v_DB_reco_flagsSize,
+        uint32_t flagmask,
+        // conditions
+        float const* adc2gev,
+        float const* intercalib,
+        uint16_t const* status,
+        float const* apdpnrefs,
+        float const* alphas,
+        // input for transparency corrections
+        float const* p1,
+        float const* p2,
+        float const* p3,
+        edm::TimeValue_t const* t1,
+        edm::TimeValue_t const* t2,
+        edm::TimeValue_t const* t3,
+        // input for linear corrections
+        float const* lp1,
+        float const* lp2,
+        float const* lp3,
+        edm::TimeValue_t const* lt1,
+        edm::TimeValue_t const* lt2,
+        edm::TimeValue_t const* lt3,
+        // time, used for time dependent corrections
+        edm::TimeValue_t const event_time,
+        // input
+        uint32_t const* did_eb,
+        uint32_t const* did_ee,
+        ::ecal::reco::StorageScalarType const* amplitude_eb,  // in adc counts
+        ::ecal::reco::StorageScalarType const* amplitude_ee,  // in adc counts
+        ::ecal::reco::StorageScalarType const* time_eb,
+        ::ecal::reco::StorageScalarType const* time_ee,
+        ::ecal::reco::StorageScalarType const* chi2_eb,
+        ::ecal::reco::StorageScalarType const* chi2_ee,
+        uint32_t const* flags_eb,
+        uint32_t const* flags_ee,
+        // output
+        uint32_t* did,
+        ::ecal::reco::StorageScalarType* energy,  // in energy [GeV]
+        ::ecal::reco::StorageScalarType* time,
+        ::ecal::reco::StorageScalarType* chi2,
+        uint32_t* flagBits,
+        uint32_t* extra,
+        // other
+        int const nchannels,
+        uint32_t const nChannelsBarrel,
+        uint32_t const offsetForHashes) {
+      //
+      //    NB: energy   "type_wrapper<reco::StorageScalarType, L>::type" most likely std::vector<float>
+      //
+
+      for (int ch = threadIdx.x + blockDim.x * blockIdx.x; ch < nchannels; ch += blockDim.x * gridDim.x) {
+        //       int ch = threadIdx.x + blockDim.x*blockIdx.x;
+
+        //       if (ch < nchannels) {
+
+        bool isEndcap = (ch >= nChannelsBarrel);
+
+        int const inputCh = isEndcap ? ch - nChannelsBarrel : ch;
+
+        uint32_t const* didCh = isEndcap ? did_ee : did_eb;
+
+        // only two values, EB or EE
+        // AM : FIXME : why not using "isBarrel" ?    isBarrel ? adc2gev[0] : adc2gev[1]
+        float adc2gev_to_use = isEndcap ? adc2gev[1]   // ee
+                                        : adc2gev[0];  // eb
+
+        // first EB and then EE
+
+        ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb;
+
+        //::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb;
+
+        ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? chi2_ee : chi2_eb;
+
+        uint32_t const* flags_in = isEndcap ? flags_ee : flags_eb;
+
+        // simple copy
+        did[ch] = didCh[inputCh];
+
+        auto const did_to_use = DetId{didCh[inputCh]};
+
+        auto const isBarrel = did_to_use.subdetId() == EcalBarrel;
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId());
+
+        float const intercalib_to_use = intercalib[hashedId];
+
+        // get laser coefficient
+        float lasercalib = 1.;
+
+        //
+        // AM: ideas
+        //
+        //    One possibility is to create the map of laser corrections once on CPU
+        //    for all crystals and push them on GPU.
+        //    Then only if the LS is different, update the laser correction
+        //    The variation within a LS is not worth pursuing (<< 0.1% !!)
+        //    and below the precision we can claim on the laser corrections (right?).
+        //    This will save quite some time (also for the CPU version?)
+        //
+
+        int iLM = 1;
+
+        if (isBarrel) {
+          iLM = ecal::reconstruction::laser_monitoring_region_EB(did_to_use.rawId());
+        } else {
+          iLM = ecal::reconstruction::laser_monitoring_region_EE(did_to_use.rawId());
+        }
+
+        long long t_i = 0, t_f = 0;
+        float p_i = 0, p_f = 0;
+        long long lt_i = 0, lt_f = 0;
+        float lp_i = 0, lp_f = 0;
+
+        // laser
+        if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) {
+          t_i = t1[iLM - 1];
+          t_f = t2[iLM - 1];
+          p_i = p1[hashedId];
+          p_f = p2[hashedId];
+        } else if (event_time >= t2[iLM - 1] && event_time <= t3[iLM - 1]) {
+          t_i = t2[iLM - 1];
+          t_f = t3[iLM - 1];
+          p_i = p2[hashedId];
+          p_f = p3[hashedId];
+        } else if (event_time < t1[iLM - 1]) {
+          t_i = t1[iLM - 1];
+          t_f = t2[iLM - 1];
+          p_i = p1[hashedId];
+          p_f = p2[hashedId];
+
+        } else if (event_time > t3[iLM - 1]) {
+          t_i = t2[iLM - 1];
+          t_f = t3[iLM - 1];
+          p_i = p2[hashedId];
+          p_f = p3[hashedId];
+        }
+
+        // linear corrections
+        if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) {
+          lt_i = lt1[iLM - 1];
+          lt_f = lt2[iLM - 1];
+          lp_i = lp1[hashedId];
+          lp_f = lp2[hashedId];
+        } else if (event_time >= lt2[iLM - 1] && event_time <= lt3[iLM - 1]) {
+          lt_i = lt2[iLM - 1];
+          lt_f = lt3[iLM - 1];
+          lp_i = lp2[hashedId];
+          lp_f = lp3[hashedId];
+        } else if (event_time < lt1[iLM - 1]) {
+          lt_i = lt1[iLM - 1];
+          lt_f = lt2[iLM - 1];
+          lp_i = lp1[hashedId];
+          lp_f = lp2[hashedId];
+
+        } else if (event_time > lt3[iLM - 1]) {
+          lt_i = lt2[iLM - 1];
+          lt_f = lt3[iLM - 1];
+          lp_i = lp2[hashedId];
+          lp_f = lp3[hashedId];
+        }
+
+        // apdpnref and alpha
+        float apdpnref = apdpnrefs[hashedId];
+        float alpha = alphas[hashedId];
+
+        // now calculate transparency correction
+        if (apdpnref != 0 && (t_i - t_f) != 0 && (lt_i - lt_f) != 0) {
+          long long tt = event_time;  // never subtract two unsigned!
+          float interpolatedLaserResponse =
+              p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i));
+
+          float interpolatedLinearResponse =
+              lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i));  // FIXED BY FC
+
+          if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) {
+            interpolatedLinearResponse = 1.f;
+          }
+          if (interpolatedLaserResponse <= 0.) {
+            // AM :  how the heck is it possible?
+            //             interpolatedLaserResponse = 0.0001;
+            lasercalib = 1.;
+
+          } else {
+            float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse;
+
+            // ... and now this:
+            lasercalib = 1.f / (std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse);
+          }
+        }
+
+        //
+        // Check for channels to be excluded from reconstruction
+        //
+        //
+        // Default energy? Not to be updated if "ChannelStatusToBeExcluded"
+        // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat"
+        //
+        energy[ch] = -1;  //---- AM: default, un-physical, ok
+
+        // truncate the chi2
+        if (chi2_in[inputCh] > 64)
+          chi2[ch] = 64;
+        else
+          chi2[ch] = chi2_in[inputCh];
+
+        // default values for the flags
+        flagBits[ch] = 0;
+        extra[ch] = 0;
+
+        static const int chStatusMask = 0x1f;
+        // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same
+        int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask);
+        if (ChannelStatusToBeExcludedSize != 0) {
+          bool skip_this_channel = false;
+          for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) {
+            if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) {
+              skip_this_channel = true;
+              break;
+            }
+          }
+          if (skip_this_channel) {
+            // skip this channel
+            continue;
+          }
+        }
+
+        // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word
+
+        //
+        // AM: get the smaller "flagbit_counter" with match
+        //
+
+        uint32_t temporary_flagBits = 0;
+
+        int iterator_flags = 0;
+        bool need_to_exit = false;
+        int flagbit_counter = 0;
+        while (!need_to_exit) {
+          iterator_flags = 0;
+          for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) {
+            // check the correct "flagbit"
+            if (expanded_flagbit_v_DB_reco_flags[i] == flagbit_counter) {
+              for (unsigned int j = 0; j < expanded_Sizes_v_DB_reco_flags[i]; j++) {
+                if (expanded_v_DB_reco_flags[iterator_flags] == dbstatus) {
+                  temporary_flagBits = 0x1 << expanded_flagbit_v_DB_reco_flags[i];
+                  need_to_exit = true;
+                  break;  // also from the big loop!!!
+                }
+                iterator_flags++;
+              }
+            } else {
+              // if not, got to the next bunch directly
+              iterator_flags += expanded_Sizes_v_DB_reco_flags[i];
+            }
+
+            if (need_to_exit) {
+              break;
+            }
+          }
+          flagbit_counter += 1;
+        }
+
+        flagBits[ch] = temporary_flagBits;
+
+        if ((flagmask & temporary_flagBits) && killDeadChannels) {
+          // skip this channel
+          continue;
+        }
+
+        //
+        // multiply the adc counts with factors to get GeV
+        //
+
+        //         energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ;
+        energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib;
+
+        // Time is not saved so far, FIXME
+        //         time[ch] = time_in[inputCh];
+
+        // NB: calculate the "flagBits extra"  --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ...
+
+        //
+        // extra packing ...
+        //
+
+        uint32_t offset;
+        uint32_t width;
+        uint32_t value;
+
+        float chi2_temp = chi2[ch];
+        if (chi2_temp > 64)
+          chi2_temp = 64;
+        // use 7 bits
+        uint32_t rawChi2 = lround(chi2_temp / 64. * ((1 << 7) - 1));
+
+        offset = 0;
+        width = 7;
+        value = 0;
+
+        uint32_t mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (rawChi2 & ((1U << width) - 1)) << offset;
+
+        //         extra[ch] = value;
+        //
+
+        // rawEnergy is actually "error" !!!
+        uint32_t rawEnergy = 0;
+
+        // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA
+        //            if you want to store this in "extra", we need first to add it to the uncalibrecHit results
+        //            then it will be something like the following
+        //         amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib
+        //
+        //
+
+        float amplitudeError_ch = 0.;  // amplitudeError[ch];
+
+        if (amplitudeError_ch > 0.001) {
+          //           uint16_t exponent = getPower10(amplitudeError_ch);
+
+          static constexpr float p10[] = {1.e-2f, 1.e-1f, 1.f, 1.e1f, 1.e2f, 1.e3f, 1.e4f, 1.e5f, 1.e6f};
+          int b = amplitudeError_ch < p10[4] ? 0 : 5;
+          for (; b < 9; ++b)
+            if (amplitudeError_ch < p10[b])
+              break;
+
+          uint16_t exponent = b;
+
+          static constexpr float ip10[] = {1.e5f, 1.e4f, 1.e3f, 1.e2f, 1.e1f, 1.e0f, 1.e-1f, 1.e-2f, 1.e-3f, 1.e-4};
+          uint16_t significand = lround(amplitudeError_ch * ip10[exponent]);
+          // use 13 bits (3 exponent, 10 significand)
+          rawEnergy = exponent << 10 | significand;
+        }
+
+        offset = 8;
+        width = 13;
+        // value from last change, ok
+
+        mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (rawEnergy & ((1U << width) - 1)) << offset;
+
+        uint32_t jitterErrorBits = 0;
+        jitterErrorBits = jitterErrorBits & 0xFF;
+
+        offset = 24;
+        width = 8;
+        // value from last change, ok
+
+        mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (jitterErrorBits & ((1U << width) - 1)) << offset;
+
+        //
+        // now finally set "extra[ch]"
+        //
+        extra[ch] = value;
+
+        //
+        // additional flags setting
+        //
+        // using correctly the flags as calculated at the UncalibRecHit stage
+        //
+        // Now fill flags
+
+        bool good = true;
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kLeadingEdgeRecovered))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kLeadingEdgeRecovered));
+          good = false;
+        }
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) {
+          // leading edge recovery failed - still keep the information
+          // about the saturation and do not flag as dead
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated));
+          good = false;
+        }
+
+        //
+        // AM: why do we have two tests one after the other checking almost the same thing???
+        // Please clean up the code, ... also the original one!
+        //
+        // uncalibRH.isSaturated() --->
+        //
+        //                                   bool EcalUncalibratedRecHit::isSaturated() const {
+        //                                     return EcalUncalibratedRecHit::checkFlag(kSaturated);
+        //                                   }
+        //
+        //
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated));
+          good = false;
+        }
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kOutOfTime))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime));
+          good = false;
+        }
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kPoorReco))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco));
+          good = false;
+        }
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain6))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6));
+        }
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain1))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1));
+        }
+
+        if (good) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kGood));
+        }
+
+        if (isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib));
+        }
+        if (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX)) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib));
+        }
+
+        // recover, killing, and other stuff
+
+        //
+        // Structure:
+        //  EB
+        //  EE
+        //
+        //
+        //  - single MVA
+        //  - democratic sharing
+        //  - kill all the other cases
+        //
+
+        bool is_Single = false;
+        bool is_FE = false;
+        bool is_VFE = false;
+
+        bool is_recoverable = false;  // DetIdToBeRecovered
+
+        if (dbstatus == 10 || dbstatus == 11 || dbstatus == 12) {
+          is_recoverable = true;
+        }
+
+        if (is_recoverable) {
+          if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
+            is_VFE = true;
+          } else if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
+            is_FE = true;
+          } else {
+            is_Single = true;
+          }
+
+          // EB
+          if (isBarrel) {
+            if (is_Single || is_FE || is_VFE) {
+              // single MVA
+              if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) {
+              }
+              // decmocratic sharing
+              else if (is_FE && (recoverEBFE || !killDeadChannels)) {
+              }
+              // kill all the other cases
+              else {
+                energy[ch] = 0.;  // Need to set also the flags ...
+              }
+            }
+          }
+          // EE
+          else {
+            if (is_Single || is_FE || is_VFE) {
+              // single MVA
+              if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) {
+              }
+              // decmocratic sharing
+              else if (is_FE && (recoverEBFE || !killDeadChannels)) {
+                //
+                //  Code is definitely too long ...
+                //
+
+              }
+              // kill all the other cases
+              else {
+                energy[ch] = 0.;  // Need to set also the flags ...
+              }
+            }
+          }
+        }
+
+      }  // end channel
+    }
+
+    // host version, to be called by the plugin
+    void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+                           EventOutputDataGPU& eventOutputGPU,
+                           //     eventDataForScratchGPU_,
+                           ConditionsProducts const& conditions,
+                           ConfigurationParameters const& configParameters,
+                           uint32_t const nChannelsBarrel,
+                           edm::TimeValue_t const event_time,
+                           cudaStream_t cudaStream) {
+      int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size;
+
+      //       unsigned int nchannels_per_block = 32;
+      unsigned int nchannels_per_block = 16;
+      unsigned int threads_min = nchannels_per_block;
+      unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min;  // TEST : to be optimized (AM)
+
+      //
+      // kernel create rechit
+      //
+
+      //       auto const nbytesShared = 2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
+
+      kernel_create_ecal_rehit<<<blocks_min, threads_min, 0, cudaStream>>>(
+          //       kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> (
+          //       kernel_create_ecal_rehit <<< blocks_min, threads_min >>> (
+          // configuration
+          configParameters.ChannelStatusToBeExcluded,
+          configParameters.ChannelStatusToBeExcludedSize,
+          configParameters.killDeadChannels,
+          configParameters.recoverEBIsolatedChannels,
+          configParameters.recoverEEIsolatedChannels,
+          configParameters.recoverEBVFE,
+          configParameters.recoverEEVFE,
+          configParameters.recoverEBFE,
+          configParameters.recoverEEFE,
+          configParameters.EBLaserMIN,
+          configParameters.EELaserMIN,
+          configParameters.EBLaserMAX,
+          configParameters.EELaserMAX,
+          // for flags setting
+          configParameters.expanded_v_DB_reco_flags,
+          configParameters.expanded_Sizes_v_DB_reco_flags,
+          configParameters.expanded_flagbit_v_DB_reco_flags,
+          configParameters.expanded_v_DB_reco_flagsSize,
+          configParameters.flagmask,
+          // conditions
+          conditions.ADCToGeV.adc2gev,
+          conditions.Intercalib.values,
+          conditions.ChannelStatus.status,
+          conditions.LaserAPDPNRatiosRef.values,
+          conditions.LaserAlphas.values,
+          // input for transparency corrections
+          conditions.LaserAPDPNRatios.p1,
+          conditions.LaserAPDPNRatios.p2,
+          conditions.LaserAPDPNRatios.p3,
+          conditions.LaserAPDPNRatios.t1,
+          conditions.LaserAPDPNRatios.t2,
+          conditions.LaserAPDPNRatios.t3,
+          // input for linear corrections
+          conditions.LinearCorrections.p1,
+          conditions.LinearCorrections.p2,
+          conditions.LinearCorrections.p3,
+          conditions.LinearCorrections.t1,
+          conditions.LinearCorrections.t2,
+          conditions.LinearCorrections.t3,
+          // time, used for time dependent corrections
+          event_time,
+          // input
+          eventInputGPU.ebUncalibRecHits.did,
+          eventInputGPU.eeUncalibRecHits.did,
+          eventInputGPU.ebUncalibRecHits.amplitude,
+          eventInputGPU.eeUncalibRecHits.amplitude,
+          eventInputGPU.ebUncalibRecHits.jitter,
+          eventInputGPU.eeUncalibRecHits.jitter,
+          eventInputGPU.ebUncalibRecHits.chi2,
+          eventInputGPU.eeUncalibRecHits.chi2,
+          eventInputGPU.ebUncalibRecHits.flags,
+          eventInputGPU.eeUncalibRecHits.flags,
+          // output
+          eventOutputGPU.did,
+          eventOutputGPU.energy,
+          eventOutputGPU.time,
+          eventOutputGPU.chi2,
+          eventOutputGPU.flagBits,
+          eventOutputGPU.extra,
+          // other
+          nchannels,
+          nChannelsBarrel,
+          conditions.offsetForHashes);
+    }
+
+  }  // namespace rechit
+
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h
new file mode 100644
index 0000000000000..831c602a24e61
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h
@@ -0,0 +1,94 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
+
+//
+// Builder of ECAL RecHits on GPU
+//
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "DataFormats/Provenance/interface/Timestamp.h"
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+
+namespace ecal {
+  namespace rechit {
+
+    __global__ void kernel_create_ecal_rehit(
+        // configuration
+        int const* ChannelStatusToBeExcluded,
+        uint32_t ChannelStatusToBeExcludedSize,
+        bool killDeadChannels,
+        bool const recoverEBIsolatedChannels,
+        bool const recoverEEIsolatedChannels,
+        bool const recoverEBVFE,
+        bool const recoverEEVFE,
+        bool const recoverEBFE,
+        bool const recoverEEFE,
+        // for flags setting
+        int const* expanded_v_DB_reco_flags,
+        uint32_t const* expanded_Sizes_v_DB_reco_flags,
+        uint32_t const* expanded_flagbit_v_DB_reco_flags,
+        uint32_t expanded_v_DB_reco_flagsSize,
+        uint32_t flagmask,
+        // conditions
+        float const* adc2gev,
+        float const* intercalib,
+        uint16_t const* status,
+        float const* apdpnrefs,
+        float const* alphas,
+        // input for transparency corrections
+        float const* p1,
+        float const* p2,
+        float const* p3,
+        edm::TimeValue_t const* t1,
+        edm::TimeValue_t const* t2,
+        edm::TimeValue_t const* t3,
+        // input for linear corrections
+        float const* lp1,
+        float const* lp2,
+        float const* lp3,
+        edm::TimeValue_t const* lt1,
+        edm::TimeValue_t const* lt2,
+        edm::TimeValue_t const* lt3,
+        // time, used for time dependent corrections
+        edm::TimeValue_t const event_time,
+        // input
+        uint32_t const* did_eb,
+        uint32_t const* did_ee,
+        ::ecal::reco::StorageScalarType const* amplitude_eb,  // in adc counts
+        ::ecal::reco::StorageScalarType const* amplitude_ee,  // in adc counts
+        ::ecal::reco::StorageScalarType const* time_eb,
+        ::ecal::reco::StorageScalarType const* time_ee,
+        ::ecal::reco::StorageScalarType const* chi2_eb,
+        ::ecal::reco::StorageScalarType const* chi2_ee,
+        uint32_t const* flags_eb,
+        uint32_t const* flags_ee,
+        // output
+        uint32_t* did,
+        ::ecal::reco::StorageScalarType* energy,  // in energy [GeV]
+        ::ecal::reco::StorageScalarType* time,
+        ::ecal::reco::StorageScalarType* chi2,
+        uint32_t* flagBits,
+        uint32_t* extra,
+        int const nchannels,
+        uint32_t const nChannelsBarrel,
+        uint32_t const offsetForHashes);
+
+    // host version, to be called by the plugin
+
+    void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+                           EventOutputDataGPU& eventOutputGPU,
+                           //     eventDataForScratchGPU_,
+                           ConditionsProducts const& conditions,
+                           ConfigurationParameters const& configParameters,
+                           uint32_t const nChannelsBarrel,
+                           edm::TimeValue_t const event_time,
+                           cudaStream_t cudaStream);
+
+  }  // namespace rechit
+
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
index 151762c6b63d3..5766fa0ffcb85 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
@@ -1,18 +1,16 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
+#include <iostream>
 
-// algorithm specific
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
 #include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
 
-#include <iostream>
+#include "Common.h"
 
 class EcalRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
 public:
@@ -57,7 +55,7 @@ EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {}
 void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
   auto const& hRecHitsGPUEB = event.get(recHitsGPUEB_);
   auto const& hRecHitsGPUEE = event.get(recHitsGPUEE_);
-  
+
   auto recHitsCPUEB = std::make_unique<EBRecHitCollection>();
   auto recHitsCPUEE = std::make_unique<EERecHitCollection>();
   recHitsCPUEB->reserve(hRecHitsGPUEB.energy.size());
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index 248248a4cca88..c13d00a530feb 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -1,50 +1,33 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-// format
-#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
-
-// needed for definition of flags
-#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
-
-// the kernels
-#include "RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h"
-
-// conditions cpu
+#include "CommonTools/Utils/interface/StringToEnumValue.h"
 #include "CondFormats/DataRecord/interface/EcalADCToGeVConstantRcd.h"
-#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h"
 #include "CondFormats/DataRecord/interface/EcalChannelStatusRcd.h"
-
+#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h"
 #include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRcd.h"
 #include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRefRcd.h"
 #include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h"
 #include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h"
-
-// conditions gpu
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
-
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
 
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-
-// configuration
-#include "CommonTools/Utils/interface/StringToEnumValue.h"
+#include "EcalRecHitBuilderKernels.h"
 
 class EcalRecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -175,18 +158,16 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
 
   configParameters_.ChannelStatusToBeExcludedSize = v_chstatus_.size();
 
-  
   // call CUDA API functions only if CUDA is available
   edm::Service<CUDAService> cs;
   if (cs and cs->enabled()) {
-    
     cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size()));
     cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded,
                          v_chstatus_.data(),
                          v_chstatus_.size() * sizeof(int),
                          cudaMemcpyHostToDevice));
   }
-  
+
   //
   //     https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc
   //
@@ -216,33 +197,33 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
   // call CUDA API functions only if CUDA is available
   if (cs and cs->enabled()) {
     // actual values
-    cudaCheck(
-        cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size()));
-    
+    cudaCheck(cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags,
+                         sizeof(int) * expanded_v_DB_reco_flags_.size()));
+
     cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags,
                          expanded_v_DB_reco_flags_.data(),
                          expanded_v_DB_reco_flags_.size() * sizeof(int),
                          cudaMemcpyHostToDevice));
-    
+
     // sizes
     cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags,
                          sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size()));
-    
+
     cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags,
                          expanded_Sizes_v_DB_reco_flags_.data(),
                          expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t),
                          cudaMemcpyHostToDevice));
-    
+
     // keys
     cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags,
                          sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size()));
-    
+
     cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags,
                          expanded_flagbit_v_DB_reco_flags_.data(),
                          expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t),
                          cudaMemcpyHostToDevice));
   }
-  
+
   configParameters_.expanded_v_DB_reco_flagsSize = expanded_flagbit_v_DB_reco_flags_.size();
 
   flagmask_ = 0;
@@ -266,7 +247,6 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
 }
 
 EcalRecHitProducerGPU::~EcalRecHitProducerGPU() {
-  
   edm::Service<CUDAService> cs;
   if (cs and cs->enabled()) {
     // free event ouput data
@@ -303,7 +283,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
   if ((neb_ + nee_) > maxNumberHits_) {
     edm::LogError("EcalRecHitProducerGPU") << "max number of channels exceeded. See options 'maxNumberHits' ";
   }
-  
+
   int nchannelsEB = ebUncalibRecHits.size;  // --> offsetForInput, first EB and then EE
 
   // conditions
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
index 20f51ea5245df..69c6b7a67831c 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
@@ -1,18 +1,16 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
+#include <iostream>
 
-// algorithm specific
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
-#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
 
-#include <iostream>
+#include "Common.h"
 
 class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
 public:
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
new file mode 100644
index 0000000000000..e79a5dff85d9f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
@@ -0,0 +1,312 @@
+#include <iostream>
+#include <limits>
+
+#include <cuda.h>
+
+#include "CondFormats/EcalObjects/interface/EcalMGPAGainRatio.h"
+#include "CondFormats/EcalObjects/interface/EcalPedestals.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "CondFormats/EcalObjects/interface/EcalSampleMask.h"
+#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
+#include "CondFormats/EcalObjects/interface/EcalXtalGroupId.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+
+#include "AmplitudeComputationCommonKernels.h"
+#include "AmplitudeComputationKernels.h"
+#include "Common.h"
+#include "EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
+#include "TimeComputationKernels.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+  namespace multifit {
+
+    void entryPoint(EventInputDataGPU const& eventInputGPU,
+                    EventOutputDataGPU& eventOutputGPU,
+                    EventDataForScratchGPU& scratch,
+                    ConditionsProducts const& conditions,
+                    ConfigurationParameters const& configParameters,
+                    cudaStream_t cudaStream) {
+      using digis_type = std::vector<uint16_t>;
+      using dids_type = std::vector<uint32_t>;
+      // accodring to the cpu setup  //----> hardcoded
+      bool const gainSwitchUseMaxSampleEB = true;
+      // accodring to the cpu setup  //----> hardcoded
+      bool const gainSwitchUseMaxSampleEE = false;
+
+      uint32_t const offsetForHashes = conditions.offsetForHashes;
+      uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
+      unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis;
+
+      //
+      // 1d preparation kernel
+      //
+      unsigned int nchannels_per_block = 32;
+      unsigned int threads_1d = 10 * nchannels_per_block;
+      unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 1 : (totalChannels * 10 + threads_1d - 1) / threads_1d;
+      int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES *
+                         (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool));
+      kernel_prep_1d_and_initialize<<<blocks_1d, threads_1d, shared_bytes, cudaStream>>>(
+          conditions.pulseShapes.values,
+          eventInputGPU.ebDigis.data,
+          eventInputGPU.ebDigis.ids,
+          eventInputGPU.eeDigis.data,
+          eventInputGPU.eeDigis.ids,
+          scratch.samples,
+          (SampleVector*)eventOutputGPU.amplitudesAll,
+          scratch.gainsNoise,
+          conditions.pedestals.mean_x1,
+          conditions.pedestals.mean_x12,
+          conditions.pedestals.rms_x12,
+          conditions.pedestals.mean_x6,
+          conditions.gainRatios.gain6Over1,
+          conditions.gainRatios.gain12Over6,
+          scratch.hasSwitchToGain6,
+          scratch.hasSwitchToGain1,
+          scratch.isSaturated,
+          eventOutputGPU.amplitude,
+          eventOutputGPU.chi2,
+          eventOutputGPU.pedestal,
+          eventOutputGPU.did,
+          eventOutputGPU.flags,
+          scratch.acState,
+          scratch.activeBXs,
+          offsetForHashes,
+          offsetForInputs,
+          gainSwitchUseMaxSampleEB,
+          gainSwitchUseMaxSampleEE,
+          totalChannels);
+      cudaCheck(cudaGetLastError());
+
+      //
+      // 2d preparation kernel
+      //
+      int blocks_2d = totalChannels;
+      dim3 threads_2d{10, 10};
+      kernel_prep_2d<<<blocks_2d, threads_2d, 0, cudaStream>>>(scratch.gainsNoise,
+                                                               eventInputGPU.ebDigis.ids,
+                                                               eventInputGPU.eeDigis.ids,
+                                                               conditions.pedestals.rms_x12,
+                                                               conditions.pedestals.rms_x6,
+                                                               conditions.pedestals.rms_x1,
+                                                               conditions.gainRatios.gain12Over6,
+                                                               conditions.gainRatios.gain6Over1,
+                                                               conditions.samplesCorrelation.EBG12SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EBG6SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EBG1SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG12SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG6SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG1SamplesCorrelation,
+                                                               scratch.noisecov,
+                                                               scratch.pulse_matrix,
+                                                               conditions.pulseShapes.values,
+                                                               scratch.hasSwitchToGain6,
+                                                               scratch.hasSwitchToGain1,
+                                                               scratch.isSaturated,
+                                                               offsetForHashes,
+                                                               offsetForInputs);
+      cudaCheck(cudaGetLastError());
+
+      // run minimization kernels
+      v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
+
+      if (configParameters.shouldRunTimingComputation) {
+        //
+        // TODO: this guy can run concurrently with other kernels,
+        // there is no dependence on the order of execution
+        //
+        unsigned int threads_time_init = threads_1d;
+        unsigned int blocks_time_init = blocks_1d;
+        int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar);
+        kernel_time_computation_init<<<blocks_time_init, threads_time_init, sharedBytesInit, cudaStream>>>(
+            eventInputGPU.ebDigis.data,
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.data,
+            eventInputGPU.eeDigis.ids,
+            conditions.pedestals.rms_x12,
+            conditions.pedestals.rms_x6,
+            conditions.pedestals.rms_x1,
+            conditions.pedestals.mean_x12,
+            conditions.pedestals.mean_x6,
+            conditions.pedestals.mean_x1,
+            conditions.gainRatios.gain12Over6,
+            conditions.gainRatios.gain6Over1,
+            scratch.sample_values,
+            scratch.sample_value_errors,
+            scratch.ampMaxError,
+            scratch.useless_sample_values,
+            scratch.pedestal_nums,
+            offsetForHashes,
+            offsetForInputs,
+            conditions.sampleMask.getEcalSampleMaskRecordEB(),
+            conditions.sampleMask.getEcalSampleMaskRecordEE(),
+            totalChannels);
+        cudaCheck(cudaGetLastError());
+
+        //
+        // TODO: small kernel only for EB. It needs to be checked if
+        /// fusing such small kernels is beneficial in here
+        //
+        // we are running only over EB digis
+        // therefore we need to create threads/blocks only for that
+        unsigned int const threadsFixMGPA = threads_1d;
+        unsigned int const blocksFixMGPA =
+            threadsFixMGPA > 10 * eventInputGPU.ebDigis.ndigis
+                ? 1
+                : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) / threadsFixMGPA;
+        kernel_time_compute_fixMGPAslew<<<blocksFixMGPA, threadsFixMGPA, 0, cudaStream>>>(
+            eventInputGPU.ebDigis.data,
+            eventInputGPU.eeDigis.data,
+            scratch.sample_values,
+            scratch.sample_value_errors,
+            scratch.useless_sample_values,
+            conditions.sampleMask.getEcalSampleMaskRecordEB(),
+            totalChannels,
+            offsetForInputs);
+        cudaCheck(cudaGetLastError());
+
+        //
+        //
+        //
+        int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar);
+        auto const threads_nullhypot = threads_1d;
+        auto const blocks_nullhypot = blocks_1d;
+        kernel_time_compute_nullhypot<<<blocks_nullhypot, threads_nullhypot, sharedBytes, cudaStream>>>(
+            scratch.sample_values,
+            scratch.sample_value_errors,
+            scratch.useless_sample_values,
+            scratch.chi2sNullHypot,
+            scratch.sum0sNullHypot,
+            scratch.sumAAsNullHypot,
+            totalChannels);
+        cudaCheck(cudaGetLastError());
+
+        unsigned int nchannels_per_block_makeratio = 10;
+        unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
+        unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels
+                                            ? 1
+                                            : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
+        int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar);
+        kernel_time_compute_makeratio<<<blocks_makeratio, threads_makeratio, sharedBytesMakeRatio, cudaStream>>>(
+            scratch.sample_values,
+            scratch.sample_value_errors,
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.ids,
+            scratch.useless_sample_values,
+            scratch.pedestal_nums,
+            configParameters.amplitudeFitParametersEB,
+            configParameters.amplitudeFitParametersEE,
+            configParameters.timeFitParametersEB,
+            configParameters.timeFitParametersEE,
+            scratch.sumAAsNullHypot,
+            scratch.sum0sNullHypot,
+            scratch.tMaxAlphaBetas,
+            scratch.tMaxErrorAlphaBetas,
+            scratch.accTimeMax,
+            scratch.accTimeWgt,
+            scratch.tcState,
+            configParameters.timeFitParametersSizeEB,
+            configParameters.timeFitParametersSizeEE,
+            configParameters.timeFitLimitsFirstEB,
+            configParameters.timeFitLimitsFirstEE,
+            configParameters.timeFitLimitsSecondEB,
+            configParameters.timeFitLimitsSecondEE,
+            totalChannels,
+            offsetForInputs);
+        cudaCheck(cudaGetLastError());
+
+        //
+        //
+        //
+        auto const threads_findamplchi2 = threads_1d;
+        auto const blocks_findamplchi2 = blocks_1d;
+        int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar);
+        kernel_time_compute_findamplchi2_and_finish<<<blocks_findamplchi2,
+                                                      threads_findamplchi2,
+                                                      sharedBytesFindAmplChi2,
+                                                      cudaStream>>>(scratch.sample_values,
+                                                                    scratch.sample_value_errors,
+                                                                    eventInputGPU.ebDigis.ids,
+                                                                    eventInputGPU.eeDigis.ids,
+                                                                    scratch.useless_sample_values,
+                                                                    scratch.tMaxAlphaBetas,
+                                                                    scratch.tMaxErrorAlphaBetas,
+                                                                    scratch.accTimeMax,
+                                                                    scratch.accTimeWgt,
+                                                                    configParameters.amplitudeFitParametersEB,
+                                                                    configParameters.amplitudeFitParametersEE,
+                                                                    scratch.sumAAsNullHypot,
+                                                                    scratch.sum0sNullHypot,
+                                                                    scratch.chi2sNullHypot,
+                                                                    scratch.tcState,
+                                                                    scratch.ampMaxAlphaBeta,
+                                                                    scratch.ampMaxError,
+                                                                    scratch.timeMax,
+                                                                    scratch.timeError,
+                                                                    totalChannels,
+                                                                    offsetForInputs);
+        cudaCheck(cudaGetLastError());
+
+        //
+        //
+        //
+        auto const threads_timecorr = 32;
+        auto const blocks_timecorr =
+            threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr;
+        kernel_time_correction_and_finalize<<<blocks_timecorr, threads_timecorr, 0, cudaStream>>>(
+            eventOutputGPU.amplitude,
+            eventInputGPU.ebDigis.data,
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.data,
+            eventInputGPU.eeDigis.ids,
+            conditions.timeBiasCorrections.EBTimeCorrAmplitudeBins,
+            conditions.timeBiasCorrections.EETimeCorrAmplitudeBins,
+            conditions.timeBiasCorrections.EBTimeCorrShiftBins,
+            conditions.timeBiasCorrections.EETimeCorrShiftBins,
+            scratch.timeMax,
+            scratch.timeError,
+            conditions.pedestals.rms_x12,
+            conditions.timeCalibConstants.values,
+            eventOutputGPU.jitter,
+            eventOutputGPU.jitterError,
+            eventOutputGPU.flags,
+            conditions.timeBiasCorrections.EBTimeCorrAmplitudeBinsSize,
+            conditions.timeBiasCorrections.EETimeCorrAmplitudeBinsSize,
+            configParameters.timeConstantTermEB,
+            configParameters.timeConstantTermEE,
+            conditions.timeOffsetConstant.getEBValue(),
+            conditions.timeOffsetConstant.getEEValue(),
+            configParameters.timeNconstEB,
+            configParameters.timeNconstEE,
+            configParameters.amplitudeThreshEB,
+            configParameters.amplitudeThreshEE,
+            configParameters.outOfTimeThreshG12pEB,
+            configParameters.outOfTimeThreshG12pEE,
+            configParameters.outOfTimeThreshG12mEB,
+            configParameters.outOfTimeThreshG12mEE,
+            configParameters.outOfTimeThreshG61pEB,
+            configParameters.outOfTimeThreshG61pEE,
+            configParameters.outOfTimeThreshG61mEB,
+            configParameters.outOfTimeThreshG61mEE,
+            offsetForHashes,
+            offsetForInputs,
+            totalChannels);
+        cudaCheck(cudaGetLastError());
+      }
+
+      /*
+    cudaEventRecord(end_event, 0);
+    cudaEventSynchronize(end_event);
+    float ms;
+    cudaEventElapsedTime(&ms, start_event, end_event);
+    std::cout << "elapsed time = " << ms << std::endl;
+    */
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h
new file mode 100644
index 0000000000000..93923e63be632
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h
@@ -0,0 +1,23 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h
+
+#include <vector>
+
+#include <cuda.h>
+
+#include "DeclsForKernels.h"
+
+namespace ecal {
+  namespace multifit {
+
+    void entryPoint(EventInputDataGPU const&,
+                    EventOutputDataGPU&,
+                    EventDataForScratchGPU&,
+                    ConditionsProducts const&,
+                    ConfigurationParameters const&,
+                    cudaStream_t);
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
index cdf21e3240954..5bd54787ee155 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
@@ -1,44 +1,35 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-// algorithm specific
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
-#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
-
 #include <iostream>
 
-#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
-
-#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 #include "CondFormats/DataRecord/interface/EcalGainRatiosRcd.h"
-#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h"
 #include "CondFormats/DataRecord/interface/EcalPulseCovariancesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h"
 #include "CondFormats/DataRecord/interface/EcalSamplesCorrelationRcd.h"
 #include "CondFormats/DataRecord/interface/EcalTimeBiasCorrectionsRcd.h"
 #include "CondFormats/DataRecord/interface/EcalTimeCalibConstantsRcd.h"
 #include "CondFormats/DataRecord/interface/EcalTimeOffsetConstantRcd.h"
-#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h"
-
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
 
-#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
 
 class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -290,7 +281,7 @@ void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event,
   if ((neb_ + nee_) > maxNumberHits_) {
     edm::LogError("EcalUncalibRecHitProducerGPU") << "max number of channels exceeded. See options 'maxNumberHits' ";
   }
-  
+
   // conditions
   setup.get<EcalPedestalsRcd>().get(pedestalsHandle_);
   setup.get<EcalGainRatiosRcd>().get(gainRatiosHandle_);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h
new file mode 100644
index 0000000000000..bbf9cb0dbb5c9
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h
@@ -0,0 +1,49 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
+
+#include <array>
+
+#include <Eigen/Dense>
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+
+namespace ecal {
+  namespace multifit {
+
+    constexpr int SampleVectorSize = 10;
+    constexpr int FullSampleVectorSize = 19;
+    constexpr int PulseVectorSize = 12;
+    constexpr int NGains = 3;
+
+    using data_type = ::ecal::reco::ComputationScalarType;
+
+    typedef Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize> PulseMatrixType;
+    typedef Eigen::Matrix<char, SampleVectorSize, 1> BXVectorType;
+    using SampleMatrixD = Eigen::Matrix<double, SampleVectorSize, SampleVectorSize>;
+
+    typedef Eigen::Matrix<data_type, SampleVectorSize, 1> SampleVector;
+    typedef Eigen::Matrix<data_type, FullSampleVectorSize, 1> FullSampleVector;
+    typedef Eigen::Matrix<data_type, Eigen::Dynamic, 1, 0, PulseVectorSize, 1> PulseVector;
+    typedef Eigen::Matrix<char, Eigen::Dynamic, 1, 0, PulseVectorSize, 1> BXVector;
+    typedef Eigen::Matrix<char, SampleVectorSize, 1> SampleGainVector;
+    typedef Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize> SampleMatrix;
+    typedef Eigen::Matrix<data_type, FullSampleVectorSize, FullSampleVectorSize> FullSampleMatrix;
+    typedef Eigen::Matrix<data_type, Eigen::Dynamic, Eigen::Dynamic, 0, PulseVectorSize, PulseVectorSize> PulseMatrix;
+    typedef Eigen::Matrix<data_type, SampleVectorSize, Eigen::Dynamic, 0, SampleVectorSize, PulseVectorSize>
+        SamplePulseMatrix;
+    typedef Eigen::LLT<SampleMatrix> SampleDecompLLT;
+    typedef Eigen::LLT<SampleMatrixD> SampleDecompLLTD;
+    typedef Eigen::LLT<PulseMatrix> PulseDecompLLT;
+    typedef Eigen::LDLT<PulseMatrix> PulseDecompLDLT;
+
+    typedef Eigen::Matrix<data_type, 1, 1> SingleMatrix;
+    typedef Eigen::Matrix<data_type, 1, 1> SingleVector;
+
+    typedef std::array<SampleMatrixD, NGains> SampleMatrixGainArray;
+
+    using PermutationMatrix = Eigen::PermutationMatrix<SampleMatrix::RowsAtCompileTime>;
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu
new file mode 100644
index 0000000000000..00c61e04e1596
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu
@@ -0,0 +1,323 @@
+#include "DataFormats/EcalDetId/interface/EBDetId.h"
+#include "DataFormats/EcalDetId/interface/EEDetId.h"
+
+#include "KernelHelpers.h"
+
+namespace ecal {
+  namespace reconstruction {
+
+    namespace internal {
+
+      namespace barrel {
+
+        __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; }
+
+        __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; }
+
+        __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; }
+
+        __device__ int dccFromSm(int ism) {
+          int iz = 1;
+          if (ism > 18)
+            iz = -1;
+          if (iz == -1)
+            ism -= 18;
+          int idcc = 9 + ism;
+          if (iz == +1)
+            idcc += 18;
+          return idcc;
+        }
+
+        __device__ int sm(int ieta, int iphi) {
+          int iz = 1;
+          if (ieta < 0)
+            iz = -1;
+          ieta *= iz;
+          int iphi_ = iphi;
+          if (iphi_ > 360)
+            iphi_ -= 360;
+          int ism = (iphi_ - 1) / 20 + 1;
+          if (iz == -1)
+            ism += 18;
+          return ism;
+        }
+
+        __device__ int dcc(int ieta, int iphi) {
+          int ism = sm(ieta, iphi);
+          return dccFromSm(ism);
+        }
+
+        //
+        // ---- why on hell things are so complex and not simple ???
+        //
+
+        __device__ int lm_channel(int iX, int iY) {
+          static const int idx_[] = {
+              // clang-format off
+         // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+            1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8,  // 3
+            1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8,  // 2
+            1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9,  // 1
+            1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9  // 0
+         // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+              // clang-format on
+          };
+
+          int il, ic, ii;
+          const int iym = 4;
+          const int ixm = 17;
+          int iX_ = iX + 1;
+          int iY_ = iY + 1;
+          il = iym - iY_;
+          ic = iX_ - 1;
+          ii = il * ixm + ic;
+          if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int))) {
+            return -1;
+          };
+          return idx_[ii];
+        }
+
+        __device__ int localCoord_x(int ieta, int iphi) {
+          int iz = 1;
+          if (ieta < 0) {
+            iz = -1;
+          }
+          ieta *= iz;
+          //   int iphi_ = iphi;
+          //   if (iphi_ > 360) {
+          //     iphi_ -= 360;
+          //   }
+          int ix = ieta - 1;
+          //   int iy = (iphi_ - 1) % 20;
+          //   if (iz == -1) {
+          //     iy = 19 - iy;
+          //   }
+
+          return ix;
+        }
+
+        __device__ int localCoord_y(int ieta, int iphi) {
+          int iz = 1;
+          if (ieta < 0) {
+            iz = -1;
+          }
+          //   ieta *= iz;
+          int iphi_ = iphi;
+          if (iphi_ > 360) {
+            iphi_ -= 360;
+          }
+          //   int ix = ieta - 1;
+          int iy = (iphi_ - 1) % 20;
+          if (iz == -1) {
+            iy = 19 - iy;
+          }
+
+          return iy;
+        }
+
+        __device__ int lmmod(int ieta, int iphi) {
+          int ix = localCoord_x(ieta, iphi);
+          int iy = localCoord_y(ieta, iphi);
+
+          return lm_channel(ix / 5, iy / 5);
+        }
+
+        __device__ int side(int ieta, int iphi) {
+          int ilmmod = lmmod(ieta, iphi);
+          return (ilmmod % 2 == 0) ? 1 : 0;
+        }
+
+      }  // namespace barrel
+
+    }  // namespace internal
+
+    __device__ uint32_t hashedIndexEB(uint32_t id) {
+      using namespace internal::barrel;
+      return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
+    }
+
+    //
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc
+    //  function: "lmr"
+
+    __device__ int laser_monitoring_region_EB(uint32_t id) {
+      using namespace internal::barrel;
+
+      int ieta;
+      if (positiveZ(id)) {
+        ieta = ietaAbs(id);
+      } else {
+        ieta = -ietaAbs(id);
+      }
+
+      int idcc = dcc(ieta, (int)(iphi(id)));
+      int ism = idcc - 9;
+
+      int iside = side(ieta, (int)(iphi(id)));
+      //   int iside = positiveZ(id) ? 1 : 0;
+
+      return (1 + 2 * (ism - 1) + iside);
+      //   return ieta;
+      //   return (int) (iphi(id));
+      //   return idcc;
+      //   return iside;
+    }
+
+    namespace internal {
+
+      namespace endcap {
+
+        __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; }
+
+        __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; }
+
+        __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; }
+
+        // these constants come from EE Det Id
+        __constant__ const unsigned short kxf[] = {
+            41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21,
+            51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51,
+            6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,
+            51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62,
+            1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60, 1,  59, 1,  58, 4,  56, 4,  51, 4,
+            51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51,
+            9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21,
+            51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
+
+        __constant__ const unsigned short kdi[] = {
+            0,    10,   20,   30,   40,   50,   60,   75,   90,   105,  120,  145,  170,  195,  220,  245,  270,
+            300,  330,  360,  390,  420,  450,  480,  510,  540,  570,  605,  640,  675,  710,  747,  784,  821,
+            858,  895,  932,  969,  1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500,
+            1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265,
+            2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030,
+            3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701,
+            3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378,
+            4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149,
+            5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908,
+            5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577,
+            6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104,
+            7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
+
+        __device__ int quadrant(int iX, int iY) {
+          bool near = iX >= 11;
+          bool far = !near;
+          bool top = iY >= 11;
+          bool bot = !top;
+
+          int iquad = 0;
+          if (near && top)
+            iquad = 1;
+          if (far && top)
+            iquad = 2;
+          if (far && bot)
+            iquad = 3;
+          if (near && bot)
+            iquad = 4;
+
+          return iquad;
+        }
+
+        __device__ int sector(int iX, int iY) {
+          //  Y (towards the surface)
+          //  T
+          //  |
+          //  |
+          //  |
+          //  o---------| X  (towards center of LHC)
+          //
+          static const int idx_[] = {
+              // clang-format off
+             // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+                0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0,  // 20
+                0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0,  // 19
+                0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 8, 0, 0, 0,  // 18
+                0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 0, 0,  // 17
+                0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 8, 0,  // 16
+                0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, 9, 9, 9, 8, 8, 8, 8, 8, 0,  // 15
+                0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, 9, 9, 8, 8, 8, 8, 8, 8, 0,  // 14
+                2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8,  // 13
+                3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 8, 8, 8, 8, 8, 8, 7, 7,  // 12
+                3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 8, 7, 7, 7, 7, 7, 7, 7,  // 11
+                3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7,  // 10
+                3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, 6, 6, 7, 7, 7, 7, 7, 7, 7,  // 9
+                3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7,  // 8
+                0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 0,  // 7
+                0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 0,  // 6
+                0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 0,  // 5
+                0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 0,  // 4
+                0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 0, 0, 0,  // 3
+                0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 0, 0,  // 2
+                0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0   // 1
+             // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+              // clang-format on
+          };
+
+          int iym, ixm, il, ic, ii;
+          iym = 20;
+          ixm = 20;
+          int iX_ = iX;
+          int iY_ = iY;
+          il = iym - iY_;
+          ic = iX_ - 1;
+          ii = il * ixm + ic;
+
+          if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) {
+            return -1;
+          };
+          return idx_[ii];
+        }
+
+      }  // namespace endcap
+
+    }  // namespace internal
+
+    __device__ uint32_t hashedIndexEE(uint32_t id) {
+      using namespace internal::endcap;
+
+      const uint32_t jx(ix(id));
+      const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50);
+      return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]);
+    }
+
+    //
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc
+    // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc
+    //
+
+    __device__ int laser_monitoring_region_EE(uint32_t id) {
+      using namespace internal::endcap;
+
+      // SuperCrysCoord
+      uint32_t iX = (ix(id) - 1) / 5 + 1;
+      uint32_t iY = (iy(id) - 1) / 5 + 1;
+
+      // Correct convention
+      //   * @param iz iz/zside index: -1 for EE-, +1 for EE+
+      //   https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71
+      //   zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63
+      //
+      int iz = positiveZ(id) ? 1 : -1;
+
+      int iquad = quadrant(iX, iY);
+      int isect = sector(iX, iY);
+      if (isect < 0)
+        return -1;
+
+      int ilmr = 0;
+      ilmr = isect - 6;
+      if (ilmr <= 0)
+        ilmr += 9;
+      if (ilmr == 9)
+        ilmr++;
+      if (ilmr == 8 && iquad == 4)
+        ilmr++;
+      if (iz == +1)
+        ilmr += 72;
+      else
+        ilmr += 82;
+
+      return ilmr;
+    }
+
+  }  // namespace reconstruction
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
new file mode 100644
index 0000000000000..103087c3517f9
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
@@ -0,0 +1,452 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
+#define RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
+
+#include <cmath>
+#include <limits>
+#include <type_traits>
+
+#include <Eigen/Dense>
+
+namespace ecal {
+  namespace multifit {
+
+    template <int NROWS, int NCOLS>
+    using ColMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::ColMajor>;
+
+    template <int NROWS, int NCOLS>
+    using RowMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::RowMajor>;
+
+    template <int SIZE, typename T = float>
+    using ColumnVector = Eigen::Matrix<T, SIZE, 1>;
+
+    template <int SIZE, typename T = float>
+    using RowVector = Eigen::Matrix<T, 1, SIZE>;
+
+    // FIXME: provide specialization for Row Major layout
+    template <typename T, int Stride, int Order = Eigen::ColMajor>
+    struct MapSymM {
+      using type = T;
+      using base_type = typename std::remove_const<type>::type;
+
+      static constexpr int total = Stride * (Stride + 1) / 2;
+      static constexpr int stride = Stride;
+      T* data;
+
+      __forceinline__ __device__ MapSymM(T* data) : data{data} {}
+
+      __forceinline__ __device__ T const& operator()(int const row, int const col) const {
+        auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
+        auto const index = total - tmp + row - col;
+        return data[index];
+      }
+
+      template <typename U = T>
+      __forceinline__ __device__ typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
+      operator()(int const row, int const col) {
+        auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
+        auto const index = total - tmp + row - col;
+        return data[index];
+      }
+    };
+
+    // FIXME: either use/modify/improve eigen or make this more generic
+    // this is a map for a pulse matrix to building a 2d matrix for each channel
+    // and hide indexing
+    template <typename T>
+    struct MapMForPM {
+      using type = T;
+      using base_type = typename std::remove_cv<type>::type;
+
+      type* data;
+      __forceinline__ __device__ MapMForPM(type* data) : data{data} {}
+
+      __forceinline__ __device__ base_type operator()(int const row, int const col) const {
+        auto const index = 2 - col + row;
+        return index >= 0 ? data[index] : 0;
+      }
+    };
+
+    // simple/trivial cholesky decomposition impl
+    template <typename MatrixType1, typename MatrixType2>
+    __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
+      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
+      L(0, 0) = sqrtm_0_0;
+      using T = typename MatrixType1::base_type;
+
+#pragma unroll
+      for (int i = 1; i < MatrixType1::stride; i++) {
+        T sumsq{0};
+        for (int j = 0; j < i; j++) {
+          T sumsq2{0};
+          auto const m_i_j = M(i, j);
+          for (int k = 0; k < j; ++k)
+            sumsq2 += L(i, k) * L(j, k);
+
+          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+          L(i, j) = value_i_j;
+
+          sumsq += value_i_j * value_i_j;
+        }
+
+        auto const l_i_i = std::sqrt(M(i, i) - sumsq);
+        L(i, i) = l_i_i;
+      }
+    }
+
+    template <typename MatrixType1, typename MatrixType2>
+    __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
+      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
+      L(0, 0) = sqrtm_0_0;
+      using T = typename MatrixType1::base_type;
+
+      for (int i = 1; i < N; i++) {
+        T sumsq{0};
+        for (int j = 0; j < i; j++) {
+          T sumsq2{0};
+          auto const m_i_j = M(i, j);
+          for (int k = 0; k < j; ++k)
+            sumsq2 += L(i, k) * L(j, k);
+
+          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+          L(i, j) = value_i_j;
+
+          sumsq += value_i_j * value_i_j;
+        }
+
+        auto const l_i_i = std::sqrt(M(i, i) - sumsq);
+        L(i, i) = l_i_i;
+      }
+    }
+
+    template <typename MatrixType1, typename MatrixType2, typename VectorType>
+    __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
+        MatrixType1& L,
+        MatrixType2 const& M,
+        float b[MatrixType1::stride],
+        VectorType const& Atb,
+        int const N,
+        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
+      auto const real_0 = pulseOffsets(0);
+      auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0));
+      L(0, 0) = sqrtm_0_0;
+      using T = typename MatrixType1::base_type;
+      b[0] = Atb(real_0) / sqrtm_0_0;
+
+      for (int i = 1; i < N; i++) {
+        auto const i_real = pulseOffsets(i);
+        T sumsq{0};
+        T total = 0;
+        auto const atb = Atb(i_real);
+        for (int j = 0; j < i; j++) {
+          auto const j_real = pulseOffsets(j);
+          T sumsq2{0};
+          auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
+          for (int k = 0; k < j; ++k)
+            sumsq2 += L(i, k) * L(j, k);
+
+          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+          L(i, j) = value_i_j;
+
+          sumsq += value_i_j * value_i_j;
+          total += value_i_j * b[j];
+        }
+
+        auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
+        L(i, i) = l_i_i;
+        b[i] = (atb - total) / l_i_i;
+      }
+    }
+
+    template <typename MatrixType1, typename MatrixType2, typename VectorType>
+    __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
+        MatrixType1& L,
+        MatrixType2 const& M,
+        float b[MatrixType1::stride],
+        VectorType const& Atb,
+        int const N,
+        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
+      using T = typename MatrixType1::base_type;
+      auto const i = N - 1;
+      auto const i_real = pulseOffsets(i);
+      T sumsq{0};
+      T total = 0;
+      for (int j = 0; j < i; j++) {
+        auto const j_real = pulseOffsets(j);
+        T sumsq2{0};
+        auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
+        for (int k = 0; k < j; ++k)
+          sumsq2 += L(i, k) * L(j, k);
+
+        auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+        L(i, j) = value_i_j;
+        sumsq += value_i_j * value_i_j;
+
+        total += value_i_j * b[j];
+      }
+
+      auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
+      L(i, i) = l_i_i;
+      b[i] = (Atb(i_real) - total) / l_i_i;
+    }
+
+    template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
+    __device__ void solve_forward_subst_matrix(MatrixType1& A,
+                                               MatrixType2 const& pulseMatrixView,
+                                               MatrixType3 const& matrixL) {
+      // FIXME: this assumes pulses are on columns and samples on rows
+      constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
+      constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
+
+#pragma unroll
+      for (int icol = 0; icol < NPULSES; icol++) {
+        float reg_b[NSAMPLES];
+        float reg_L[NSAMPLES];
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+        for (int i = 0; i < NSAMPLES; i++) {
+          reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
+          reg_L[i] = matrixL(i, 0);
+        }
+
+        // compute x0 and store it
+        auto x_prev = reg_b[0] / reg_L[0];
+        A(0, icol) = x_prev;
+
+// iterate
+#pragma unroll
+        for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+          for (int counter = iL; counter < NSAMPLES; counter++)
+            reg_b[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+          for (int counter = iL; counter < NSAMPLES; counter++)
+            reg_L[counter] = matrixL(counter, iL);
+
+          // compute the next x for M(iL, icol)
+          x_prev = reg_b[iL] / reg_L[iL];
+
+          // store the result value
+          A(iL, icol) = x_prev;
+        }
+      }
+    }
+
+    template <typename MatrixType1, typename MatrixType2>
+    __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
+                                               MatrixType1 inputAmplitudesView,
+                                               MatrixType2 matrixL) {
+      constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
+
+      float reg_b_tmp[NSAMPLES];
+      float reg_L[NSAMPLES];
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+      for (int i = 0; i < NSAMPLES; i++) {
+        reg_b_tmp[i] = inputAmplitudesView(i);
+        reg_L[i] = matrixL(i, 0);
+      }
+
+      // compute x0 and store it
+      auto x_prev = reg_b_tmp[0] / reg_L[0];
+      reg_b[0] = x_prev;
+
+// iterate
+#pragma unroll
+      for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+        for (int counter = iL; counter < NSAMPLES; counter++)
+          reg_b_tmp[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+        for (int counter = iL; counter < NSAMPLES; counter++)
+          reg_L[counter] = matrixL(counter, iL);
+
+        // compute the next x for M(iL, icol)
+        x_prev = reg_b_tmp[iL] / reg_L[iL];
+
+        // store the result value
+        reg_b[iL] = x_prev;
+      }
+    }
+
+    // TODO: add active bxs
+    template <typename MatrixType, typename VectorType>
+    __device__ void fnnls(MatrixType const& AtA,
+                          VectorType const& Atb,
+                          VectorType& solution,
+                          int& npassive,
+                          ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
+                          MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
+                          double const eps,
+                          int const maxIterations) {
+      // constants
+      constexpr auto NPULSES = VectorType::RowsAtCompileTime;
+
+      // to keep track of where to terminate if converged
+      Eigen::Index w_max_idx_prev = 0;
+      float w_max_prev = 0;
+      auto eps_to_use = eps;
+      bool recompute = false;
+
+      // used throughout
+      VectorType s;
+      float reg_b[NPULSES];
+      //float matrixLStorage[MapSymM<float, NPULSES>::total];
+      //MapSymM<float, NPULSES> matrixL{matrixLStorage};
+
+      int iter = 0;
+      while (true) {
+        if (iter > 0 || npassive == 0) {
+          auto const nactive = NPULSES - npassive;
+          // exit if there are no more pulses to constrain
+          if (nactive == 0)
+            break;
+
+          // compute the gradient
+          //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
+          Eigen::Index w_max_idx;
+          float w_max = -std::numeric_limits<float>::max();
+          for (int icol = npassive; icol < NPULSES; icol++) {
+            auto const icol_real = pulseOffsets(icol);
+            auto const atb = Atb(icol_real);
+            float sum = 0;
+#pragma unroll
+            for (int counter = 0; counter < NPULSES; counter++)
+              sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
+                                         : AtA(icol_real, counter) * solution(counter);
+
+            auto const w = atb - sum;
+            if (w > w_max) {
+              w_max = w;
+              w_max_idx = icol - npassive;
+            }
+          }
+
+          // check for convergence
+          if (w_max < eps_to_use || w_max_idx == w_max_idx_prev && w_max == w_max_prev)
+            break;
+
+          if (iter >= maxIterations)
+            break;
+
+          w_max_prev = w_max;
+          w_max_idx_prev = w_max_idx;
+
+          // move index to the right part of the vector
+          w_max_idx += npassive;
+
+          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
+          ++npassive;
+        }
+
+        // inner loop
+        while (true) {
+          if (npassive == 0)
+            break;
+
+          //s.head(npassive)
+          //auto const& matrixL =
+          //    AtA.topLeftCorner(npassive, npassive)
+          //        .llt().matrixL();
+          //.solve(Atb.head(npassive));
+          if (recompute || iter == 0)
+            compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+          else
+            update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+
+          // run backward substituion
+          s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1);
+          for (int i = npassive - 2; i >= 0; --i) {
+            float total = 0;
+            for (int j = i + 1; j < npassive; j++)
+              total += matrixL(j, i) * s(j);
+
+            s(i) = (reg_b[i] - total) / matrixL(i, i);
+          }
+
+          // done if solution values are all positive
+          bool hasNegative = false;
+          bool hasNans = false;
+          for (int counter = 0; counter < npassive; counter++) {
+            auto const s_ii = s(counter);
+            hasNegative |= s_ii <= 0;
+            hasNans |= std::isnan(s_ii);
+          }
+
+          // FIXME: temporary solution. my cholesky impl is unstable yielding nans
+          // this check removes nans - do not accept solution unless all values
+          // are stable
+          if (hasNans)
+            break;
+          if (!hasNegative) {
+            for (int i = 0; i < npassive; i++) {
+              auto const i_real = pulseOffsets(i);
+              solution(i_real) = s(i);
+            }
+            //solution.head(npassive) = s.head(npassive);
+            recompute = false;
+            break;
+          }
+
+          // there were negative values -> have to recompute the whole decomp
+          recompute = true;
+
+          auto alpha = std::numeric_limits<float>::max();
+          Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
+          for (int i = 0; i < npassive; i++) {
+            if (s[i] <= 0.) {
+              auto const i_real = pulseOffsets(i);
+              auto const ratio = solution[i_real] / (solution[i_real] - s[i]);
+              if (ratio < alpha) {
+                alpha = ratio;
+                alpha_idx = i;
+                alpha_idx_real = i_real;
+              }
+            }
+          }
+
+          // upadte solution
+          for (int i = 0; i < npassive; i++) {
+            auto const i_real = pulseOffsets(i);
+            solution(i_real) += alpha * (s(i) - solution(i_real));
+          }
+          //solution.head(npassive) += alpha *
+          //    (s.head(npassive) - solution.head(npassive));
+          solution[alpha_idx_real] = 0;
+          --npassive;
+
+          Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
+        }
+
+        // as in cpu
+        ++iter;
+        if (iter % 16 == 0)
+          eps_to_use *= 2;
+      }
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
+
+namespace ecal {
+  namespace reconstruction {
+
+    __device__ uint32_t hashedIndexEB(uint32_t id);
+
+    __device__ uint32_t hashedIndexEE(uint32_t id);
+
+    __device__ int laser_monitoring_region_EB(uint32_t id);
+
+    __device__ int laser_monitoring_region_EE(uint32_t id);
+
+  }  // namespace reconstruction
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu
new file mode 100644
index 0000000000000..ddb9334826f40
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu
@@ -0,0 +1,1131 @@
+#include <iostream>
+#include <limits>
+
+#include <cuda.h>
+
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "Common.h"
+#include "TimeComputationKernels.h"
+#include "KernelHelpers.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+  namespace multifit {
+
+    __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) {
+      return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
+    }
+
+    __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  bool const* useless_sample_values,
+                                                  SampleVector::Scalar* chi2s,
+                                                  SampleVector::Scalar* sum0s,
+                                                  SampleVector::Scalar* sumAAs,
+                                                  const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      int tx = threadIdx.x + blockDim.x * blockIdx.x;
+      int ltx = threadIdx.x;
+      int ch = tx / nsamples;
+      int nchannels_per_block = blockDim.x / nsamples;
+
+      // TODO: make sure that this branch plays nicely with __syncthreads inside
+      // can there be a deadlock even if the thread is inactive
+      if (ch < nchannels) {
+        //
+        int sample = tx % nsamples;
+
+        // shared mem inits
+        extern __shared__ char sdata[];
+        char* s_sum0 = sdata;
+        SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(s_sum0 + nchannels_per_block * nsamples);
+        SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples;
+        SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples;
+
+        // TODO make sure no div by 0
+        const auto inv_error =
+            useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
+        const auto sample_value = sample_values[tx];
+        s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
+        s_sum1[ltx] = inv_error;
+        s_sumA[ltx] = sample_value * inv_error;
+        s_sumAA[ltx] = sample_value * sample_value * inv_error;
+        __syncthreads();
+
+        // 5 threads for [0, 4] samples
+        if (sample < 5) {
+          s_sum0[ltx] += s_sum0[ltx + 5];
+          s_sum1[ltx] += s_sum1[ltx + 5];
+          s_sumA[ltx] += s_sumA[ltx + 5];
+          s_sumAA[ltx] += s_sumAA[ltx + 5];
+        }
+        __syncthreads();
+
+        if (sample < 2) {
+          // note double counting of sample 3
+          s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
+          s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
+          s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
+          s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
+        }
+        __syncthreads();
+
+        if (sample == 0) {
+          // note, subtract to remove the double counting of sample == 3
+          //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3];
+          //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3];
+          //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3];
+          //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3];
+          const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
+          const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
+          const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
+          const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
+          const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
+          chi2s[ch] = chi2;
+          sum0s[ch] = sum0;
+          sumAAs[ch] = sumAA;
+
+#ifdef DEBUG_TC_NULLHYPOT
+          if (ch == 0) {
+            printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
+          }
+#endif
+        }
+      }
+    }
+
+    constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
+    constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
+
+    //#define DEBUG_TC_MAKERATIO
+    //
+    // launch ctx parameters are
+    // 45 threads per channel, X channels per block, Y blocks
+    // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+    // TODO: it might be much beter to use 32 threads per channel instead of 45
+    // to simplify the synchronization
+    //
+    __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  uint32_t const* dids_eb,
+                                                  uint32_t const* dids_ee,
+                                                  bool const* useless_sample_values,
+                                                  char const* pedestal_nums,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEB,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEE,
+                                                  ConfigurationParameters::type const* timeFitParametersEB,
+                                                  ConfigurationParameters::type const* timeFitParametersEE,
+                                                  SampleVector::Scalar const* sumAAsNullHypot,
+                                                  SampleVector::Scalar const* sum0sNullHypot,
+                                                  SampleVector::Scalar* tMaxAlphaBetas,
+                                                  SampleVector::Scalar* tMaxErrorAlphaBetas,
+                                                  SampleVector::Scalar* g_accTimeMax,
+                                                  SampleVector::Scalar* g_accTimeWgt,
+                                                  TimeComputationState* g_state,
+                                                  unsigned const int timeFitParameters_sizeEB,
+                                                  unsigned const int timeFitParameters_sizeEE,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEB,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEE,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEB,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEE,
+                                                  const int nchannels,
+                                                  uint32_t const offsetForInputs) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nthreads_per_channel = 45;  // n=10, n(n-1)/2
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockDim.x * blockIdx.x;
+      const int ch = gtx / nthreads_per_channel;
+      const int ltx = threadIdx.x % nthreads_per_channel;
+      const int ch_start = ch * nsamples;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+      // rmeove inactive threads
+      // TODO: need to understand if this is 100% safe in presence of syncthreads
+      if (ch >= nchannels)
+        return;
+
+      const auto did = DetId{dids[inputCh]};
+      const auto isBarrel = did.subdetId() == EcalBarrel;
+      const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+      const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE;
+      const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE;
+      const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
+      const auto timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
+
+      extern __shared__ char smem[];
+      ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
+      ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
+      ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
+      ScalarType* shrTimeMax = shr_time_max + blockDim.x;
+      ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
+
+      // map tx -> (sample_i, sample_j)
+      int sample_i, sample_j = 0;
+      if (ltx >= 0 && ltx <= 8) {
+        sample_i = 0;
+        sample_j = 1 + ltx;
+      } else if (ltx <= 16) {
+        sample_i = 1;
+        sample_j = 2 + ltx - 9;
+      } else if (ltx <= 23) {
+        sample_i = 2;
+        sample_j = 3 + ltx - 17;
+      } else if (ltx <= 29) {
+        sample_i = 3;
+        sample_j = 4 + ltx - 24;
+      } else if (ltx <= 34) {
+        sample_i = 4;
+        sample_j = 5 + ltx - 30;
+      } else if (ltx <= 38) {
+        sample_i = 5;
+        sample_j = 6 + ltx - 35;
+      } else if (ltx <= 41) {
+        sample_i = 6;
+        sample_j = 7 + ltx - 39;
+      } else if (ltx <= 43) {
+        sample_i = 7;
+        sample_j = 8 + ltx - 42;
+      } else if (ltx <= 44) {
+        sample_i = 8;
+        sample_j = 9;
+      } else
+        assert(false);
+
+      const auto tx_i = ch_start + sample_i;
+      const auto tx_j = ch_start + sample_j;
+
+      //
+      // note, given the way we partition the block, with 45 threads per channel
+      // we will end up with inactive threads which need to be dragged along
+      // through the synching point
+      //
+      /*
+    bool const condToExit = ch >= nchannels
+        ? true
+        : useless_sample_values[tx_i] 
+          || useless_sample_values[tx_j]
+          || sample_values[tx_i]<=1 || sample_values[tx_j]<=1;
+          */
+      bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
+                                         sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
+
+      //
+      // see cpu implementation for explanation
+      //
+      ScalarType chi2 = std::numeric_limits<ScalarType>::max();
+      ScalarType tmax = 0;
+      ScalarType tmaxerr = 0;
+      shrTimeMax[threadIdx.x] = 0;
+      shrTimeWgt[threadIdx.x] = 0;
+      bool internalCondForSkipping1 = true;
+      bool internalCondForSkipping2 = true;
+      if (!condForUselessSamples) {
+        const auto rtmp = sample_values[tx_i] / sample_values[tx_j];
+        const auto invampl_i = 1.0 / sample_values[tx_i];
+        const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
+        const auto invampl_j = 1.0 / sample_values[tx_j];
+        const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
+        const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
+        auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
+        // TODO non-divergent branch for a block if each block has 1 channel
+        // otherwise non-divergent for groups of 45 threads
+        // at this point, pedestal_nums[ch] can be either 0, 1 or 2
+        if (pedestal_nums[ch] == 2)
+          err2 *= err2 * 0.5;
+        const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
+        const auto total_error = std::sqrt(err1 + err2 + err3);
+
+        const auto alpha = amplitudeFitParameters[0];
+        const auto beta = amplitudeFitParameters[1];
+        const auto alphabeta = alpha * beta;
+        const auto invalphabeta = 1.0 / alphabeta;
+
+        // variables instead of a struct
+        const auto ratio_index = sample_i;
+        const auto ratio_step = sample_j - sample_i;
+        const auto ratio_value = rtmp;
+        const auto ratio_error = total_error;
+
+        const auto rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
+        internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j);
+        if (!internalCondForSkipping1) {
+          //
+          // precompute.
+          // in cpu version this was done conditionally
+          // however easier to do it here (precompute) and then just filter out
+          // if not needed
+          //
+          const auto l_timeFitLimits_first = timeFitLimits_first;
+          const auto l_timeFitLimits_second = timeFitLimits_second;
+          if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
+            const auto time_max_i = static_cast<ScalarType>(ratio_index);
+            auto u = timeFitParameters[timeFitParameters_size - 1];
+#pragma unroll
+            for (int k = timeFitParameters_size - 2; k >= 0; k--)
+              u = u * ratio_value + timeFitParameters[k];
+
+            auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
+            for (int k = timeFitParameters_size - 2; k >= 1; k--)
+              du = du * ratio_value + k * timeFitParameters[k];
+
+            const auto error2 = ratio_error * ratio_error * du * du;
+            const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
+            const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast<ScalarType>(0);
+
+            // store into shared mem
+            // note, this name is essentially identical to the one used
+            // below.
+            shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
+            shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
+          } else {
+            shrTimeMax[threadIdx.x] = 0;
+            shrTimeWgt[threadIdx.x] = 0;
+          }
+
+          // continue with ratios
+          const auto stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
+          const auto offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
+          const auto rmin = std::max(ratio_value - ratio_error, 0.001);
+          const auto rmax = std::min(ratio_value + ratio_error,
+                                     fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta) - 0.001);
+          const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0);
+          const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0);
+
+          // set these guys
+          tmax = 0.5 * (time1 + time2);
+          tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
+#ifdef DEBUG_TC_MAKERATIO
+          if (ch == 1 || ch == 0)
+            printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
+                   ch,
+                   ltx,
+                   tmax,
+                   tmaxerr,
+                   time1,
+                   time2,
+                   offset,
+                   rmin,
+                   rmax);
+#endif
+
+          SampleVector::Scalar sumAf = 0;
+          SampleVector::Scalar sumff = 0;
+          const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
+          auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
+          // TODO: data dependence
+          for (int it = itmin + 1; it < nsamples; it++) {
+            loffset += invalphabeta;
+            if (useless_sample_values[ch_start + it])
+              continue;
+            const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
+            const auto term1 = 1.0 + loffset;
+            const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
+            sumAf += sample_values[ch_start + it] * (f * inverr2);
+            sumff += f * (f * inverr2);
+          }
+
+          const auto sumAA = sumAAsNullHypot[ch];
+          const auto sum0 = sum0sNullHypot[ch];
+          chi2 = sumAA;
+          // TODO: sum0 can not be 0 below, need to introduce the check upfront
+          if (sumff > 0) {
+            chi2 = sumAA - sumAf * (sumAf / sumff);
+          }
+          chi2 /= sum0;
+
+#ifdef DEBUG_TC_MAKERATIO
+          if (ch == 1 || ch == 0)
+            printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
+                   ch,
+                   ltx,
+                   sumAf,
+                   sumff,
+                   sumAA,
+                   static_cast<int>(sum0),
+                   tmax,
+                   tmaxerr,
+                   chi2);
+#endif
+
+          if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
+            internalCondForSkipping2 = false;
+          else
+            chi2 = std::numeric_limits<ScalarType>::max();
+        }
+      }
+
+      // store into smem
+      shr_chi2s[threadIdx.x] = chi2;
+      __syncthreads();
+
+      // find min chi2 - quite crude for now
+      // TODO validate/check
+      char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+      bool oddElements = nthreads_per_channel % 2;
+#pragma unroll
+      while (iter >= 1) {
+        if (ltx < iter)
+          // for odd ns, the last guy will just store itself
+          // exception is for ltx == 0 and iter==1
+          shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                       ? shr_chi2s[threadIdx.x]
+                                       : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]);
+        __syncthreads();
+        oddElements = iter % 2;
+        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+      }
+
+      // filter out inactive or useless samples threads
+      if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
+        // min chi2, now compute weighted average of tmax measurements
+        // see cpu version for more explanation
+        const auto chi2min = shr_chi2s[threadIdx.x - ltx];
+        const auto chi2Limit = chi2min + 1.0;
+        const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0;
+
+#ifdef DEBUG_TC_MAKERATIO
+        if (ch == 1 || ch == 0)
+          printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
+                 ch,
+                 ltx,
+                 chi2min,
+                 chi2Limit,
+                 inverseSigmaSquared);
+#endif
+
+        // store into shared mem and run reduction
+        // TODO: check if cooperative groups would be better
+        // TODO: check if shuffling intrinsics are better
+        shr_time_wgt[threadIdx.x] = inverseSigmaSquared;
+        shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared;
+      } else {
+        shr_time_wgt[threadIdx.x] = 0;
+        shr_time_max[threadIdx.x] = 0;
+      }
+      __syncthreads();
+
+      // reduce to compute time_max and time_wgt
+      iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+      oddElements = nthreads_per_channel % 2;
+#pragma unroll
+      while (iter >= 1) {
+        if (ltx < iter) {
+          shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                          ? shr_time_wgt[threadIdx.x]
+                                          : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter];
+          shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                          ? shr_time_max[threadIdx.x]
+                                          : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter];
+          shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                        ? shrTimeMax[threadIdx.x]
+                                        : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter];
+          shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                        ? shrTimeWgt[threadIdx.x]
+                                        : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter];
+        }
+
+        __syncthreads();
+        oddElements = iter % 2;
+        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+      }
+
+      // load from shared memory the 0th guy (will contain accumulated values)
+      // compute
+      // store into global mem
+      if (ltx == 0) {
+        const auto tmp_time_max = shr_time_max[threadIdx.x];
+        const auto tmp_time_wgt = shr_time_wgt[threadIdx.x];
+
+        // we are done if there number of time ratios is 0
+        if (tmp_time_wgt == 0 && tmp_time_max == 0) {
+          g_state[ch] = TimeComputationState::Finished;
+          return;
+        }
+
+        // no div by 0
+        const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
+        const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt);
+
+        tMaxAlphaBetas[ch] = tMaxAlphaBeta;
+        tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
+        g_accTimeMax[ch] = shrTimeMax[threadIdx.x];
+        g_accTimeWgt[ch] = shrTimeWgt[threadIdx.x];
+        g_state[ch] = TimeComputationState::NotFinished;
+
+#ifdef DEBUG_TC_MAKERATIO
+        printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
+        printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
+               ch,
+               tMaxAlphaBeta,
+               tMaxErrorAlphaBeta,
+               shrTimeMax[threadIdx.x],
+               shrTimeWgt[threadIdx.x]);
+#endif
+      }
+    }
+
+    /// launch ctx parameters are
+    /// 10 threads per channel, N channels per block, Y blocks
+    /// TODO: do we need to keep the state around or can be removed?!
+    //#define DEBUG_FINDAMPLCHI2_AND_FINISH
+    __global__ void kernel_time_compute_findamplchi2_and_finish(
+        SampleVector::Scalar const* sample_values,
+        SampleVector::Scalar const* sample_value_errors,
+        uint32_t const* dids_eb,
+        uint32_t const* dids_ee,
+        bool const* useless_samples,
+        SampleVector::Scalar const* g_tMaxAlphaBeta,
+        SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
+        SampleVector::Scalar const* g_accTimeMax,
+        SampleVector::Scalar const* g_accTimeWgt,
+        ConfigurationParameters::type const* amplitudeFitParametersEB,
+        ConfigurationParameters::type const* amplitudeFitParametersEE,
+        SampleVector::Scalar const* sumAAsNullHypot,
+        SampleVector::Scalar const* sum0sNullHypot,
+        SampleVector::Scalar const* chi2sNullHypot,
+        TimeComputationState* g_state,
+        SampleVector::Scalar* g_ampMaxAlphaBeta,
+        SampleVector::Scalar* g_ampMaxError,
+        SampleVector::Scalar* g_timeMax,
+        SampleVector::Scalar* g_timeError,
+        const int nchannels,
+        uint32_t const offsetForInputs) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+      // configure shared mem
+      // per block, we need #threads per block * 2 * sizeof(ScalarType)
+      // we run with N channels per block
+      extern __shared__ char smem[];
+      ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
+      ScalarType* shr_sumff = shr_sumAf + blockDim.x;
+
+      if (ch >= nchannels)
+        return;
+
+      auto state = g_state[ch];
+      const auto did = DetId{dids[inputCh]};
+      const auto* amplitudeFitParameters =
+          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+      // TODO is that better than storing into global and launching another kernel
+      // for the first 10 threads
+      if (state == TimeComputationState::NotFinished) {
+        const auto alpha = amplitudeFitParameters[0];
+        const auto beta = amplitudeFitParameters[1];
+        const auto alphabeta = alpha * beta;
+        const auto invalphabeta = 1.0 / alphabeta;
+        const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+        const auto sample_value = sample_values[gtx];
+        const auto sample_value_error = sample_value_errors[gtx];
+        const auto inverr2 =
+            useless_samples[gtx] ? static_cast<ScalarType>(0) : 1.0 / (sample_value_error * sample_value_error);
+        const auto offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
+        const auto term1 = 1.0 + offset;
+        const auto f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.0);
+        const auto sumAf = sample_value * (f * inverr2);
+        const auto sumff = f * (f * inverr2);
+
+        // store into shared mem
+        shr_sumAf[threadIdx.x] = sumAf;
+        shr_sumff[threadIdx.x] = sumff;
+      } else {
+        shr_sumAf[threadIdx.x] = 0;
+        shr_sumff[threadIdx.x] = 0;
+      }
+      __syncthreads();
+
+      // reduce
+      // unroll completely here (but hardcoded)
+      if (sample < 5) {
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5];
+      }
+      __syncthreads();
+
+      if (sample < 2) {
+        // will need to subtract for ltx = 3, we double count here
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3];
+      }
+      __syncthreads();
+
+      if (sample == 0) {
+        // exit if the state is done
+        // note, we do not exit before all __synchtreads are finished
+        if (state == TimeComputationState::Finished) {
+          g_timeMax[ch] = 5;
+          g_timeError[ch] = -999;
+          return;
+        }
+
+        // subtract to avoid double counting
+        const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3];
+        const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3];
+
+        const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
+        const auto sumAA = sumAAsNullHypot[ch];
+        const auto sum0 = sum0sNullHypot[ch];
+        const auto nullChi2 = chi2sNullHypot[ch];
+        if (sumff > 0) {
+          const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
+          if (chi2AlphaBeta > nullChi2) {
+            // null hypothesis is better
+            state = TimeComputationState::Finished;
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+            printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
+                   ch,
+                   chi2AlphaBeta,
+                   nullChi2,
+                   sumAA,
+                   sumAf,
+                   sumff,
+                   sum0);
+#endif
+          }
+
+          // store to global
+          g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
+        } else {
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+          printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
+#endif
+          state = TimeComputationState::Finished;
+        }
+
+        // store the state to global and finish calcs
+        g_state[ch] = state;
+        if (state == TimeComputationState::Finished) {
+          // store default values into global
+          g_timeMax[ch] = 5;
+          g_timeError[ch] = -999;
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+          printf("ch = %d finished state\n", ch);
+#endif
+          return;
+        }
+
+        const auto ampMaxError = g_ampMaxError[ch];
+        const auto test_ratio = ampMaxAlphaBeta / ampMaxError;
+        const auto accTimeMax = g_accTimeMax[ch];
+        const auto accTimeWgt = g_accTimeWgt[ch];
+        const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+        const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
+        // branch to separate large vs small pulses
+        // see cpu version for more info
+        if (test_ratio > 5.0 && accTimeWgt > 0) {
+          const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
+          const auto tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
+
+          if (test_ratio > 10.0) {
+            g_timeMax[ch] = tMaxRatio;
+            g_timeError[ch] = tMaxErrorRatio;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+            printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
+#endif
+          } else {
+            const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+                                  tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+                                 5.0;
+            const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+                                    tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+                                   5.0;
+            state = TimeComputationState::Finished;
+            g_state[ch] = state;
+            g_timeMax[ch] = timeMax;
+            g_timeError[ch] = timeError;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+            printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
+#endif
+          }
+        } else {
+          state = TimeComputationState::Finished;
+          g_state[ch] = state;
+          g_timeMax[ch] = tMaxAlphaBeta;
+          g_timeError[ch] = tMaxErrorAlphaBeta;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+          printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
+#endif
+        }
+      }
+    }
+
+    __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+                                                    uint16_t const* digis_ee,
+                                                    SampleVector::Scalar* sample_values,
+                                                    SampleVector::Scalar* sample_value_errors,
+                                                    bool* useless_sample_values,
+                                                    unsigned const int sample_mask,
+                                                    const int nchannels,
+                                                    uint32_t const offsetForInputs) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+      const int inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
+      const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
+
+      // remove thread for sample 0, oversubscribing is easier than ....
+      if (ch >= nchannels || sample == 0)
+        return;
+
+      if (!use_sample(sample_mask, sample))
+        return;
+
+      const auto gainIdPrev = ecal::mgpa::gainId(digis[inputGtx - 1]);
+      const auto gainIdNext = ecal::mgpa::gainId(digis[inputGtx]);
+      if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
+        sample_values[gtx - 1] = 0;
+        sample_value_errors[gtx - 1] = 1e+9;
+        useless_sample_values[gtx - 1] = true;
+      }
+    }
+
+    __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+                                             SampleVector::Scalar const* sample_value_errors,
+                                             uint32_t const* dids,
+                                             bool const* useless_samples,
+                                             SampleVector::Scalar const* g_timeMax,
+                                             SampleVector::Scalar const* amplitudeFitParametersEB,
+                                             SampleVector::Scalar const* amplitudeFitParametersEE,
+                                             SampleVector::Scalar* g_amplitudeMax,
+                                             const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr ScalarType corr4 = 1.;
+      constexpr ScalarType corr6 = 1.;
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+
+      if (ch >= nchannels)
+        return;
+
+      const auto did = DetId{dids[ch]};
+      const auto* amplitudeFitParameters =
+          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+      // configure shared mem
+      extern __shared__ char smem[];
+      ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
+      auto* shr_sumA = shr_sum1 + blockDim.x;
+      auto* shr_sumF = shr_sumA + blockDim.x;
+      auto* shr_sumAF = shr_sumF + blockDim.x;
+      auto* shr_sumFF = shr_sumAF + blockDim.x;
+
+      const auto alpha = amplitudeFitParameters[0];
+      const auto beta = amplitudeFitParameters[1];
+      const auto timeMax = g_timeMax[ch];
+      const auto pedestalLimit = timeMax - (alpha * beta) - 1.0;
+      const auto sample_value = sample_values[gtx];
+      const auto sample_value_error = sample_value_errors[gtx];
+      const auto inverr2 =
+          sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast<ScalarType>(0);
+      const auto termOne = 1 + (sample - timeMax) / (alpha * beta);
+      const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
+                                     : static_cast<ScalarType>(0.);
+
+      bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) ||
+                         (f > 0.4 * corr4 && sample >= timeMax)) &&
+                        !useless_samples[gtx];
+
+      // store into shared mem
+      shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
+      shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast<ScalarType>(0);
+      shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast<ScalarType>(0);
+      shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast<ScalarType>(0);
+      shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast<ScalarType>(0);
+
+      // reduction
+      if (sample <= 4) {
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5];
+      }
+      __syncthreads();
+
+      if (sample < 2) {
+        // note: we double count sample 3
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3];
+      }
+      __syncthreads();
+
+      if (sample == 0) {
+        const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3];
+        const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3];
+        const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3];
+        const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3];
+        const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3];
+
+        const auto denom = sumFF * sum1 - sumF * sumF;
+        const auto condForDenom = sum1 > 0 && ecal::abs(denom) > 1.e-20;
+        const auto amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast<ScalarType>(0.);
+
+        // store into global mem
+        g_amplitudeMax[ch] = amplitudeMax;
+      }
+    }
+
+    //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+    __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
+                                                 uint32_t const* dids_eb,
+                                                 uint16_t const* digis_ee,
+                                                 uint32_t const* dids_ee,
+                                                 float const* rms_x12,
+                                                 float const* rms_x6,
+                                                 float const* rms_x1,
+                                                 float const* mean_x12,
+                                                 float const* mean_x6,
+                                                 float const* mean_x1,
+                                                 float const* gain12Over6,
+                                                 float const* gain6Over1,
+                                                 SampleVector::Scalar* sample_values,
+                                                 SampleVector::Scalar* sample_value_errors,
+                                                 SampleVector::Scalar* ampMaxError,
+                                                 bool* useless_sample_values,
+                                                 char* pedestal_nums,
+                                                 uint32_t const offsetForHashes,
+                                                 uint32_t const offsetForInputs,
+                                                 unsigned const int sample_maskEB,
+                                                 unsigned const int sample_maskEE,
+                                                 int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int tx = threadIdx.x + blockDim.x * blockIdx.x;
+      const int ch = tx / nsamples;
+      const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+      if (ch < nchannels) {
+        // indices/inits
+        const int sample = tx % nsamples;
+        const int input_ch_start = inputCh * nsamples;
+        SampleVector::Scalar pedestal = 0.;
+        int num = 0;
+
+        // configure shared mem
+        extern __shared__ char smem[];
+        ScalarType* shrSampleValues = reinterpret_cast<SampleVector::Scalar*>(smem);
+        ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x;
+
+        // 0 and 1 sample values
+        const auto adc0 = ecal::mgpa::adc(digis[input_ch_start]);
+        const auto gainId0 = ecal::mgpa::gainId(digis[input_ch_start]);
+        const auto adc1 = ecal::mgpa::adc(digis[input_ch_start + 1]);
+        const auto gainId1 = ecal::mgpa::gainId(digis[input_ch_start + 1]);
+        const auto did = DetId{dids[inputCh]};
+        const auto isBarrel = did.subdetId() == EcalBarrel;
+        const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
+        const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+        // set pedestal
+        // TODO this branch is non-divergent for a group of 10 threads
+        if (gainId0 == 1 && use_sample(sample_mask, 0)) {
+          pedestal = static_cast<SampleVector::Scalar>(adc0);
+          num = 1;
+
+          const auto diff = adc1 - adc0;
+          if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) {
+            pedestal = (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
+            num = 2;
+          }
+        } else {
+          pedestal = mean_x12[ch];
+        }
+
+        // ped subtracted and gain-renormalized samples.
+        const auto gainId = ecal::mgpa::gainId(digis[inputTx]);
+        const auto adc = ecal::mgpa::adc(digis[inputTx]);
+
+        bool bad = false;
+        SampleVector::Scalar sample_value, sample_value_error;
+        // TODO divergent branch
+        // TODO: piece below is general both for amplitudes and timing
+        // potentially there is a way to reduce the amount of code...
+        if (!use_sample(sample_mask, sample)) {
+          bad = true;
+          sample_value = 0;
+          sample_value_error = 0;
+        } else if (gainId == 1) {
+          sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
+          sample_value_error = rms_x12[hashedId];
+        } else if (gainId == 2) {
+          sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
+          sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
+        } else if (gainId == 3) {
+          sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] *
+                         gain12Over6[hashedId];
+          sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
+        } else {
+          sample_value = 0;
+          sample_value_error = 0;
+          bad = true;
+        }
+
+        // TODO: make sure we save things correctly when sample is useless
+        const auto useless_sample = (sample_value_error <= 0) | bad;
+        useless_sample_values[tx] = useless_sample;
+        sample_values[tx] = sample_value;
+        sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
+
+        // DEBUG
+#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
+        if (ch == 0) {
+          printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
+                 sample,
+                 sample_value,
+                 sample_value_error,
+                 useless_sample ? '1' : '0');
+        }
+#endif
+
+        // store into the shared mem
+        shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
+        shrSampleValueErrors[threadIdx.x] = sample_value_error;
+        __syncthreads();
+
+        // perform the reduction with min
+        if (sample < 5) {
+          // note, if equal -> we keep the value with lower sample as for cpu
+          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5]
+                                                  ? shrSampleValueErrors[threadIdx.x + 5]
+                                                  : shrSampleValueErrors[threadIdx.x];
+          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]);
+        }
+        __syncthreads();
+
+        // a bit of an overkill, but easier than to compare across 3 values
+        if (sample < 3) {
+          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3]
+                                                  ? shrSampleValueErrors[threadIdx.x + 3]
+                                                  : shrSampleValueErrors[threadIdx.x];
+          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]);
+        }
+        __syncthreads();
+
+        if (sample < 2) {
+          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2]
+                                                  ? shrSampleValueErrors[threadIdx.x + 2]
+                                                  : shrSampleValueErrors[threadIdx.x];
+          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]);
+        }
+        __syncthreads();
+
+        if (sample == 0) {
+          // we only needd the max error
+          const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1]
+                                               ? shrSampleValueErrors[threadIdx.x + 1]
+                                               : shrSampleValueErrors[threadIdx.x];
+
+          // # pedestal samples used
+          pedestal_nums[ch] = num;
+          // this is used downstream
+          ampMaxError[ch] = maxSampleValueError;
+
+          // DEBUG
+#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
+          if (ch == 0) {
+            printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
+          }
+#endif
+        }
+      }
+    }
+
+    ///
+    /// launch context parameters: 1 thread per channel
+    ///
+    //#define DEBUG_TIME_CORRECTION
+    __global__ void kernel_time_correction_and_finalize(
+        //        SampleVector::Scalar const* g_amplitude,
+        ::ecal::reco::StorageScalarType const* g_amplitude,
+        uint16_t const* digis_eb,
+        uint32_t const* dids_eb,
+        uint16_t const* digis_ee,
+        uint32_t const* dids_ee,
+        float const* amplitudeBinsEB,
+        float const* amplitudeBinsEE,
+        float const* shiftBinsEB,
+        float const* shiftBinsEE,
+        SampleVector::Scalar const* g_timeMax,
+        SampleVector::Scalar const* g_timeError,
+        float const* g_rms_x12,
+        float const* timeCalibConstant,
+        float* g_jitter,
+        float* g_jitterError,
+        uint32_t* flags,
+        const int amplitudeBinsSizeEB,
+        const int amplitudeBinsSizeEE,
+        ConfigurationParameters::type const timeConstantTermEB,
+        ConfigurationParameters::type const timeConstantTermEE,
+        float const offsetTimeValueEB,
+        float const offsetTimeValueEE,
+        ConfigurationParameters::type const timeNconstEB,
+        ConfigurationParameters::type const timeNconstEE,
+        ConfigurationParameters::type const amplitudeThresholdEB,
+        ConfigurationParameters::type const amplitudeThresholdEE,
+        ConfigurationParameters::type const outOfTimeThreshG12pEB,
+        ConfigurationParameters::type const outOfTimeThreshG12pEE,
+        ConfigurationParameters::type const outOfTimeThreshG12mEB,
+        ConfigurationParameters::type const outOfTimeThreshG12mEE,
+        ConfigurationParameters::type const outOfTimeThreshG61pEB,
+        ConfigurationParameters::type const outOfTimeThreshG61pEE,
+        ConfigurationParameters::type const outOfTimeThreshG61mEB,
+        ConfigurationParameters::type const outOfTimeThreshG61mEE,
+        uint32_t const offsetForHashes,
+        uint32_t const offsetForInputs,
+        const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
+      const auto* dids = gtx >= offsetForInputs ? dids_ee : dids_eb;
+      const auto& digis = gtx >= offsetForInputs ? digis_ee : digis_eb;
+
+      // filter out outside of range threads
+      if (gtx >= nchannels)
+        return;
+
+      const auto did = DetId{dids[inputGtx]};
+      const auto isBarrel = did.subdetId() == EcalBarrel;
+      const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                     : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+      const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
+      const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
+      const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
+      const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
+      const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
+      const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE;
+      const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
+      const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
+      const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
+      const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
+      const auto outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
+
+      // load some
+      const auto amplitude = g_amplitude[gtx];
+      const auto rms_x12 = g_rms_x12[hashedId];
+      const auto timeCalibConst = timeCalibConstant[hashedId];
+
+      int myBin = -1;
+      for (int bin = 0; bin < amplitudeBinsSize; bin++) {
+        if (amplitude > amplitudeBins[bin])
+          myBin = bin;
+        else
+          break;
+      }
+
+      ScalarType correction = 0;
+      if (myBin == -1) {
+        correction = shiftBins[0];
+      } else if (myBin == amplitudeBinsSize - 1) {
+        correction = shiftBins[myBin];
+      } else {
+        correction = shiftBins[myBin + 1] - shiftBins[myBin];
+        correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
+        correction += shiftBins[myBin];
+      }
+
+      // correction * 1./25.
+      correction = correction * 0.04;
+      const auto timeMax = g_timeMax[gtx];
+      const auto timeError = g_timeError[gtx];
+      const auto jitter = timeMax - 5 + correction;
+      const auto jitterError =
+          std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.
+
+#ifdef DEBUG_TIME_CORRECTION
+      //    if (gtx == 0) {
+      printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
+             gtx,
+             timeMax,
+             timeError,
+             jitter,
+             correction);
+//    }
+#endif
+
+      // store back to  global
+      g_jitter[gtx] = jitter;
+      g_jitterError[gtx] = jitterError;
+
+      // set the flag
+      // TODO: replace with something more efficient (if required),
+      // for now just to make it work
+      if (amplitude > amplitudeThreshold * rms_x12) {
+        auto threshP = outOfTimeThreshG12p;
+        auto threshM = outOfTimeThreshG12m;
+        if (amplitude > 3000.) {
+          for (int isample = 0; isample < nsamples; isample++) {
+            int gainid = ecal::mgpa::gainId(digis[nsamples * inputGtx + isample]);
+            if (gainid != 1) {
+              threshP = outOfTimeThreshG61p;
+              threshM = outOfTimeThreshG61m;
+              break;
+            }
+          }
+        }
+
+        const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
+        const auto nterm = timeNconst * rms_x12 / amplitude;
+        const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
+        if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
+          flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
+      }
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h
new file mode 100644
index 0000000000000..68500ddfe70d2
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h
@@ -0,0 +1,182 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h
+
+#include <iostream>
+#include <limits>
+
+#include <cuda.h>
+
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+  namespace multifit {
+
+    __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  bool const* useless_sample_values,
+                                                  SampleVector::Scalar* chi2s,
+                                                  SampleVector::Scalar* sum0s,
+                                                  SampleVector::Scalar* sumAAs,
+                                                  int const nchannels);
+    //
+    // launch ctx parameters are
+    // 45 threads per channel, X channels per block, Y blocks
+    // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+    // TODO: it might be much beter to use 32 threads per channel instead of 45
+    // to simplify the synchronization
+    //
+    __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  uint32_t const* dids_eb,
+                                                  uint32_t const* dids_ee,
+                                                  bool const* useless_sample_values,
+                                                  char const* pedestal_nums,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEB,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEE,
+                                                  ConfigurationParameters::type const* timeFitParametersEB,
+                                                  ConfigurationParameters::type const* timeFitParametersEE,
+                                                  SampleVector::Scalar const* sumAAsNullHypot,
+                                                  SampleVector::Scalar const* sum0sNullHypot,
+                                                  SampleVector::Scalar* tMaxAlphaBetas,
+                                                  SampleVector::Scalar* tMaxErrorAlphaBetas,
+                                                  SampleVector::Scalar* g_accTimeMax,
+                                                  SampleVector::Scalar* g_accTimeWgt,
+                                                  TimeComputationState* g_state,
+                                                  unsigned int const timeFitParameters_sizeEB,
+                                                  unsigned int const timeFitParameters_sizeEE,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEB,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEE,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEB,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEE,
+                                                  int const nchannels,
+                                                  uint32_t const offsetForInputs);
+
+    /// launch ctx parameters are
+    /// 10 threads per channel, N channels per block, Y blocks
+    /// TODO: do we need to keep the state around or can be removed?!
+    //#define DEBUG_FINDAMPLCHI2_AND_FINISH
+    __global__ void kernel_time_compute_findamplchi2_and_finish(
+        SampleVector::Scalar const* sample_values,
+        SampleVector::Scalar const* sample_value_errors,
+        uint32_t const* dids_eb,
+        uint32_t const* dids_ee,
+        bool const* useless_samples,
+        SampleVector::Scalar const* g_tMaxAlphaBeta,
+        SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
+        SampleVector::Scalar const* g_accTimeMax,
+        SampleVector::Scalar const* g_accTimeWgt,
+        ConfigurationParameters::type const* amplitudeFitParametersEB,
+        ConfigurationParameters::type const* amplitudeFitParametersEE,
+        SampleVector::Scalar const* sumAAsNullHypot,
+        SampleVector::Scalar const* sum0sNullHypot,
+        SampleVector::Scalar const* chi2sNullHypot,
+        TimeComputationState* g_state,
+        SampleVector::Scalar* g_ampMaxAlphaBeta,
+        SampleVector::Scalar* g_ampMaxError,
+        SampleVector::Scalar* g_timeMax,
+        SampleVector::Scalar* g_timeError,
+        int const nchannels,
+        uint32_t const offsetForInputs);
+
+    __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+                                                    uint16_t const* digis_ee,
+                                                    SampleVector::Scalar* sample_values,
+                                                    SampleVector::Scalar* sample_value_errors,
+                                                    bool* useless_sample_values,
+                                                    unsigned int const sample_mask,
+                                                    int const nchannels,
+                                                    uint32_t const offsetForInputs);
+
+    __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+                                             SampleVector::Scalar const* sample_value_errors,
+                                             uint32_t const* dids_eb,
+                                             uint32_t const* dids_ed,
+                                             bool const* useless_samples,
+                                             SampleVector::Scalar const* g_timeMax,
+                                             SampleVector::Scalar const* amplitudeFitParametersEB,
+                                             SampleVector::Scalar const* amplitudeFitParametersEE,
+                                             SampleVector::Scalar* g_amplitudeMax,
+                                             int const nchannels,
+                                             uint32_t const offsetForInputs);
+
+    //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+    __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
+                                                 uint32_t const* dids_eb,
+                                                 uint16_t const* digis_ee,
+                                                 uint32_t const* dids_ee,
+                                                 float const* rms_x12,
+                                                 float const* rms_x6,
+                                                 float const* rms_x1,
+                                                 float const* mean_x12,
+                                                 float const* mean_x6,
+                                                 float const* mean_x1,
+                                                 float const* gain12Over6,
+                                                 float const* gain6Over1,
+                                                 SampleVector::Scalar* sample_values,
+                                                 SampleVector::Scalar* sample_value_errors,
+                                                 SampleVector::Scalar* ampMaxError,
+                                                 bool* useless_sample_values,
+                                                 char* pedestal_nums,
+                                                 uint32_t const offsetForHashes,
+                                                 uint32_t const offsetForInputs,
+                                                 unsigned int const sample_maskEB,
+                                                 unsigned int const sample_maskEE,
+                                                 int nchannels);
+
+    ///
+    /// launch context parameters: 1 thread per channel
+    ///
+    //#define DEBUG_TIME_CORRECTION
+    __global__ void kernel_time_correction_and_finalize(
+        //        SampleVector::Scalar const* g_amplitude,
+        ::ecal::reco::StorageScalarType const* g_amplitude,
+        uint16_t const* digis_eb,
+        uint32_t const* dids_eb,
+        uint16_t const* digis_ee,
+        uint32_t const* dids_ee,
+        float const* amplitudeBinsEB,
+        float const* amplitudeBinsEE,
+        float const* shiftBinsEB,
+        float const* shiftBinsEE,
+        SampleVector::Scalar const* g_timeMax,
+        SampleVector::Scalar const* g_timeError,
+        float const* g_rms_x12,
+        float const* timeCalibConstant,
+        ::ecal::reco::StorageScalarType* g_jitter,
+        ::ecal::reco::StorageScalarType* g_jitterError,
+        uint32_t* flags,
+        int const amplitudeBinsSizeEB,
+        int const amplitudeBinsSizeEE,
+        ConfigurationParameters::type const timeConstantTermEB,
+        ConfigurationParameters::type const timeConstantTermEE,
+        float const offsetTimeValueEB,
+        float const offsetTimeValueEE,
+        ConfigurationParameters::type const timeNconstEB,
+        ConfigurationParameters::type const timeNconstEE,
+        ConfigurationParameters::type const amplitudeThresholdEB,
+        ConfigurationParameters::type const amplitudeThresholdEE,
+        ConfigurationParameters::type const outOfTimeThreshG12pEB,
+        ConfigurationParameters::type const outOfTimeThreshG12pEE,
+        ConfigurationParameters::type const outOfTimeThreshG12mEB,
+        ConfigurationParameters::type const outOfTimeThreshG12mEE,
+        ConfigurationParameters::type const outOfTimeThreshG61pEB,
+        ConfigurationParameters::type const outOfTimeThreshG61pEE,
+        ConfigurationParameters::type const outOfTimeThreshG61mEB,
+        ConfigurationParameters::type const outOfTimeThreshG61mEE,
+        uint32_t const offsetForHashes,
+        uint32_t const offsetForInputs,
+        int const nchannels);
+
+  }  // namespace multifit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h