From e12ea51bdd3bc8c2ba21b4f11fe2222b0146fe2e Mon Sep 17 00:00:00 2001
From: Viktor Khristenko <vdkhristenko1991@gmail.com>
Date: Mon, 30 Mar 2020 12:44:16 +0200
Subject: [PATCH 01/30] raw to digi is adapted to 110x patatrack branch

---
 CUDADataFormats/EcalDigi/BuildFile.xml        |   8 +
 .../EcalDigi/interface/DigisCollection.h      |  29 ++
 CUDADataFormats/EcalDigi/src/classes.h        |   3 +
 CUDADataFormats/EcalDigi/src/classes_def.xml  |   4 +
 EventFilter/EcalRawToDigi/BuildFile.xml       |   5 +
 .../EcalRawToDigi/interface/DeclsForKernels.h | 118 +++++
 .../interface/ElectronicsIdGPU.h              |  93 ++++
 .../interface/ElectronicsMappingGPU.h         |  45 ++
 .../EcalRawToDigi/interface/UnpackGPU.h       |  17 +
 .../EcalRawToDigi/plugins/BuildFile.xml       |   6 +
 .../plugins/EcalCPUDigisProducer.cc           | 152 ++++++
 .../plugins/EcalRawESProducerGPU.h            |  44 ++
 .../plugins/EcalRawESProducersGPUDefs.cc      |  14 +
 .../EcalRawToDigi/plugins/EcalRawToDigiGPU.cc | 175 +++++++
 .../src/ElectronicsMappingGPU.cc              |  62 +++
 EventFilter/EcalRawToDigi/src/UnpackGPU.cu    | 476 ++++++++++++++++++
 16 files changed, 1251 insertions(+)
 create mode 100644 CUDADataFormats/EcalDigi/BuildFile.xml
 create mode 100644 CUDADataFormats/EcalDigi/interface/DigisCollection.h
 create mode 100644 CUDADataFormats/EcalDigi/src/classes.h
 create mode 100644 CUDADataFormats/EcalDigi/src/classes_def.xml
 create mode 100644 EventFilter/EcalRawToDigi/interface/DeclsForKernels.h
 create mode 100644 EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h
 create mode 100644 EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h
 create mode 100644 EventFilter/EcalRawToDigi/interface/UnpackGPU.h
 create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
 create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h
 create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
 create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
 create mode 100644 EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
 create mode 100644 EventFilter/EcalRawToDigi/src/UnpackGPU.cu
diff --git a/CUDADataFormats/EcalDigi/BuildFile.xml b/CUDADataFormats/EcalDigi/BuildFile.xml
new file mode 100644
index 0000000000000..a1838ba91dc91
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/BuildFile.xml
@@ -0,0 +1,8 @@
+<use name="DataFormats/Common"/>
+<use name="CUDADataFormats/Common"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
+
+<export>
+  <lib   name="1"/>
+</export>
diff --git a/CUDADataFormats/EcalDigi/interface/DigisCollection.h b/CUDADataFormats/EcalDigi/interface/DigisCollection.h
new file mode 100644
index 0000000000000..31134e3ddbd8f
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/interface/DigisCollection.h
@@ -0,0 +1,29 @@
+#ifndef CUDADataFormats_EcalDigi_interface_DigisCollection_h
+#define CUDADataFormats_EcalDigi_interface_DigisCollection_h
+
+namespace ecal {
+
+//
+// this is basically a view 
+// it does not own the actual memory -> does not reclaim
+//
+struct DigisCollection {
+    DigisCollection() = default;  // empty view: null pointers and zero digis
+    DigisCollection(uint32_t *ids, uint16_t *data, uint32_t ndigis)
+        : ids{ids}, data{data}, ndigis{ndigis}
+    {}
+    DigisCollection(DigisCollection const&) = default;
+    DigisCollection& operator=(DigisCollection const&) = default;
+
+    DigisCollection(DigisCollection&&) = default;
+    DigisCollection& operator=(DigisCollection&&) = default;
+
+    // stride is statically known
+    uint32_t *ids=nullptr;   // not owned; ndigis entries
+    uint16_t *data=nullptr;  // not owned; stride entries per digi
+    uint32_t ndigis=0;       // initialised so a default-constructed view reads as empty
+};
+
+}
+
+#endif // CUDADataFormats_EcalDigi_interface_DigisCollection_h
diff --git a/CUDADataFormats/EcalDigi/src/classes.h b/CUDADataFormats/EcalDigi/src/classes.h
new file mode 100644
index 0000000000000..981b7334a8d24
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/src/classes.h
@@ -0,0 +1,3 @@
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
diff --git a/CUDADataFormats/EcalDigi/src/classes_def.xml b/CUDADataFormats/EcalDigi/src/classes_def.xml
new file mode 100644
index 0000000000000..07beed46d89d0
--- /dev/null
+++ b/CUDADataFormats/EcalDigi/src/classes_def.xml
@@ -0,0 +1,4 @@
+<lcgdict>
+    <class name="cms::cuda::Product<ecal::DigisCollection>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::DigisCollection>>" persistent="false"/>
+</lcgdict>
diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml
index 3bb940370c1f4..61a07973df153 100644
--- a/EventFilter/EcalRawToDigi/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/BuildFile.xml
@@ -18,6 +18,11 @@
 <use   name="RecoLocalCalo/EcalRecProducers"/>
 <use   name="Utilities/StorageFactory"/>
 
+<use   name="cuda"/>
+<use   name="HeterogeneousCore/CUDAUtilities"/>
+<use   name="HeterogeneousCore/CUDACore"/>
+<use   name="CUDADataFormats/EcalDigi" />
+
 <export>
   <lib   name="1"/>
 </export>
diff --git a/EventFilter/EcalRawToDigi/interface/DeclsForKernels.h b/EventFilter/EcalRawToDigi/interface/DeclsForKernels.h
new file mode 100644
index 0000000000000..b9a0e739019ad
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/interface/DeclsForKernels.h
@@ -0,0 +1,118 @@
+#ifndef EventFilter_EcalRawToDigi_interface_DeclsForKernels_h
+#define EventFilter_EcalRawToDigi_interface_DeclsForKernels_h
+
+#include <vector>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h"
+
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+
+namespace ecal { namespace raw {
+
+constexpr auto empty_event_size = EMPTYEVENTSIZE;   // FED payloads shorter than this are skipped by EcalRawToDigiGPU::acquire
+constexpr uint32_t nfeds_max = 54;                  // number of FED slots in the offsets/feds staging buffers
+constexpr uint32_t nbytes_per_fed_max = 10 * 1024;  // bytes reserved per FED slot in the raw data staging buffers
+
+struct InputDataCPU {  // host-side staging buffers filled by EcalRawToDigiGPU::acquire
+    std::vector<unsigned char, CUDAHostAllocator<unsigned char>> data;  // raw FED payloads, stored back to back
+    std::vector<uint32_t, CUDAHostAllocator<uint32_t>> offsets;         // byte offset of each stored payload inside `data`
+    std::vector<int, CUDAHostAllocator<int>> feds;                      // FED id of each stored payload
+
+    void allocate() {
+        // reserve nbytes_per_fed_max (10 KiB) for each of the nfeds_max FEDs
+        data.resize(nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max);
+        offsets.resize(nfeds_max, 0);
+        feds.resize(nfeds_max, 0);
+    }
+};
+
+struct ConfigurationParameters {
+    uint32_t maxChannels;  // capacity, in channels, of each device output buffer (see OutputDataGPU::allocate)
+};
+
+struct OutputDataCPU {
+    // channel counts read back on the host: [0] - eb, [1] - ee
+    std::vector<uint32_t, CUDAHostAllocator<uint32_t>> nchannels; 
+    
+    void allocate() {
+        nchannels.resize(2);  // one counter per subdetector
+    }
+};
+
+struct OutputDataGPU {
+    uint16_t *samplesEB=nullptr, *samplesEE = nullptr;  // device buffers, 10 samples per channel
+    uint32_t *idsEB=nullptr, *idsEE = nullptr;          // device buffers, one id per channel
+
+    // FIXME: we should separate max channels parameter for eb and ee
+    // FIXME: replace hardcoded values
+    void allocate(ConfigurationParameters const& config) {
+        cudaCheck( cudaMalloc((void**)&samplesEB,
+            config.maxChannels * sizeof(uint16_t) * 10) );
+        cudaCheck( cudaMalloc((void**)&samplesEE,
+            config.maxChannels * sizeof(uint16_t) * 10) );
+        cudaCheck( cudaMalloc((void**)&idsEB,
+            config.maxChannels * sizeof(uint32_t)) );
+        cudaCheck( cudaMalloc((void**)&idsEE,
+            config.maxChannels * sizeof(uint32_t)) );
+    }
+
+    void deallocate(ConfigurationParameters const& config) {
+        // every buffer is released on its own (cudaFree(nullptr) is a no-op) and
+        // the pointers are reset, so a repeated call cannot free anything twice
+        cudaCheck( cudaFree(samplesEB) ); samplesEB = nullptr;
+        cudaCheck( cudaFree(samplesEE) ); samplesEE = nullptr;
+        cudaCheck( cudaFree(idsEB) );     idsEB = nullptr;
+        cudaCheck( cudaFree(idsEE) );     idsEE = nullptr;
+    }
+};
+
+struct ScratchDataGPU {
+    // [0] = EB
+    // [1] = EE
+    uint32_t *pChannelsCounter=nullptr;
+
+    void allocate(ConfigurationParameters const& config) {
+        cudaCheck( cudaMalloc((void**)&pChannelsCounter,
+            sizeof(uint32_t) * 2) );
+    }
+
+    void deallocate(ConfigurationParameters const& config) {
+        // cudaFree(nullptr) is a no-op, so no guard is needed; the pointer is
+        // reset so a second call cannot free the same allocation twice
+        cudaCheck( cudaFree(pChannelsCounter) ); pChannelsCounter = nullptr;
+    }
+};
+
+struct InputDataGPU {
+    unsigned char *data=nullptr;  // device buffer, nbytes_per_fed_max * nfeds_max bytes
+    uint32_t *offsets=nullptr;    // device buffer, nfeds_max entries
+    int *feds=nullptr;            // device buffer, nfeds_max entries
+
+    void allocate() {
+        cudaCheck( cudaMalloc((void**)&data, 
+            sizeof(unsigned char) * nbytes_per_fed_max * nfeds_max) );
+        cudaCheck( cudaMalloc((void**)&offsets,
+            sizeof(uint32_t) * nfeds_max) );
+        cudaCheck( cudaMalloc((void**)&feds,
+            sizeof(int) * nfeds_max) );
+    }
+
+    void deallocate() {
+        // every buffer is released on its own (cudaFree(nullptr) is a no-op) and
+        // the pointers are reset, so a repeated call cannot free anything twice
+        cudaCheck( cudaFree(data) );    data = nullptr;
+        cudaCheck( cudaFree(offsets) ); offsets = nullptr;
+        cudaCheck( cudaFree(feds) );    feds = nullptr;
+    }
+};
+
+struct ConditionsProducts {  // bundle of conditions handed to entryPoint
+    ElectronicsMappingGPU::Product const& eMappingProduct;  // reference only: the referred product must outlive this bundle
+};
+
+}}
+
+#endif // EventFilter_EcalRawToDigi_interface_DeclsForKernels_h
diff --git a/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h b/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h
new file mode 100644
index 0000000000000..654ac2a42e0fe
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h
@@ -0,0 +1,93 @@
+#ifndef EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h
+#define EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h
+
+#include <cstdint>
+
+#include "DataFormats/EcalDetId/interface/EcalSubdetector.h"
+
+namespace ecal { namespace raw {
+
+/** \brief Ecal readout channel identification
+    [31:20] Unused (so far)
+    [19:13]  DCC id
+    [12:6]   tower
+    [5:3]    strip
+    [2:0]    xtal
+    Index starts from 1
+ */
+
+class ElectronicsIdGPU {
+public:
+  /** Default constructor -- invalid value */
+  constexpr ElectronicsIdGPU() : id_{0xFFFFFFFFu} {}
+  /** from raw */
+  constexpr ElectronicsIdGPU(uint32_t id) : id_{id} {}
+  /** Constructor from dcc, tower, strip, xtal; each value is masked to its field width **/
+  constexpr ElectronicsIdGPU(
+          uint8_t const dccid, uint8_t const towerid,
+          uint8_t const stripid, uint8_t const xtalid)
+      : id_{static_cast<uint32_t>(
+          (xtalid & 0x7) | ((stripid & 0x7) << 3) |
+          ((towerid & 0x7F) << 6) | ((dccid & 0x7F) << 13))}
+  {}
+  /// raw id accessors; both are const so they can be used on const/constexpr objects
+  constexpr uint32_t operator()() const { return id_; }
+  constexpr uint32_t rawId() const { return id_; }
+
+  /// get the DCC (Ecal Local DCC value not global one) id
+  constexpr uint8_t dccId() const { return (id_ >> 13) & 0x7F; }
+  /// get the tower id
+  constexpr uint8_t towerId() const { return (id_ >> 6) & 0x7F; }
+  /// get the strip id
+  constexpr uint8_t stripId() const { return (id_ >> 3) & 0x7; }
+  /// get the channel id
+  constexpr uint8_t xtalId() const { return (id_ & 0x7); }
+
+  /// get the subdet
+  //EcalSubdetector subdet() const;
+
+  /// get a fast, compact, unique index for linear lookups (maximum value = 4194303)
+  constexpr uint32_t linearIndex() const { return id_ & 0x3FFFFF; }
+
+  /// so far for EndCap only :
+  //int channelId() const;  // xtal id between 1 and 25
+
+  static constexpr int kTowersInPhi = 4; // see EBDetId
+  static constexpr int kCrystalsInPhi = 20; // see EBDetId
+
+  static constexpr uint8_t MAX_DCCID = 54;  //To be updated with correct and final number
+  static constexpr uint8_t MIN_DCCID = 1;
+  static constexpr uint8_t MAX_TOWERID = 70;
+  static constexpr uint8_t MIN_TOWERID = 1;
+  static constexpr uint8_t MAX_STRIPID = 5;
+  static constexpr uint8_t MIN_STRIPID = 1;
+  static constexpr uint8_t MAX_CHANNELID = 25;
+  static constexpr uint8_t MIN_CHANNELID = 1;
+  static constexpr uint8_t MAX_XTALID = 5;
+  static constexpr uint8_t MIN_XTALID = 1;
+
+  static constexpr int MIN_DCCID_EEM = 1;
+  static constexpr int MAX_DCCID_EEM = 9;
+  static constexpr int MIN_DCCID_EBM = 10;
+  static constexpr int MAX_DCCID_EBM = 27;
+  static constexpr int MIN_DCCID_EBP = 28;
+  static constexpr int MAX_DCCID_EBP = 45;
+  static constexpr int MIN_DCCID_EEP = 46;
+  static constexpr int MAX_DCCID_EEP = 54;
+
+  static constexpr int DCCID_PHI0_EBM = 10;
+  static constexpr int DCCID_PHI0_EBP = 28;
+
+  static constexpr int kDCCChannelBoundary = 17;
+  static constexpr int DCC_EBM = 10;  // id of the DCC in EB- which contains phi=0 deg.
+  static constexpr int DCC_EBP = 28;  // id of the DCC in EB+ which contains phi=0 deg.
+  static constexpr int DCC_EEM = 1;   // id of the DCC in EE- which contains phi=0 deg.
+  static constexpr int DCC_EEP = 46;  // id of the DCC in EE+ which contains phi=0 deg.
+
+private:
+  uint32_t id_;
+};
+
+}}
+
+#endif // EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h
diff --git a/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h b/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h
new file mode 100644
index 0000000000000..91dacbd883473
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h
@@ -0,0 +1,45 @@
+#ifndef EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h
+#define EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalMappingElectronics.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+namespace ecal { namespace raw {
+
+class ElectronicsMappingGPU {
+public:
+    struct Product {
+        ~Product();
+        uint32_t *eid2did = nullptr;  // null until allocated, so ~Product never frees an indeterminate pointer
+    };
+
+#ifndef __CUDACC__
+
+    // build the host-side electronics id -> det id lookup table
+    ElectronicsMappingGPU(EcalMappingElectronics const&);
+
+    // will call deallocation for Product thru ~Product
+    ~ElectronicsMappingGPU() = default;
+
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
+
+    // base name of this conditions object ("ESProducer" is appended for the module label)
+    static std::string name() { return std::string{"ecalElectronicsMappingGPU"}; }
+
+private:
+    // in the future, we need to arrange so to avoid this copy on the host
+    // indexed by the electronics id linear index, value is the det id raw id
+    std::vector<uint32_t, CUDAHostAllocator<uint32_t>> eid2did_;
+
+    cms::cuda::ESProduct<Product> product_;
+#endif
+};
+
+}}
+
+#endif // EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h
diff --git a/EventFilter/EcalRawToDigi/interface/UnpackGPU.h b/EventFilter/EcalRawToDigi/interface/UnpackGPU.h
new file mode 100644
index 0000000000000..8c80354699488
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/interface/UnpackGPU.h
@@ -0,0 +1,17 @@
+#ifndef EventFilter_EcalRawToDigi_interface_UnpackGPU_h
+#define EventFilter_EcalRawToDigi_interface_UnpackGPU_h
+
+#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
+
+namespace ecal { namespace raw {
+
+// FIXME: bundle up uint32_t values
+void entryPoint(  // host entry point of the unpacker; called from EcalRawToDigiGPU::acquire
+        InputDataCPU const&, InputDataGPU&, 
+        OutputDataGPU&, ScratchDataGPU&, 
+        OutputDataCPU&, ConditionsProducts const&,
+        cudaStream_t, uint32_t const, uint32_t const);  // stream, then (as passed by the caller) number of FEDs and total raw bytes
+
+}}
+
+#endif // EventFilter_EcalRawToDigi_interface_UnpackGPU_h
diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
index c2bfbb6adef14..296a6b2461f8c 100644
--- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
@@ -14,6 +14,12 @@
 <use   name="TrackingTools/Records"/>
 <use   name="TrackingTools/GeomPropagators"/>
 <use   name="TrackingTools/TrajectoryState"/>
+
+<use   name="cuda"/>
+<use   name="HeterogeneousCore/CUDAUtilities"/>
+<use   name="HeterogeneousCore/CUDACore"/>
+<use   name="CUDADataFormats/EcalDigi" />
+
 <library   file="*.cc" name="EventFilterEcalRawToDigiPlugins">
   <flags   EDM_PLUGIN="1"/>
 </library>
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
new file mode 100644
index 0000000000000..6f488053b204b
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
@@ -0,0 +1,152 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h" 
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+
+#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
+
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+
+#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
+#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
+
+class EcalCPUDigisProducer  // copies device-side ECAL digis to the host and emits EB/EE digi collections
+    : public edm::stream::EDProducer<edm::ExternalWork>
+{
+public:
+    explicit EcalCPUDigisProducer(edm::ParameterSet const& ps);
+    ~EcalCPUDigisProducer() override;
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+    void acquire(edm::Event const&,   // enqueues the device -> host copies
+                 edm::EventSetup const&,
+                 edm::WaitingTaskWithArenaHolder) override;
+    void produce(edm::Event&, edm::EventSetup const&) override;  // builds the output collections from the host buffers
+
+private:
+    edm::EDGetTokenT<cms::cuda::Product<ecal::DigisCollection>> digisInEBToken_,   // device-side inputs
+        digisInEEToken_;
+    edm::EDPutTokenT<EBDigiCollection> digisOutEBToken_;  // host-side outputs
+    edm::EDPutTokenT<EEDigiCollection> digisOutEEToken_;
+
+    // FIXME better way to pass pointers from acquire to produce?
+    std::vector<uint32_t, CUDAHostAllocator<uint32_t>> idsebtmp, idseetmp;    // host copies of the ids, filled in acquire
+    std::vector<uint16_t, CUDAHostAllocator<uint16_t>> dataebtmp, dataeetmp;  // host copies of the samples, filled in acquire
+};
+
+void EcalCPUDigisProducer::fillDescriptions(
+        edm::ConfigurationDescriptions& confDesc) {
+    // defaults of the two device-side input collections
+    edm::InputTag const defaultInEB{"ecalRawToDigiGPU", "ebDigisGPU"};
+    edm::InputTag const defaultInEE{"ecalRawToDigiGPU", "eeDigisGPU"};
+
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("digisInLabelEB", defaultInEB);
+    desc.add<edm::InputTag>("digisInLabelEE", defaultInEE);
+    // instance labels of the two host-side output collections
+    desc.add<std::string>("digisOutLabelEB", "ebDigis");
+    desc.add<std::string>("digisOutLabelEE", "eeDigis");
+    confDesc.add(std::string{"ecalCPUDigisProducer"}, desc);
+}
+
+EcalCPUDigisProducer::EcalCPUDigisProducer(
+        const edm::ParameterSet& ps)   // registers the two consumed and the two produced collections
+    : digisInEBToken_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
+        ps.getParameter<edm::InputTag>("digisInLabelEB"))}
+    , digisInEEToken_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
+        ps.getParameter<edm::InputTag>("digisInLabelEE"))}
+    , digisOutEBToken_{produces<EBDigiCollection>(
+        ps.getParameter<std::string>("digisOutLabelEB"))}
+    , digisOutEEToken_{produces<EEDigiCollection>(
+        ps.getParameter<std::string>("digisOutLabelEE"))}
+{}  // the host staging vectors start empty and are sized per event in acquire()
+
+EcalCPUDigisProducer::~EcalCPUDigisProducer() = default;  // members clean up after themselves
+
+void EcalCPUDigisProducer::acquire(
+        edm::Event const& event,
+        edm::EventSetup const& setup,
+        edm::WaitingTaskWithArenaHolder taskHolder) 
+{
+    // retrieve the device products; the context is built from the EB product
+    auto const& ebdigisProduct = event.get(digisInEBToken_);
+    auto const& eedigisProduct = event.get(digisInEEToken_);
+    cms::cuda::ScopedContextAcquire ctx{ebdigisProduct, std::move(taskHolder)};
+    auto const& ebdigis = ctx.get(ebdigisProduct);
+    auto const& eedigis = ctx.get(eedigisProduct);
+
+    // resize out tmp buffers: one id and 10 samples per digi
+    // FIXME remove hardcoded values
+    idsebtmp.resize(ebdigis.ndigis);
+    dataebtmp.resize(ebdigis.ndigis * 10);
+    idseetmp.resize(eedigis.ndigis);
+    dataeetmp.resize(eedigis.ndigis * 10);
+
+    // enqueue device -> host transfers on ctx.stream(); the calls are asynchronous
+    cudaCheck( cudaMemcpyAsync(dataebtmp.data(),
+                               ebdigis.data,
+                               dataebtmp.size() * sizeof(uint16_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(dataeetmp.data(),
+                               eedigis.data,
+                               dataeetmp.size() * sizeof(uint16_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(idsebtmp.data(),
+                               ebdigis.ids,
+                               idsebtmp.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(idseetmp.data(),
+                               eedigis.ids,
+                               idseetmp.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+}  // NOTE(review): produce() reads the buffers, presumably only after the stream work has finished -- confirm
+
+void EcalCPUDigisProducer::produce(
+        edm::Event& event, 
+        edm::EventSetup const& setup) 
+{
+    // both collections are sized from the id buffers filled in acquire()
+    auto ebOut = std::make_unique<EBDigiCollection>();
+    auto eeOut = std::make_unique<EEDigiCollection>();
+    ebOut->resize(idsebtmp.size());
+    eeOut->resize(idseetmp.size());
+
+    // the staging vectors use a different allocator than the collections, so
+    // the payload is copied byte-wise into the collections' own storage;
+    // constness of that storage is cast away to be able to write into it
+    auto const nbytesDataEB = dataebtmp.size() * sizeof(uint16_t);
+    auto const nbytesDataEE = dataeetmp.size() * sizeof(uint16_t);
+    auto const nbytesIdsEB = idsebtmp.size() * sizeof(uint32_t);
+    auto const nbytesIdsEE = idseetmp.size() * sizeof(uint32_t);
+
+    std::memcpy(const_cast<uint16_t*>(ebOut->data().data()), dataebtmp.data(), nbytesDataEB);
+    std::memcpy(const_cast<uint16_t*>(eeOut->data().data()), dataeetmp.data(), nbytesDataEE);
+    std::memcpy(const_cast<uint32_t*>(ebOut->ids().data()), idsebtmp.data(), nbytesIdsEB);
+    std::memcpy(const_cast<uint32_t*>(eeOut->ids().data()), idseetmp.data(), nbytesIdsEE);
+
+    // hand both collections over to the event
+    event.put(digisOutEBToken_, std::move(ebOut));
+    event.put(digisOutEEToken_, std::move(eeOut));
+}
+
+DEFINE_FWK_MODULE(EcalCPUDigisProducer);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h
new file mode 100644
index 0000000000000..2aa5e3bc8fe89
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h
@@ -0,0 +1,44 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalRawESProducerGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalRawESProducerGPU_h
+
+#include "FWCore/Framework/interface/ESProducer.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
+#include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/ModuleFactory.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+
+#include <iostream>
+
+template<typename Target, typename Source, typename Record>
+class EcalRawESProducerGPU : public edm::ESProducer {  // builds a Target from the Source conditions found in Record
+public:
+    explicit EcalRawESProducerGPU(edm::ParameterSet const& ps)  {
+        auto const label = ps.getParameter<std::string>("label");
+        auto name = ps.getParameter<std::string>("ComponentName");
+        auto cc = setWhatProduced(this, name);  // product is registered under ComponentName
+        cc.setConsumes(token_, edm::ESInputTag{"", label});  // Source is looked up by its product label
+    }
+   
+    std::unique_ptr<Target> produce(Record const& record) {
+        // retrieve conditions in old format 
+        auto sourceProduct = record.getTransientHandle(token_);
+
+        return std::make_unique<Target>(*sourceProduct);  // Target must be constructible from a Source
+    }
+
+    static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+        edm::ParameterSetDescription desc;
+
+        std::string label = Target::name() + "ESProducer";  // module label is derived from the target's name
+        desc.add<std::string>("ComponentName", "");
+        desc.add<std::string>("label", "")->setComment("Product Label");
+        confDesc.add(label, desc);
+    }
+
+private:
+    edm::ESGetToken<Source, Record> token_;  // set up in the constructor, used in produce()
+};
+
+#endif
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
new file mode 100644
index 0000000000000..6538cb0f32816
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
@@ -0,0 +1,14 @@
+#include "EcalRawESProducerGPU.h"
+
+#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
+
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+
+#include <iostream>
+
+using EcalElectronicsMappingGPUESProducer = EcalRawESProducerGPU<
+    ecal::raw::ElectronicsMappingGPU,   // Target
+    EcalMappingElectronics,             // Source
+    EcalMappingElectronicsRcd>;         // Record
+
+DEFINE_FWK_EVENTSETUP_MODULE(EcalElectronicsMappingGPUESProducer);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
new file mode 100644
index 0000000000000..3198017117cb6
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
@@ -0,0 +1,175 @@
+#include <iostream>
+#include <stdexcept>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h" 
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+
+#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
+
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+
+#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
+#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
+
+class EcalRawToDigiGPU  // packs the raw FED data of an event and hands it to ecal::raw::entryPoint
+    : public edm::stream::EDProducer<edm::ExternalWork>
+{
+public:
+    explicit EcalRawToDigiGPU(edm::ParameterSet const& ps);
+    ~EcalRawToDigiGPU() override;
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+    void acquire(edm::Event const&,   // stages the raw data and calls entryPoint
+                 edm::EventSetup const&,
+                 edm::WaitingTaskWithArenaHolder) override;
+    void produce(edm::Event&, edm::EventSetup const&) override;  // publishes views of the device buffers
+
+private:
+    edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;
+    edm::EDPutTokenT<cms::cuda::Product<ecal::DigisCollection>> digisEBToken_, 
+        digisEEToken_;
+
+    cms::cuda::ContextState cudaState_;  // shared between acquire() and produce()
+
+    std::vector<int> fedsToUnpack_;  // FED ids read from the "FEDs" parameter
+
+    ecal::raw::ConfigurationParameters config_;
+    // FIXME move this to use raii
+    ecal::raw::InputDataCPU inputCPU_;      // allocated once in the constructor
+    ecal::raw::InputDataGPU inputGPU_;      // allocated in the constructor, released in the destructor
+    ecal::raw::OutputDataGPU outputGPU_;    // allocated in the constructor, released in the destructor
+    ecal::raw::ScratchDataGPU scratchGPU_;  // allocated in the constructor, released in the destructor
+    ecal::raw::OutputDataCPU outputCPU_;    // allocated once in the constructor
+};
+
+void EcalRawToDigiGPU::fillDescriptions(
+        edm::ConfigurationDescriptions& confDesc) {
+    // default FED list: the 54 consecutive ids 601..654
+    std::vector<int> defaultFeds;
+    defaultFeds.reserve(54);
+    for (int fedId = 601; fedId < 601 + 54; ++fedId)
+        defaultFeds.push_back(fedId);
+
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
+    desc.add<std::vector<int>>("FEDs", defaultFeds);
+    desc.add<uint32_t>("maxChannels", 20000);
+    desc.add<std::string>("digisLabelEB", "ebDigisGPU");
+    desc.add<std::string>("digisLabelEE", "eeDigisGPU");
+    confDesc.add(std::string{"ecalRawToDigiGPU"}, desc);
+}
+
+EcalRawToDigiGPU::EcalRawToDigiGPU(
+        const edm::ParameterSet& ps)   // reads the configuration and allocates all buffers once
+    : rawDataToken_{consumes<FEDRawDataCollection>(ps.getParameter<edm::InputTag>(
+        "InputLabel"))}
+    , digisEBToken_{produces<cms::cuda::Product<ecal::DigisCollection>>(
+        ps.getParameter<std::string>("digisLabelEB"))}
+    , digisEEToken_{produces<cms::cuda::Product<ecal::DigisCollection>>(
+        ps.getParameter<std::string>("digisLabelEE"))}
+    , fedsToUnpack_{ps.getParameter<std::vector<int>>("FEDs")}
+{
+    config_.maxChannels = ps.getParameter<uint32_t>("maxChannels");  // must be set before the allocations below
+
+    inputCPU_.allocate();
+    inputGPU_.allocate();
+    outputGPU_.allocate(config_);
+    scratchGPU_.allocate(config_);
+    outputCPU_.allocate();
+}
+
+EcalRawToDigiGPU::~EcalRawToDigiGPU() {  // releases the device buffers allocated in the constructor
+    inputGPU_.deallocate();
+    outputGPU_.deallocate(config_);
+    scratchGPU_.deallocate(config_);
+}
+
+void EcalRawToDigiGPU::acquire(
+        edm::Event const& event,
+        edm::EventSetup const& setup,
+        edm::WaitingTaskWithArenaHolder holder) 
+{
+    // raii
+    cms::cuda::ScopedContextAcquire ctx{
+        event.streamID(), std::move(holder), cudaState_};
+
+    // conditions
+    edm::ESHandle<ecal::raw::ElectronicsMappingGPU> eMappingHandle;
+    setup.get<EcalMappingElectronicsRcd>().get(eMappingHandle);
+    auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream());
+
+    // bundle up conditions
+    ecal::raw::ConditionsProducts conditions{eMappingProduct};
+
+    // event data
+    edm::Handle<FEDRawDataCollection> rawDataHandle;
+    event.getByToken(rawDataToken_, rawDataHandle);
+
+    // pack every non-empty FED back to back into the host staging buffer
+    // TODO: another idea
+    //   - loop over all feds to unpack and enqueue cuda memcpy 
+    //   - accumulate the sizes
+    //   - after the loop launch cuda memcpy for sizes
+    //   - enqueue the kernel
+    uint32_t currentCummOffset = 0;
+    uint32_t counter = 0;
+    for (auto const& fed : fedsToUnpack_) {
+        auto const& data = rawDataHandle->FEDData(fed);
+        auto const nbytes = data.size();
+        // skip empty feds
+        if (nbytes < ecal::raw::empty_event_size)
+            continue;
+        // the staging buffers have a fixed capacity: fail loudly instead of overrunning them
+        if (counter >= ecal::raw::nfeds_max ||
+            nbytes > inputCPU_.data.size() - currentCummOffset)
+            throw std::runtime_error("EcalRawToDigiGPU: raw data of FED " +
+                std::to_string(fed) + " does not fit into the preallocated buffers");
+        // copy raw data into plain buffer, record its byte offset and its fed id
+        std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes);
+        inputCPU_.offsets[counter] = currentCummOffset;
+        inputCPU_.feds[counter] = fed;
+        // advance to the first free byte and the next free slot
+        currentCummOffset += nbytes;
+        ++counter;
+    }
+
+    ecal::raw::entryPoint(
+        inputCPU_, inputGPU_, outputGPU_, scratchGPU_, outputCPU_,
+        conditions, ctx.stream(), counter, currentCummOffset);
+}
+
+void EcalRawToDigiGPU::produce(
+        edm::Event& event, 
+        edm::EventSetup const& setup) 
+{
+    cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+    // barrel: wrap the device buffers in a non-owning view, sized by the
+    // channel count stored in outputCPU_.nchannels[0]
+    uint32_t const nEB = outputCPU_.nchannels[0];
+    ecal::DigisCollection viewEB{outputGPU_.idsEB, outputGPU_.samplesEB, nEB};
+
+    // endcap: same, with the count taken from outputCPU_.nchannels[1]
+    uint32_t const nEE = outputCPU_.nchannels[1];
+    ecal::DigisCollection viewEE{outputGPU_.idsEE, outputGPU_.samplesEE, nEE};
+
+    ctx.emplace(event, digisEBToken_, std::move(viewEB));
+    ctx.emplace(event, digisEEToken_, std::move(viewEE));
+}
+
+DEFINE_FWK_MODULE(EcalRawToDigiGPU);
diff --git a/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
new file mode 100644
index 0000000000000..c09a963b62a1d
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
@@ -0,0 +1,62 @@
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "DataFormats/EcalDetId/interface/EcalElectronicsId.h"
+
+namespace ecal { namespace raw {
+
+// Host-side table mapping EcalElectronicsId::linearIndex() -> raw det id.
+// linearIndex() can be as large as 0x3FFFFF, so the table needs
+// 0x3FFFFF + 1 slots (~16MB of uint32_t). TODO: tmp solution, too large.
+ElectronicsMappingGPU::ElectronicsMappingGPU(EcalMappingElectronics const& mapping)
+    : eid2did_(0x3FFFFF + 1)
+{
+    // fill in eb
+    // TODO: EB vector is actually empty
+    auto const& barrelValues = mapping.barrelItems();
+    for (unsigned int i=0; i<barrelValues.size(); i++) {
+        EcalElectronicsId eid{barrelValues[i].electronicsid};
+        EBDetId did{EBDetId::unhashIndex(i)};
+        eid2did_[eid.linearIndex()] = did.rawId();
+    }
+
+    // fill in ee
+    auto const& endcapValues = mapping.endcapItems();
+    for (unsigned int i=0; i<endcapValues.size(); i++) {
+        EcalElectronicsId eid{endcapValues[i].electronicsid};
+        EEDetId did{EEDetId::unhashIndex(i)};
+        eid2did_[eid.linearIndex()] = did.rawId();
+    }
+}
+
+ElectronicsMappingGPU::Product::~Product() {
+    // release the device buffer that getProduct() obtained with cudaMalloc
+    cudaCheck(cudaFree(eid2did));
+}
+
+// Return the Product that product_.dataForCurrentDeviceAsync provides for the
+// given stream. The filling lambda allocates the device table and enqueues
+// the host-to-device copy of eid2did_ on the stream it is called with.
+ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(
+        cudaStream_t cudaStream) const
+{
+    auto fill = [this](ElectronicsMappingGPU::Product& dev, cudaStream_t stream) {
+        // table size in bytes, identical for the allocation and the copy
+        auto const nbytes = eid2did_.size() * sizeof(uint32_t);
+
+        // allocate on the device
+        cudaCheck( cudaMalloc((void**)&dev.eid2did, nbytes) );
+
+        // enqueue the transfer
+        cudaCheck( cudaMemcpyAsync(
+            dev.eid2did, eid2did_.data(), nbytes, cudaMemcpyHostToDevice, stream) );
+    };
+
+    return product_.dataForCurrentDeviceAsync(cudaStream, fill);
+}
+
+}}
+
+TYPELOOKUP_DATA_REG(ecal::raw::ElectronicsMappingGPU);
diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
new file mode 100644
index 0000000000000..8c9f05535b70d
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
@@ -0,0 +1,476 @@
+#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h"
+
+namespace ecal { namespace raw {
+
+// Debug helper: hex-dump nbytes bytes of buffer, nbytes_per_row bytes per row.
+__forceinline__ __device__
+void print_raw_buffer(uint8_t const* const buffer, uint32_t const nbytes,
+        uint32_t const nbytes_per_row = 20) {
+    for (uint32_t ibyte = 0; ibyte < nbytes; ++ibyte) {
+        bool const startsNewRow = ibyte % nbytes_per_row == 0 && ibyte > 0;
+        if (startsNewRow) printf("\n");
+        printf("%02X ", buffer[ibyte]);
+    }
+}
+
+// Debug helper: for each of the first `size` 64-bit words print bits 63, 62, 61.
+__forceinline__ __device__
+void print_first3bits(uint64_t const* buffer, uint32_t size) {
+    for (uint32_t iword = 0; iword < size; ++iword) {
+        uint8_t const bit63 = (buffer[iword] >> 63) & 0x1;
+        uint8_t const bit62 = (buffer[iword] >> 62) & 0x1;
+        uint8_t const bit61 = (buffer[iword] >> 61) & 0x1;
+        printf("[word: %u] %u%u%u\n", iword, bit63, bit62, bit61);
+    }
+}
+
+// true when dccid lies inside [MIN_DCCID_EBM, MAX_DCCID_EBP]
+__forceinline__ __device__
+bool is_barrel(uint8_t dccid) {
+    return !(dccid < ElectronicsIdGPU::MIN_DCCID_EBM || dccid > ElectronicsIdGPU::MAX_DCCID_EBP);
+}
+
+// dcc id of a fed: fed - 600, truncated to 8 bits
+__forceinline__ __device__ uint8_t fed2dcc(int fed) { return static_cast<uint8_t>(fed - 600); }
+
+// z side for a barrel electronics id: -1 when the dcc id falls into
+// [MIN_DCCID_EBM, MAX_DCCID_EBM], +1 for every other dcc id (the EB+ range
+// is not checked explicitly).
+__forceinline__ __device__
+int zside_for_eb(ElectronicsIdGPU const& eid) {
+    int const dccId = eid.dccId();
+    bool const inEBMinus = dccId >= ElectronicsIdGPU::MIN_DCCID_EBM &&
+                           dccId <= ElectronicsIdGPU::MAX_DCCID_EBM;
+    if (inEBMinus)
+        return -1;
+    return 1;
+}
+
+// A tower block counts as synced when its bx matches the dcc bx (dccbx == 3564
+// is matched by bx == 0) and its l1 equals (dccl1 - 1) masked to 12 bits.
+__forceinline__ __device__
+bool is_synced_towerblock(
+        uint16_t const dccbx, uint16_t const bx, uint16_t const dccl1, uint16_t const l1) {
+    // 3564 on the dcc side pairs with 0 on the tower block side
+    bool const bxsync = dccbx == 3564 ? bx == 0 : bx == dccbx;
+    bool const l1sync = ((dccl1 - 1) & 0xfff) == l1;
+    return l1sync && bxsync;
+}
+
+// For EB there are two types of tower (LVRB top/bottom); returns true for
+// tower ids in 13-20, 29-36, 45-52 and 61-68 (bounds below are exclusive),
+// false for every other tower id.
+__forceinline__ __device__
+bool right_tower_for_eb(int tower) {
+    bool const isRight = (tower > 12 && tower < 21) ||
+                         (tower > 28 && tower < 37) ||
+                         (tower > 44 && tower < 53) ||
+                         (tower > 60 && tower < 69);
+    return isRight;
+}
+
+// Compute the raw EB det id for an electronics id, as in
+// Geometry/EcalMapping/.../EcalElectronicsMapping:
+//   - dcc and tower fix the first iphi / ieta of the tower
+//   - strip moves along ieta, xtal along iphi inside the tower
+//   - the result is the (Ecal, EcalBarrel) DetId with the ieta sign in bit 16
+//     (set for ieta > 0), |ieta| shifted by 9 and iphi in the lowest 9 bits
+__forceinline__ __device__
+uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) {
+    auto const dcc = eid.dccId();
+    auto const tower = eid.towerId();
+    auto const strip = eid.stripId();
+    auto const xtal = eid.xtalId();
+
+    bool const isEBMinus = zside_for_eb(eid) < 0;
+    // index of the tower along phi
+    auto const towerInPhi = (tower - 1) % ElectronicsIdGPU::kTowersInPhi;
+
+    // iphi of the tower edge: towers are counted in opposite directions on
+    // the two z sides
+    int smid = 0;
+    int iphi = 0;
+    if (isEBMinus) {
+        smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM;
+        iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi;
+        iphi += 5 * towerInPhi;
+    } else {
+        smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP;
+        iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi;
+        iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - towerInPhi - 1);
+    }
+
+    // ieta of the tower edge
+    int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1;
+
+    // strip offset along ieta: counted upwards for "right" towers, downwards
+    // for the others
+    bool const isRightTower = right_tower_for_eb(tower);
+    if (isRightTower)
+        ieta += (strip - 1);
+    else
+        ieta += 4 - (strip - 1);
+
+    // xtal offset along iphi: the counting direction flips with each of
+    // tower type, strip parity and z side, so xtal is used directly when an
+    // odd number of (right tower, odd strip, EB-) holds and mirrored otherwise
+    bool const isOddStrip = (strip % 2 == 1);
+    bool const countUp = (isRightTower != isOddStrip) != isEBMinus;
+    if (countUp)
+        iphi += (xtal - 1) + 1;
+    else
+        iphi += (4 - (xtal - 1)) + 1;
+
+    if (isEBMinus)
+        ieta = -ieta;
+
+    // pack everything into the raw id
+    DetId did{DetId::Ecal, EcalBarrel};
+    uint32_t const etaBits = (ieta > 0)
+        ? (0x10000 | (ieta << 9))
+        : ((-ieta) << 9);
+    return did.rawId() | etaBits | (iphi & 0x1FF);
+}
+
+// adc value: the lowest 12 bits of a raw sample
+__forceinline__ __device__ int adc(uint16_t sample) { return sample & 0xfff; }
+// gain id: bits 12 and 13 of a raw sample
+__forceinline__ __device__ int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }
+
+// Unpack the raw data of the ECAL FEDs into digis (ids + 10 samples per channel).
+//   - launch with one block per FED and NTHREADS threads per block: blockIdx.x
+//     selects the entry of offsets/feds, and within a tower block thread t
+//     handles channels t, t + NTHREADS, ...
+//   - data holds the concatenated FED payloads (nbytesTotal bytes in total),
+//     offsets[i] is the byte offset of the i-th FED inside data
+//   - output slots are reserved with atomicAdd on pChannelsCounterEBEE
+//     ([0] = EB, [1] = EE), which must be zeroed before the launch
+template<int NTHREADS>
+__global__
+void kernel_unpack_test(
+        unsigned char const* __restrict__ data,
+        uint32_t const* __restrict__ offsets,
+        int const* __restrict__ feds,
+        uint16_t *samplesEB,
+        uint16_t *samplesEE,
+        uint32_t *idsEB,
+        uint32_t *idsEE,
+        uint32_t *pChannelsCounterEBEE,
+        uint32_t const* eid2did,
+        uint32_t const nbytesTotal) {
+    // indices
+    auto const ifed = blockIdx.x;
+
+    // offset in bytes
+    auto const offset = offsets[ifed];
+    // fed id
+    auto const fed = feds[ifed];
+    auto const isBarrel = is_barrel(fed2dcc(fed));
+    // size
+    auto const size = ifed==gridDim.x-1 ? nbytesTotal - offset : offsets[ifed+1] - offset;
+    auto *samples = isBarrel ? samplesEB : samplesEE;
+    auto *ids = isBarrel ? idsEB : idsEE;
+    auto *pChannelsCounter = isBarrel
+        ? &pChannelsCounterEBEE[0]
+        : &pChannelsCounterEBEE[1];
+
+    // offset to the right raw buffer
+    uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
+
+    //
+    // fed header
+    //
+    //print_raw_buffer(reinterpret_cast<uint8_t const*>(buffer), 8);
+    //printf("\n");
+    auto const fed_header = buffer[0];
+    uint32_t fed_id = (fed_header >> 8) & 0xfff;
+    uint32_t bx = (fed_header >> 20) & 0xfff;
+    uint32_t lv1 = (fed_header >> 32) & 0xffffff;
+    uint8_t trigger_type = (fed_header >> 56) & 0xf;
+    uint8_t const bid_fed_header = (fed_header >> 60) & 0xf;
+    //printf("fed = %d fed_id = %u bx = %u lv1 = %u tt=%hhu  bid = 0x%u\n",
+    //    fed, fed_id, bx, lv1, trigger_type, bid_fed_header);
+
+    //
+    // dcc header: w1
+    //
+    //print_raw_buffer(reinterpret_cast<uint8_t const*>(buffer + 1), 8);
+    //printf("\n");
+    auto const dcc_header = buffer[1];
+    uint32_t event_length = dcc_header & 0xffffff;
+    uint8_t dcc_errors = (dcc_header >> 24) & 0xff;
+    uint32_t run_number = (dcc_header >> 32) & 0xffffff;
+    uint8_t const word_dcc = (dcc_header >> 56) & 0x3f;
+    uint8_t const bid_dcc_header = (dcc_header >> 62) & 0x3;
+    //printf("fed = %d size = %u event_length = %u dcc_errors = %u run_number = %u word_dcc = 0x%u bid_dcc_header = 0x%u\n",
+    //    fed, size, 8*event_length, static_cast<uint32_t>(dcc_errors), run_number, static_cast<uint32_t>(word_dcc), static_cast<uint32_t>(bid_dcc_header));
+
+    //
+    // dcc header w2
+    //
+    //print_raw_buffer(reinterpret_cast<uint8_t const*>(buffer + 2), 8);
+    //printf("\n");
+    auto const w2 = buffer[2];
+    uint32_t const run_type = w2 & 0xffffffff;
+    uint16_t const det_trigger_type = (w2 >> 32) & 0xffff;
+    uint8_t w2_dcc = (w2 >> 56) & 0x3f;
+    uint8_t w2_bid_dcc = (w2 >> 62) & 0x3;
+    //printf("run_type = %u det_trigger_type = %u w2_dcc = %u w2_bid_dcc = %u\n",
+    //    run_type, det_trigger_type, w2_dcc, w2_bid_dcc);
+
+    //
+    // dcc header w3
+    //
+    auto const w3 = buffer[3];
+    //print_raw_buffer(reinterpret_cast<uint8_t const*>(&w3), 8);
+    //printf("\n");
+    uint32_t const orbit_number = w3 & 0xffffffff;
+    uint8_t const sr = (w3 >> 32) & 0x1;
+    uint8_t const zs = (w3 >> 33) & 0x1;
+    uint8_t const tzs = (w3 >> 34) & 0x1;
+    uint8_t const sr_chstatus = (w3 >> 36) & 0xf;
+    uint8_t const tcc_chstatus1 = (w3 >> 40) & 0xf;
+    uint8_t const tcc_chstatus2 = (w3 >> 44) & 0xf;
+    uint8_t const tcc_chstatus3 = (w3 >> 48) & 0xf;
+    uint8_t const tcc_chstatus4 = (w3 >> 52) & 0xf;
+    uint8_t const w3_dcc = (w3 >> 56) & 0x3f;
+    uint8_t const w3_bid_dcc = (w3 >> 62) & 0x3;
+    //printf("orbit_number = %u sr = %u zs = %u tzs = %u sr_chstatus = %u\n",
+    //    orbit_number, static_cast<uint32_t>(sr), static_cast<uint32_t>(zs),
+    //    static_cast<uint32_t>(tzs), static_cast<uint32_t>(sr_chstatus));
+    //printf("tcc_chstatus1 = %u tcc_chstatus2 = %u tcc_chstatus3 = %u tcc_chstatus4 = %u\n",
+    //    static_cast<uint32_t>(tcc_chstatus1), static_cast<uint32_t>(tcc_chstatus2),
+    //    static_cast<uint32_t>(tcc_chstatus3), static_cast<uint32_t>(tcc_chstatus4));
+
+    //
+    // w4 - w8 (including 5 64-bit words)
+    //
+    /*
+    for (uint32_t i=0; i<5; i++) {
+        auto const wi = buffer[4 + i];
+        for (uint32_t i=0; i<14; i++) {
+            uint8_t value_i = (wi >> i*4) & 0xf;
+            printf("fe_chstatus_%u = %u  ", i, static_cast<uint32_t>(value_i));
+        }
+        uint8_t wi_dcc = (wi >> 56) & 0x3f;
+        uint8_t wi_bid_dcc = (wi >> 62) & 0x3;
+        printf("wi_dcc = %u wi_bid-dcc = %u\n",
+            static_cast<uint32_t>(wi_dcc), static_cast<uint32_t>(wi_bid_dcc));
+        printf("\n");
+    }
+    */
+
+    //
+    // TCC block
+    //
+    {
+        auto const w = buffer[9];
+        //print_raw_buffer(reinterpret_cast<uint8_t const*>(&w), 8);
+        //printf("\n");
+        uint8_t const tccid = w & 0xff;
+        uint8_t const bxlocal = (w >> 16) & 0xff;
+        uint8_t const e0 = (w >> 17) & 0x1;
+        uint8_t const w_bfield_0 = (w >> 29) & 0x7;
+        uint16_t const lv1local = (w >> 32) & 0xfff;
+        uint8_t const e1 = (w >> 44) & 0x1;
+        uint8_t const ntt = (w >> 48) & 0x7f;
+        uint8_t const ntimesamples = (w >> 55) & 0xf;
+        uint8_t const le0 = (w >> 59) & 0x1;
+        uint8_t const le1 = (w >> 60) & 0x1;
+        uint8_t const w_bfield_1 = (w >> 61) & 0x7;
+        //printf("tccid = %u bxlocal = %u e0 = %u w_bitfield_0 = %u lv1local = %u\n",
+        //    tccid, bxlocal, e0, w_bfield_0, lv1local);
+        //printf("e1 = %u ntt = %u ntimesamples = %u le0 = %u le1 = %u w_bfield_1 = %u\n",
+        //    e1, ntt, ntimesamples, le0, le1, w_bfield_1);
+    }
+
+    // 9 for fed + dcc header
+    // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block
+    // 6 for SR block size
+    //print_first3bits(buffer, size / 8);
+    //auto const* tower_block_start = buffer + 9 + 36 + 6;
+    //print_first3bits(tower_block_start, size / 8 - 10 - 36 - 6);
+
+    //
+    // walk over the tower blocks up to the trailer (last 64-bit word of the fed)
+    //
+    uint8_t ntccblockwords = isBarrel ? 18 : 36;
+    auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6;
+    auto const* trailer = buffer + (size / 8 - 1);
+    auto const* current_tower_block = tower_blocks_start;
+    while (current_tower_block < trailer) {
+        auto const w = *current_tower_block;
+        uint8_t ttid = w & 0xff;
+        uint8_t ntimesamples = (w >> 8) & 0x7f;
+        uint16_t bxlocal = (w >> 16) & 0xfff;
+        uint8_t e0 = (w >> 28) & 0x1;
+        uint8_t w_bfield_0 = (w >> 30) & 0x3;
+        uint16_t lv1local = (w >> 32) & 0xfff;
+        uint8_t e1 = (w >> 44) & 0x1;
+        uint16_t block_length = (w >> 48) & 0x1ff;
+        uint16_t w_bfield_1 = (w >> 62) & 0x3;
+        // corrupted length: 0 would never advance, too large would leave this fed
+        if (block_length == 0 || block_length > trailer - current_tower_block)
+            break;
+        uint16_t const dccbx = bx & 0xfff;
+        uint16_t const dccl1 = lv1 & 0xfff;
+        //printf("dccbx = %u bxlocal = %u dccl1 = %u l1local = %u\n",
+        //    dccbx, bxlocal, dccl1, lv1local);
+        if (!is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) {
+            current_tower_block += block_length;
+            continue;
+        }
+
+        //printf("ttid = %u ntimesamples = %u\ bxlocal = %u e0 = %u w_bfield_0 = %u\n",
+        //    ttid, ntimesamples, bxlocal, e0, w_bfield_0);
+        //printf("lv1local = %u e1 = %u block_length = %u w_bfield-1 = %u\n",
+        //    lv1local, e1, block_length, w_bfield_1);
+
+        // go thru all the channels
+        // get the next channel coordinates
+        uint32_t nchannels = (block_length - 1) / 3;
+
+        // 1 threads per channel in this block
+        for (uint32_t ich=0; ich<nchannels; ich+=NTHREADS) {
+            auto const i_to_access = ich + threadIdx.x;
+            // threads outside of the range -> leave the loop
+            if (i_to_access>=nchannels) break;
+
+            // inc the channel's counter and get the pos where to store
+            auto const wdata = current_tower_block[1 + i_to_access*3];
+            uint8_t const stripid = wdata & 0x7;
+            uint8_t const xtalid = (wdata >> 4) & 0x7;
+            ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid};
+            auto const didraw = isBarrel
+                ? compute_ebdetid(eid)
+                : eid2did[eid.linearIndex()];
+            // FIXME: what kind of channels are these guys
+            if (didraw == 0)
+                continue;
+
+            // get samples
+            uint16_t sampleValues[10];
+            sampleValues[0] = (wdata >> 16) & 0x3fff;
+            sampleValues[1] = (wdata >> 32) & 0x3fff;
+            sampleValues[2] = (wdata >> 48) & 0x3fff;
+            auto const wdata1 = current_tower_block[2+i_to_access*3];
+            sampleValues[3] = wdata1 & 0x3fff;
+            sampleValues[4] = (wdata1 >> 16) & 0x3fff;
+            sampleValues[5] = (wdata1 >> 32) & 0x3fff;
+            sampleValues[6] = (wdata1 >> 48) & 0x3fff;
+            auto const wdata2 = current_tower_block[3+i_to_access*3];
+            sampleValues[7] = wdata2 & 0x3fff;
+            sampleValues[8] = (wdata2 >> 16) & 0x3fff;
+            sampleValues[9] = (wdata2 >> 32) & 0x3fff;
+            //printf("stripid = %u xtalid = %u\n", stripid, xtalid);
+
+            // check gain
+            bool isSaturation = true;
+            short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1};
+            for (uint32_t si=0; si<10; si++) {
+                if (gainId(sampleValues[si]) == 0) {
+                    firstGainZeroSampID = si;
+                    firstGainZeroSampADC = adc(sampleValues[si]);
+                    break;
+                }
+            }
+            if (firstGainZeroSampID!=-1) {
+                unsigned int plateauEnd = std::min(10u ,(unsigned int)(firstGainZeroSampID+5));
+                for (unsigned int s=firstGainZeroSampID; s<plateauEnd; s++) {
+                    if( gainId(sampleValues[s])==0 &&
+                        adc(sampleValues[s])==firstGainZeroSampADC ) {;}
+                    else { isSaturation=false;  break;}  //it's not saturation
+                }
+                // get rid of channels which are stuck in gain0
+                if(firstGainZeroSampID<3) {isSaturation=false; }
+                if (!isSaturation)
+                    continue;
+            } else { // there is no zero gainId sample
+                // gain switch check
+                short numGain=1;
+                bool gainSwitchError = false;
+                for (unsigned int si=1; si<10; si++) {
+                    if ((gainId(sampleValues[si-1]) > gainId(sampleValues[si])) &&
+                        numGain<5) gainSwitchError=true;
+                    if (gainId(sampleValues[si-1]) == gainId(sampleValues[si])) numGain++;
+                    else numGain=1;
+                }
+                if (gainSwitchError)
+                    continue;
+            }
+
+            auto const pos = atomicAdd(pChannelsCounter, 1);
+
+            // store to global
+            ids[pos] = didraw;
+            samples[pos*10] = sampleValues[0];
+            samples[pos*10 + 1] = sampleValues[1];
+            samples[pos*10 + 2] = sampleValues[2];
+            samples[pos*10 + 3] = sampleValues[3];
+            samples[pos*10 + 4] = sampleValues[4];
+            samples[pos*10 + 5] = sampleValues[5];
+            samples[pos*10 + 6] = sampleValues[6];
+            samples[pos*10 + 7] = sampleValues[7];
+            samples[pos*10 + 8] = sampleValues[8];
+            samples[pos*10 + 9] = sampleValues[9];
+        }
+
+        current_tower_block += block_length;
+    }
+}
+
+// Enqueue on cudaStream everything needed to unpack one event:
+//   - host -> device copies of the raw data, per-FED offsets and FED ids
+//   - reset of the EB/EE channel counters
+//   - the unpacking kernel, one block per FED with data
+//   - device -> host copy of the EB/EE counters into outputCPU.nchannels
+// Nothing is synchronized here: the caller must wait on cudaStream before use.
+void entryPoint(
+        InputDataCPU const& inputCPU,
+        InputDataGPU& inputGPU,
+        OutputDataGPU& outputGPU,
+        ScratchDataGPU& scratchGPU,
+        OutputDataCPU& outputCPU,
+        ConditionsProducts const& conditions,
+        cudaStream_t cudaStream,
+        uint32_t const nfedsWithData,
+        uint32_t const nbytesTotal) {
+    // transfer
+    cudaCheck( cudaMemcpyAsync(inputGPU.data, inputCPU.data.data(),
+                               nbytesTotal * sizeof(unsigned char),
+                               cudaMemcpyHostToDevice, cudaStream) );
+    cudaCheck( cudaMemcpyAsync(inputGPU.offsets, inputCPU.offsets.data(),
+                               nfedsWithData * sizeof(uint32_t),
+                               cudaMemcpyHostToDevice, cudaStream) );
+    cudaCheck( cudaMemcpyAsync(inputGPU.feds, inputCPU.feds.data(),
+                               nfedsWithData * sizeof(int),
+                               cudaMemcpyHostToDevice, cudaStream) );
+    // EB + EE counters start from zero for every event
+    cudaCheck( cudaMemsetAsync(scratchGPU.pChannelsCounter, 0,
+                               sizeof(uint32_t) * 2, cudaStream) );
+
+    // a grid of 0 blocks is an invalid launch configuration: when no FED has
+    // data skip the kernel, the zeroed counters are still copied back below
+    if (nfedsWithData > 0) {
+        kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(
+            inputGPU.data,
+            inputGPU.offsets,
+            inputGPU.feds,
+            outputGPU.samplesEB,
+            outputGPU.samplesEE,
+            outputGPU.idsEB,
+            outputGPU.idsEE,
+            scratchGPU.pChannelsCounter,
+            conditions.eMappingProduct.eid2did,
+            nbytesTotal);
+        cudaCheck( cudaGetLastError() );
+    }
+
+    // transfer the counters for how many eb and ee channels we got
+    cudaCheck( cudaMemcpyAsync(outputCPU.nchannels.data(), scratchGPU.pChannelsCounter,
+                               sizeof(uint32_t) * 2,
+                               cudaMemcpyDeviceToHost, cudaStream) );
+}
+
+}}

From dd8cd82dbfbe66ebb6f4016590fb57d848a49e14 Mon Sep 17 00:00:00 2001
From: Viktor Khristenko <vdkhristenko1991@gmail.com>
Date: Mon, 30 Mar 2020 15:37:21 +0200
Subject: [PATCH 02/30] adding validation source

---
 EventFilter/EcalRawToDigi/bin/BuildFile.xml   |   7 +
 .../makeEcalRaw2DigiGpuValidationPlots.cpp    | 224 ++++++++++++++++++
 2 files changed, 231 insertions(+)
 create mode 100644 EventFilter/EcalRawToDigi/bin/BuildFile.xml
 create mode 100644 EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp

diff --git a/EventFilter/EcalRawToDigi/bin/BuildFile.xml b/EventFilter/EcalRawToDigi/bin/BuildFile.xml
new file mode 100644
index 0000000000000..792fe438d8799
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/bin/BuildFile.xml
@@ -0,0 +1,7 @@
+<bin name="makeEcalRaw2DigiGpuValidationPlots" file="makeEcalRaw2DigiGpuValidationPlots.cpp">
+    <use name="root"/>
+    <use name="rootgraphics"/>
+    <use name="DataFormats/Common"/>
+    <use name="DataFormats/EcalDigi"/>
+    <use name="DataFormats/EcalDetId"/>
+</bin>
diff --git a/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..9fc9ec26e3714
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp
@@ -0,0 +1,224 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+
+// Compare the digis of the GPU and CPU unpackers stored in the input file:
+// fills ADC / gain id histograms, writes them to the output file and plots.pdf.
+int main(int argc, char *argv[]) {
+    if (argc<3) {
+        std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
+        return 1;
+    }
+    // branches to use
+    edm::Wrapper<EBDigiCollection> *wgpuEB=nullptr, *wcpuEB=nullptr;
+    edm::Wrapper<EEDigiCollection> *wgpuEE=nullptr, *wcpuEE=nullptr;
+
+    std::string inFileName{argv[1]};
+    std::string outFileName{argv[2]};
+    // prep output
+    TFile rfout{outFileName.c_str(), "recreate"};
+
+    int const nbins = 400;
+    float const last = 4096.;
+    auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last);
+    auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last);
+    auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last);
+    auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last);
+
+    auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4);
+    auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4);
+    auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4);
+    auto hGainEECPU = new TH1D("hGainEECPU", "hGainEECPU", 4, 0, 4);
+
+    auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU",
+        nbins, 0, last, nbins, 0, last);
+    auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU",
+        nbins, 0, last, nbins, 0, last);
+    auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU",
+        4, 0, 4, 4, 0, 4);
+    auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU",
+        4, 0, 4, 4, 0, 4);
+
+    // prep input
+    TFile rfin{inFileName.c_str()};
+    TTree *rt = (TTree*)rfin.Get("Events");
+    if (rt == nullptr) {
+        std::cerr << "*** no 'Events' tree found in " << inFileName << std::endl;
+        return 1;
+    }
+    rt->SetBranchAddress("EBDigiCollection_ecalCPUDigisProducer_ebDigis_RECO.",
+        &wgpuEB);
+    rt->SetBranchAddress("EEDigiCollection_ecalCPUDigisProducer_eeDigis_RECO.",
+        &wgpuEE);
+    rt->SetBranchAddress("EBDigiCollection_ecalDigis_ebDigis_RECO.",
+        &wcpuEB);
+    rt->SetBranchAddress("EEDigiCollection_ecalDigis_eeDigis_RECO.",
+        &wcpuEE);
+
+    // accumulate
+    auto const nentries = rt->GetEntries();
+    std::cout << ">>> nentries = " << nentries << std::endl;
+    for (int ie=0; ie<nentries; ++ie) {
+        rt->GetEntry(ie);
+
+        auto const ngpuebs = wgpuEB->bareProduct().size();
+        auto const ncpuebs = wcpuEB->bareProduct().size();
+        auto const ngpuees = wgpuEE->bareProduct().size();
+        auto const ncpuees = wcpuEE->bareProduct().size();
+
+        if (ngpuebs!=ncpuebs or ngpuees!=ncpuees) {
+            std::cerr << "*** mismatch in ndigis: "
+                      << "ie = " << ie
+                      << "  ngpuebs = " << ngpuebs
+                      << "  ncpuebs = " << ncpuebs
+                      << "  ngpuees = " << ngpuees
+                      << "  ncpuees = " << ncpuees
+                      << std::endl;
+        }
+
+        // assume identical sizes
+        auto const& idsgpuEB = wgpuEB->bareProduct().ids();
+        auto const& datagpuEB = wgpuEB->bareProduct().data();
+        auto const& idscpuEB = wcpuEB->bareProduct().ids();
+        auto const& datacpuEB = wcpuEB->bareProduct().data();
+        for (uint32_t ieb=0; ieb<ngpuebs; ++ieb) {
+            auto const& idgpu = idsgpuEB[ieb];
+            auto iter2idcpu = std::find(idscpuEB.begin(), idscpuEB.end(),
+                idgpu);
+            // a digi found only on the GPU has no CPU samples to compare with: report and skip
+            if (iter2idcpu == idscpuEB.end()) { std::cerr << "*** EB id " << idgpu << " missing on CPU\n"; continue; }
+
+            auto const ptrdiff = iter2idcpu - idscpuEB.begin();
+            for (uint32_t s=0u; s<10u; s++) {
+                EcalMGPASample sampleGPU{datagpuEB[ieb*10 + s]};
+                EcalMGPASample sampleCPU{datacpuEB[ptrdiff * 10 + s]};
+
+                hADCEBGPU->Fill(sampleGPU.adc());
+                hGainEBGPU->Fill(sampleGPU.gainId());
+                hADCEBCPU->Fill(sampleCPU.adc());
+                hGainEBCPU->Fill(sampleCPU.gainId());
+                hADCEBGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc());
+                hGainEBGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId());
+            }
+        }
+
+        auto const& idsgpuEE = wgpuEE->bareProduct().ids();
+        auto const& datagpuEE = wgpuEE->bareProduct().data();
+        auto const& idscpuEE = wcpuEE->bareProduct().ids();
+        auto const& datacpuEE = wcpuEE->bareProduct().data();
+        for (uint32_t iee=0; iee<ngpuees; ++iee) {
+            auto const& idgpu = idsgpuEE[iee];
+            auto iter2idcpu = std::find(idscpuEE.begin(), idscpuEE.end(),
+                idgpu);
+            // a digi found only on the GPU has no CPU samples to compare with: report and skip
+            if (iter2idcpu == idscpuEE.end()) { std::cerr << "*** EE id " << idgpu << " missing on CPU\n"; continue; }
+
+            // get the digis
+            auto const ptrdiff = iter2idcpu - idscpuEE.begin();
+            for (uint32_t s=0u; s<10u; s++) {
+                EcalMGPASample sampleGPU{datagpuEE[iee * 10 + s]};
+                EcalMGPASample sampleCPU{datacpuEE[ptrdiff * 10 + s]};
+
+                hADCEEGPU->Fill(sampleGPU.adc());
+                hGainEEGPU->Fill(sampleGPU.gainId());
+                hADCEECPU->Fill(sampleCPU.adc());
+                hGainEECPU->Fill(sampleCPU.gainId());
+                hADCEEGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc());
+                hGainEEGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId());
+            }
+        }
+    }
+
+    {
+        TCanvas c{"plots", "plots", 4200, 6200};
+        c.Divide(2, 4);
+        c.cd(1);
+        {
+            gPad->SetLogy();
+            hADCEBCPU->SetLineColor(kBlack);
+            hADCEBCPU->SetLineWidth(1.);
+            hADCEBCPU->Draw("");
+            hADCEBGPU->SetLineColor(kBlue);
+            hADCEBGPU->SetLineWidth(1.);
+            hADCEBGPU->Draw("sames");
+            gPad->Update();
+            auto stats = (TPaveStats*)hADCEBGPU->FindObject("stats");
+            auto y2 = stats->GetY2NDC();
+            auto y1 = stats->GetY1NDC();
+            stats->SetY2NDC(y1);
+            stats->SetY1NDC(y1 - (y2-y1));
+        }
+        c.cd(2);
+        {
+            gPad->SetLogy();
+            hADCEECPU->SetLineColor(kBlack);
+            hADCEECPU->SetLineWidth(1.);
+            hADCEECPU->Draw("");
+            hADCEEGPU->SetLineColor(kBlue);
+            hADCEEGPU->SetLineWidth(1.);
+            hADCEEGPU->Draw("sames");
+            gPad->Update();
+            auto stats = (TPaveStats*)hADCEEGPU->FindObject("stats");
+            auto y2 = stats->GetY2NDC();
+            auto y1 = stats->GetY1NDC();
+            stats->SetY2NDC(y1);
+            stats->SetY1NDC(y1 - (y2-y1));
+        }
+        c.cd(3);
+        {
+            gPad->SetLogy();
+            hGainEBCPU->SetLineColor(kBlack);
+            hGainEBCPU->SetLineWidth(1.);
+            hGainEBCPU->Draw("");
+            hGainEBGPU->SetLineColor(kBlue);
+            hGainEBGPU->SetLineWidth(1.);
+            hGainEBGPU->Draw("sames");
+            gPad->Update();
+            auto stats = (TPaveStats*)hGainEBGPU->FindObject("stats");
+            auto y2 = stats->GetY2NDC();
+            auto y1 = stats->GetY1NDC();
+            stats->SetY2NDC(y1);
+            stats->SetY1NDC(y1 - (y2-y1));
+        }
+        c.cd(4);
+        {
+            gPad->SetLogy();
+            hGainEECPU->SetLineColor(kBlack);
+            hGainEECPU->SetLineWidth(1.);
+            hGainEECPU->Draw("");
+            hGainEEGPU->SetLineColor(kBlue);
+            hGainEEGPU->SetLineWidth(1.);
+            hGainEEGPU->Draw("sames");
+            gPad->Update();
+            auto stats = (TPaveStats*)hGainEEGPU->FindObject("stats");
+            auto y2 = stats->GetY2NDC();
+            auto y1 = stats->GetY1NDC();
+            stats->SetY2NDC(y1);
+            stats->SetY1NDC(y1 - (y2-y1));
+        }
+        c.cd(5);
+        hADCEBGPUvsCPU->Draw("colz");
+        c.cd(6);
+        hADCEEGPUvsCPU->Draw("colz");
+        c.cd(7);
+        hGainEBGPUvsCPU->Draw("colz");
+        c.cd(8);
+        hGainEEGPUvsCPU->Draw("colz");
+        c.SaveAs("plots.pdf");
+    }
+
+    rfin.Close();
+    rfout.Write();
+    rfout.Close();
+}

From 0d118e117ed6f7414f4436139a59310f18eab02b Mon Sep 17 00:00:00 2001
From: Viktor Khristenko <vdkhristenko1991@gmail.com>
Date: Mon, 30 Mar 2020 15:44:49 +0200
Subject: [PATCH 03/30] cuda data formats ecal rechit  fixes for 110x

---
 CUDADataFormats/EcalRecHitSoA/BuildFile.xml   |  2 +
 .../interface/EcalUncalibratedRecHit_soa.h    | 77 +++++++++++--------
 .../EcalRecHitSoA/interface/RecoTypes.h       | 10 +--
 CUDADataFormats/EcalRecHitSoA/src/classes.h   |  1 +
 .../EcalRecHitSoA/src/classes_def.xml         | 18 ++++-
 5 files changed, 68 insertions(+), 40 deletions(-)

diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
index 794d2bf7abead..927a7a57a86a7 100644
--- a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
+++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
@@ -1,6 +1,8 @@
 <use name="DataFormats/Common"/>
+<use name="CUDADataFormats/Common"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="cuda"/>
 
 <export>
   <lib   name="1"/>
diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h
index e11c13ebdf4c2..d43f77315476d 100644
--- a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h
+++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h
@@ -11,29 +11,39 @@
 
 namespace ecal {
 
-  namespace Tag {
+namespace Tag {
 
-    struct soa {};
-    struct ptr {};
+struct soa {};  // layout tag: type_wrapper<T, soa>::type is a std::vector
+struct ptr {};  // layout tag: type_wrapper<T, ptr>::type is a raw pointer T*
 
-  }  // namespace Tag
+}
 
-  template <typename T, typename L = Tag::soa>
-  struct type_wrapper {
-    //#ifndef ECAL_MULTIFIT_DONOT_USE_PINNED_MEM
-    //    using type = std::vector<T, CUDAHostAllocator<T>>;
-    //#else
-    using type = std::vector<T>;
-    //#endif
-  };
+namespace Detail {
 
-  template <typename T>
-  struct type_wrapper<T, Tag::ptr> {
+// Empty base: used for every layout tag other than Tag::ptr, adds no members.
+template<typename T>
+struct Base {};
+
+// Specialization for the Tag::ptr layout: adds the number of values as a member.
+template<>
+struct Base<::ecal::Tag::ptr> {
+    uint32_t size;  // number of values; left uninitialized by this struct
+};
+
+}  // namespace Detail
+
+template<typename T, typename L = Tag::soa>  // primary template: any layout tag without its own specialization
+struct type_wrapper {
+    using type = std::vector<T, CUDAHostAllocator<T>>;  // vector with the CUDA host allocator (presumably pinned memory - confirm)
+};
+
+template<typename T>
+struct type_wrapper<T, Tag::ptr> {
     using type = T*;
-  };
+};
 
-  template <typename L = Tag::soa>
-  struct UncalibratedRecHit {
+template<typename L = Tag::soa>
+struct UncalibratedRecHit : public Detail::Base<L> {
     UncalibratedRecHit() = default;
     UncalibratedRecHit(const UncalibratedRecHit&) = default;
     UncalibratedRecHit& operator=(const UncalibratedRecHit&) = default;
@@ -43,8 +53,8 @@ namespace ecal {
 
     // TODO: std::array causes root's dictionary problems
     typename type_wrapper<reco::ComputationScalarType, L>::type amplitudesAll;
-    //    typename type_wrapper<std::array<reco::ComputationScalarType,
-    //        EcalDataFrame::MAXSAMPLES>, L>::type amplitudesAll;
+//    typename type_wrapper<std::array<reco::ComputationScalarType, 
+//        EcalDataFrame::MAXSAMPLES>, L>::type amplitudesAll;
     typename type_wrapper<reco::StorageScalarType, L>::type amplitude;
     typename type_wrapper<reco::StorageScalarType, L>::type chi2;
     typename type_wrapper<reco::StorageScalarType, L>::type pedestal;
@@ -53,21 +63,22 @@ namespace ecal {
     typename type_wrapper<uint32_t, L>::type did;
     typename type_wrapper<uint32_t, L>::type flags;
 
-    template <typename U = L>
-    typename std::enable_if<std::is_same<U, Tag::soa>::value, void>::type resize(size_t size) {
-      amplitudesAll.resize(size * EcalDataFrame::MAXSAMPLES);
-      amplitude.resize(size);
-      pedestal.resize(size);
-      chi2.resize(size);
-      did.resize(size);
-      flags.resize(size);
-      jitter.resize(size);
-      jitterError.resize(size);
+    template<typename U = L>  // U mirrors L so that the enable_if below depends on a template parameter of resize
+    typename std::enable_if<std::is_same<U, Tag::soa>::value, void>::type  // resize() is only available for Tag::soa
+    resize(size_t size) {  // size: number of elements of every array (amplitudesAll gets MAXSAMPLES times as many)
+        amplitudesAll.resize(size * EcalDataFrame::MAXSAMPLES);  // MAXSAMPLES values per element
+        amplitude.resize(size);
+        pedestal.resize(size);
+        chi2.resize(size);
+        did.resize(size);
+        flags.resize(size);
+        jitter.resize(size);
+        jitterError.resize(size);
     }
-  };
+};
 
-  using SoAUncalibratedRecHitCollection = UncalibratedRecHit<Tag::soa>;
+using SoAUncalibratedRecHitCollection = UncalibratedRecHit<Tag::soa>;
 
-}  // namespace ecal
+}
 
-#endif  // RecoLocalCalo_EcalRecAlgos_interface_EcalUncalibratedRecHit_soa_h
+#endif // RecoLocalCalo_EcalRecAlgos_interface_EcalUncalibratedRecHit_soa_h
diff --git a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h
index 5667a9225f29d..cf8571feb01ae 100644
--- a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h
+++ b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h
@@ -1,13 +1,11 @@
 #ifndef CUDADataFormats_EcalRecHitSoA_interface_RecoTypes
 #define CUDADataFormats_EcalRecHitSoA_interface_RecoTypes
 
-namespace ecal {
-  namespace reco {
+namespace ecal { namespace reco {
 
-    using ComputationScalarType = float;
-    using StorageScalarType = float;
+using ComputationScalarType = float;  // element type of UncalibratedRecHit::amplitudesAll
+using StorageScalarType = float;  // element type of UncalibratedRecHit::amplitude, chi2 and pedestal
 
-  }  // namespace reco
-}  // namespace ecal
+}}
 
 #endif
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes.h b/CUDADataFormats/EcalRecHitSoA/src/classes.h
index 8ad6b8d684b9a..3cab9957e62b4 100644
--- a/CUDADataFormats/EcalRecHitSoA/src/classes.h
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes.h
@@ -1,2 +1,3 @@
 #include "DataFormats/Common/interface/Wrapper.h"
+#include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
index 461460835a723..68056d21ad4c1 100644
--- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
@@ -1,5 +1,21 @@
 <lcgdict>
+    <class name="std::vector<float, CUDAHostAllocator<float, 0>>" />
+    <class name="std::vector<double, CUDAHostAllocator<double, 0>>" />
+    <class name="std::vector<uint32_t, CUDAHostAllocator<uint32_t, 0>>" />
     <class name="ecal::Tag::soa"/>
+    <class name="ecal::Detail::Base<ecal::Tag::soa>" />
+
+    <class name="cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>" persistent="false" />
+    <class name="edm::Wrapper<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>" persistent="false" />
+
+    <!--
+    <class name="std::vector<double, CUDAHostAllocator<double> >"/>
+    <class name="std::vector<float, CUDAHostAllocator<float> >"/>
+    <class name="std::vector<unsigned int, CUDAHostAllocator<unsigned int> >" />
+    -->
+
+    <!--  <class name="std::array<double, 10>" />
+    <class name="std::array<float, 10>" /> -->
     <class name="ecal::UncalibratedRecHit<ecal::Tag::soa>"/>
-    <class name="edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>>"/>
+    <class name="edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa> >"/>
 </lcgdict>

From 894d06a45295947d8ab106a4a0f3428e63e96995 Mon Sep 17 00:00:00 2001
From: Viktor Khristenko <vdkhristenko1991@gmail.com>
Date: Mon, 30 Mar 2020 16:17:27 +0200
Subject: [PATCH 04/30] ecal reco algos adapted for 110x

---
 RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml  |    2 +-
 .../bin/makeEcalGpuValidationPlots.cpp        |  232 ---
 ...eEcalMultifitResultsGpuValidationPlots.cpp |  261 +++
 .../EcalRecAlgos/interface/DeclsForKernels.h  |  420 ++--
 .../interface/EcalGainRatiosGPU.h             |   37 +-
 .../EcalRecAlgos/interface/EcalPedestalsGPU.h |   48 +-
 .../interface/EcalPulseCovariancesGPU.h       |   35 +-
 .../interface/EcalPulseShapesGPU.h            |   35 +-
 .../interface/EcalSamplesCorrelationGPU.h     |   49 +-
 .../interface/EcalTimeBiasCorrectionsGPU.h    |   47 +-
 .../interface/EcalTimeCalibConstantsGPU.h     |   39 +-
 .../EcalUncalibRecHitMultiFitAlgo_gpu_new.h   |   20 +-
 .../interface/EigenMatrixTypes_gpu.h          |   73 +-
 .../src/AmplitudeComputationCommonKernels.cu  |  819 ++++----
 .../src/AmplitudeComputationCommonKernels.h   |  151 +-
 .../src/AmplitudeComputationKernels.cu        |  425 +++++
 .../src/AmplitudeComputationKernels.h         |   27 +
 .../src/AmplitudeComputationKernelsV1.cu      |  372 ----
 .../src/AmplitudeComputationKernelsV1.h       |   50 -
 .../EcalRecAlgos/src/EcalGainRatiosGPU.cc     |   83 +-
 .../EcalRecAlgos/src/EcalPedestalsGPU.cc      |  167 +-
 .../src/EcalPulseCovariancesGPU.cc            |   68 +-
 .../EcalRecAlgos/src/EcalPulseShapesGPU.cc    |   68 +-
 .../src/EcalSamplesCorrelationGPU.cc          |  143 +-
 .../src/EcalTimeBiasCorrectionsGPU.cc         |  111 +-
 .../src/EcalTimeCalibConstantsGPU.cc          |   65 +-
 .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu  |  366 ++--
 .../EcalRecAlgos/src/KernelHelpers.cu         |  156 +-
 .../EcalRecAlgos/src/KernelHelpers.h          |  464 ++++-
 .../src/TimeComputationKernels.cu             | 1688 +++++++++--------
 .../EcalRecAlgos/src/TimeComputationKernels.h |  209 +-
 .../EcalRecAlgos/src/inplace_fnnls.cu         |  198 +-
 .../EcalRecAlgos/src/inplace_fnnls.h          |   28 +-
 33 files changed, 3914 insertions(+), 3042 deletions(-)
 delete mode 100644 RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp
 create mode 100644 RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h
 delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu
 delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h

diff --git a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
index bb20a5ac3e6da..bf61d052856ad 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
@@ -1,4 +1,4 @@
-<bin name="makeEcalGpuValidationPlots" file="makeEcalGpuValidationPlots.cpp">
+<bin name="makeEcalMultifitResultsGpuValidationPlots" file="makeEcalMultifitResultsGpuValidationPlots.cpp">
     <use name="root"/>
     <use name="rootgraphics"/>
     <use name="CUDADataFormats/EcalRecHitSoA"/>
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp
deleted file mode 100644
index 9691a07fc5e0a..0000000000000
--- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-#include <iomanip>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include <TCanvas.h>
-#include <TFile.h>
-#include <TH1D.h>
-#include <TH2D.h>
-#include <TTree.h>
-
-#include "DataFormats/Common/interface/Wrapper.h"
-#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
-#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
-#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
-
-int main(int argc, char *argv[]) {
-  if (argc < 3) {
-    std::cout << "run with: ./validateGPU <path to input file> <output file>\n";
-    exit(0);
-  }
-
-  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB = nullptr;
-  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE = nullptr;
-  edm::Wrapper<EBUncalibratedRecHitCollection> *wcpuEB = nullptr;
-  edm::Wrapper<EEUncalibratedRecHitCollection> *wcpuEE = nullptr;
-
-  std::string fileName = argv[1];
-  std::string outFileName = argv[2];
-
-  // output
-  TFile rfout{outFileName.c_str(), "recreate"};
-
-  int nbins = 300;
-  float last = 3000.;
-
-  int nbins_chi2 = 1000;
-  float last_chi2 = 1000.;
-
-  int nbins_delta = 201;  // use an odd number to center around 0
-  float delta = 0.2;
-
-  auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last);
-  auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
-  auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
-  auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
-
-  auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
-  auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
-  auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2);
-  auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2);
-
-  auto hSOIAmplitudesEBGPUvsCPU =
-      new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
-  auto hSOIAmplitudesEEGPUvsCPU =
-      new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
-  auto hSOIAmplitudesEBdeltavsCPU =
-      new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  auto hSOIAmplitudesEEdeltavsCPU =
-      new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-
-  auto hChi2EBGPUvsCPU =
-      new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
-  auto hChi2EEGPUvsCPU =
-      new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
-  auto hChi2EBdeltavsCPU =
-      new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
-  auto hChi2EEdeltavsCPU =
-      new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
-
-  // input
-  std::cout << "validating file " << fileName << std::endl;
-  TFile rf{fileName.c_str()};
-  TTree *rt = (TTree *)rf.Get("Events");
-  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalUncalibRecHitProducerGPU_EcalUncalibRecHitsEB_RECO.",
-                       &wgpuEB);
-  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalUncalibRecHitProducerGPU_EcalUncalibRecHitsEE_RECO.",
-                       &wgpuEE);
-  rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB);
-  rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE);
-
-  constexpr float eps_diff = 1e-3;
-
-  // accumulate
-  auto const nentries = rt->GetEntries();
-  std::cout << "#events to validate over: " << nentries << std::endl;
-  for (int ie = 0; ie < nentries; ++ie) {
-    rt->GetEntry(ie);
-
-    const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"};
-    auto cpu_eb_size = wcpuEB->bareProduct().size();
-    auto cpu_ee_size = wcpuEE->bareProduct().size();
-    auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size();
-    auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size();
-    if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
-      std::cerr << ie << ordinal[ie % 10] << " entry:\n"
-                << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size
-                << " (gpu)\n"
-                << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size
-                << " (gpu)" << std::endl;
-      continue;
-    }
-
-    assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size());
-    assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size());
-    auto const neb = wcpuEB->bareProduct().size();
-    auto const nee = wcpuEE->bareProduct().size();
-
-    for (uint32_t i = 0; i < neb; ++i) {
-      auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i];
-      auto const soi_amp_cpu = wcpuEB->bareProduct()[i].amplitude();
-      auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
-      auto const chi2_cpu = wcpuEB->bareProduct()[i].chi2();
-
-      hSOIAmplitudesEBGPU->Fill(soi_amp_gpu);
-      hSOIAmplitudesEBCPU->Fill(soi_amp_cpu);
-      hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-      hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
-      hChi2EBGPU->Fill(chi2_gpu);
-      hChi2EBCPU->Fill(chi2_cpu);
-      hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-      hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
-
-      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
-          std::isnan(chi2_gpu)) {
-        printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-               ie,
-               i,
-               soi_amp_gpu,
-               soi_amp_cpu,
-               chi2_gpu,
-               chi2_cpu);
-        if (std::isnan(chi2_gpu))
-          printf("*** nan ***\n");
-      }
-    }
-
-    for (uint32_t i = 0; i < nee; ++i) {
-      auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i];
-      auto const soi_amp_cpu = wcpuEE->bareProduct()[i].amplitude();
-      auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
-      auto const chi2_cpu = wcpuEE->bareProduct()[i].chi2();
-
-      hSOIAmplitudesEEGPU->Fill(soi_amp_gpu);
-      hSOIAmplitudesEECPU->Fill(soi_amp_cpu);
-      hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-      hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
-      hChi2EEGPU->Fill(chi2_gpu);
-      hChi2EECPU->Fill(chi2_cpu);
-      hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-      hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
-
-      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
-          std::isnan(chi2_gpu)) {
-        printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-               ie,
-               static_cast<int>(neb + i),
-               soi_amp_gpu,
-               soi_amp_cpu,
-               chi2_gpu,
-               chi2_cpu);
-        if (std::isnan(chi2_gpu))
-          printf("*** nan ***\n");
-      }
-    }
-  }
-
-  {
-    TCanvas c("plots", "plots", 4200, 6200);
-    c.Divide(2, 3);
-
-    c.cd(1);
-    gPad->SetLogy();
-    hSOIAmplitudesEBCPU->SetLineColor(kBlack);
-    hSOIAmplitudesEBCPU->SetLineWidth(1.);
-    hSOIAmplitudesEBCPU->Draw("");
-    hSOIAmplitudesEBGPU->SetLineColor(kBlue);
-    hSOIAmplitudesEBGPU->SetLineWidth(1.);
-    hSOIAmplitudesEBGPU->Draw("SAME");
-    c.cd(2);
-    gPad->SetLogy();
-    hSOIAmplitudesEECPU->SetLineColor(kBlack);
-    hSOIAmplitudesEECPU->SetLineWidth(1.);
-    hSOIAmplitudesEECPU->Draw("");
-    hSOIAmplitudesEEGPU->SetLineColor(kBlue);
-    hSOIAmplitudesEEGPU->SetLineWidth(1.);
-    hSOIAmplitudesEEGPU->Draw("SAME");
-    c.cd(3);
-    hSOIAmplitudesEBGPUvsCPU->Draw("COLZ");
-    c.cd(4);
-    hSOIAmplitudesEEGPUvsCPU->Draw("COLZ");
-    c.cd(5);
-    hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
-    c.cd(6);
-    hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
-
-    c.SaveAs("ecal-amplitudes.pdf");
-
-    c.cd(1);
-    gPad->SetLogy();
-    hChi2EBCPU->SetLineColor(kBlack);
-    hChi2EBCPU->SetLineWidth(1.);
-    hChi2EBCPU->Draw("");
-    hChi2EBGPU->SetLineColor(kBlue);
-    hChi2EBGPU->SetLineWidth(1.);
-    hChi2EBGPU->Draw("SAME");
-    c.cd(2);
-    gPad->SetLogy();
-    hChi2EECPU->SetLineColor(kBlack);
-    hChi2EECPU->SetLineWidth(1.);
-    hChi2EECPU->Draw("");
-    hChi2EEGPU->SetLineColor(kBlue);
-    hChi2EEGPU->SetLineWidth(1.);
-    hChi2EEGPU->Draw("SAME");
-    c.cd(3);
-    hChi2EBGPUvsCPU->Draw("COLZ");
-    c.cd(4);
-    hChi2EEGPUvsCPU->Draw("COLZ");
-    c.cd(5);
-    hChi2EBdeltavsCPU->Draw("COLZ");
-    c.cd(6);
-    hChi2EEdeltavsCPU->Draw("COLZ");
-
-    c.SaveAs("ecal-chi2.pdf");
-  }
-
-  rf.Close();
-  rfout.Write();
-  rfout.Close();
-
-  return 0;
-}
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..a336de13b9e7d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
@@ -0,0 +1,261 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <TCanvas.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+
+// Validate the GPU ECAL multifit uncalibrated rechits against the CPU ones.
+//
+// Usage: makeEcalMultifitResultsGpuValidationPlots <path to input file> <output file>
+//
+// For each entry of the "Events" tree, every GPU (SoA) hit is matched by DetId to a CPU
+// hit; amplitude and chi2 of both are histogrammed and hits that differ by eps_diff or
+// more (or have a NaN GPU chi2) are printed. The output ROOT file is written at the end
+// and the summary canvases are saved as ecal-amplitudes.pdf and ecal-chi2.pdf.
+//
+// Returns 0 on success and 1 on a usage error or when a file or branch cannot be read.
+int main(int argc, char *argv[]) {
+    if (argc<3) {
+        std::cerr << "run with: ./makeEcalMultifitResultsGpuValidationPlots <path to input file> <output file>\n";
+        return 1;
+    }
+
+    edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB=nullptr;
+    edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE=nullptr;
+    edm::Wrapper<EBUncalibratedRecHitCollection> *wcpuEB = nullptr;
+    edm::Wrapper<EEUncalibratedRecHitCollection> *wcpuEE = nullptr;
+
+    std::string fileName = argv[1];
+    std::string outFileName = argv[2];
+
+    // output
+    TFile rfout{outFileName.c_str(), "recreate"};
+    if (rfout.IsZombie()) {
+        std::cerr << "could not create the output file " << outFileName << std::endl;
+        return 1;
+    }
+
+    int nbins = 300;
+    float last = 3000.;
+
+    int nbins_chi2 = 1000;
+    float last_chi2 = 1000.;
+
+    int nbins_delta = 201;  // use an odd number to center around 0
+    float delta = 0.2;
+
+    // 1D amplitude spectra, GPU and CPU
+    auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last);
+    auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
+    auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
+    auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
+
+    // 1D chi2 spectra, GPU and CPU
+    auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
+    auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
+    auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2);
+    auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2);
+
+    // amplitudes: GPU vs CPU and (GPU - CPU) vs CPU
+    auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
+    auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
+    auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+    auto hSOIAmplitudesEEdeltavsCPU = new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+
+    // chi2: GPU vs CPU and (GPU - CPU) vs CPU
+    auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+    auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+    auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+    auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+
+    // input
+    std::cout << "validating file " << fileName << std::endl;
+    TFile rf{fileName.c_str()};
+    // rt stays null when the file could not be opened or holds no TTree named "Events"
+    TTree *rt = rf.IsZombie() ? nullptr : dynamic_cast<TTree*>(rf.Get("Events"));
+    if (rt == nullptr) {
+        std::cerr << "could not read the Events tree from " << fileName << std::endl;
+        return 1;
+    }
+    rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB);
+    rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE);
+    rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB);
+    rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE);
+
+    constexpr float eps_diff = 1e-3;
+
+    // accumulate
+    auto const nentries = rt->GetEntries();
+    std::cout << "#events to validate over: " << nentries << std::endl;
+    for (int ie=0; ie<nentries; ++ie) {
+        rt->GetEntry(ie);
+        // a wrapper pointer that is still null here means its branch could not be bound
+        if (!wgpuEB or !wgpuEE or !wcpuEB or !wcpuEE) {
+            std::cerr << "a GPU or CPU rechit branch is missing in " << fileName << std::endl;
+            return 1;
+        }
+
+        // English ordinal suffix: 11, 12 and 13 take "th" regardless of their last digit
+        const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" };
+        auto const suffix = (ie % 100 >= 11 and ie % 100 <= 13) ? "th" : ordinal[ie % 10];
+        auto cpu_eb_size = wcpuEB->bareProduct().size();
+        auto cpu_ee_size = wcpuEE->bareProduct().size();
+        auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size();
+        auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size();
+        if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
+            std::cerr << ie << suffix << " entry:\n"
+                      << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n"
+                      << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl;
+            continue;
+        }
+
+        // the CPU and GPU sizes agree from here on: mismatching entries were skipped above
+        auto const neb = wcpuEB->bareProduct().size();
+        auto const nee = wcpuEE->bareProduct().size();
+
+        for (uint32_t i=0; i<neb; ++i) {
+            // look up the CPU hit that has the same DetId as the i-th GPU hit
+            auto const did_gpu = wgpuEB->bareProduct().did[i];
+            auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i];
+            auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu});
+            if (cpu_iter == wcpuEB->bareProduct().end()) {
+                std::cerr << ie << suffix << " entry\n"
+                          << "  Did not find a DetId " << did_gpu
+                          << " in a CPU collection\n";
+                continue;
+            }
+            auto const soi_amp_cpu = cpu_iter->amplitude();
+            auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
+            auto const chi2_cpu = cpu_iter->chi2();
+
+            hSOIAmplitudesEBGPU->Fill(soi_amp_gpu);
+            hSOIAmplitudesEBCPU->Fill(soi_amp_cpu);
+            hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
+            hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
+            hChi2EBGPU->Fill(chi2_gpu);
+            hChi2EBCPU->Fill(chi2_cpu);
+            hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+            hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+
+            // report hits whose GPU and CPU results differ by eps_diff or more
+            if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
+                (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
+            {
+                printf("EB eventid = %d chid = %u amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+                    ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
+                if (std::isnan(chi2_gpu))
+                  printf("*** nan ***\n");
+            }
+        }
+
+        for (uint32_t i=0; i<nee; ++i) {
+            // look up the CPU hit that has the same DetId as the i-th GPU hit
+            auto const did_gpu = wgpuEE->bareProduct().did[i];
+            auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i];
+            auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu});
+            if (cpu_iter == wcpuEE->bareProduct().end()) {
+                std::cerr << ie << suffix << " entry\n"
+                          << "  did not find a DetId " << did_gpu
+                          << " in a CPU collection\n";
+                continue;
+            }
+            auto const soi_amp_cpu = cpu_iter->amplitude();
+            auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
+            auto const chi2_cpu = cpu_iter->chi2();
+
+            hSOIAmplitudesEEGPU->Fill(soi_amp_gpu);
+            hSOIAmplitudesEECPU->Fill(soi_amp_cpu);
+            hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
+            hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
+            hChi2EEGPU->Fill(chi2_gpu);
+            hChi2EECPU->Fill(chi2_cpu);
+            hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+            hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+
+            // report hits whose GPU and CPU results differ by eps_diff or more
+            if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
+                (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
+            {
+                printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+                    ie, static_cast<int>(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
+                if (std::isnan(chi2_gpu))
+                  printf("*** nan ***\n");
+            }
+        }
+    }
+
+    // Draw a CPU (black) and a GPU (blue) histogram on the current pad with a log y axis and
+    // shift the GPU stats box down by its own height, presumably so it does not hide the CPU one.
+    auto drawCpuGpuOverlay = [](TH1D* hCPU, TH1D* hGPU) {
+        gPad->SetLogy();
+        hCPU->SetLineColor(kBlack);
+        hCPU->SetLineWidth(1.);
+        hCPU->Draw("");
+        hGPU->SetLineColor(kBlue);
+        hGPU->SetLineWidth(1.);
+        hGPU->Draw("sames");
+        gPad->Update();  // the stats box is looked up only after the pad has been updated
+        auto stats = dynamic_cast<TPaveStats*>(hGPU->FindObject("stats"));
+        if (stats == nullptr)  // no stats box to move (e.g. statistics display switched off)
+            return;
+        auto const y2 = stats->GetY2NDC();
+        auto const y1 = stats->GetY1NDC();
+        stats->SetY2NDC(y1);
+        stats->SetY1NDC(y1 - (y2-y1));
+    };
+
+    {
+        TCanvas c("plots", "plots", 4200, 6200);
+        c.Divide(2, 3);
+
+        // first PDF: amplitudes
+        c.cd(1);
+        drawCpuGpuOverlay(hSOIAmplitudesEBCPU, hSOIAmplitudesEBGPU);
+        c.cd(2);
+        drawCpuGpuOverlay(hSOIAmplitudesEECPU, hSOIAmplitudesEEGPU);
+        c.cd(3);
+        hSOIAmplitudesEBGPUvsCPU->Draw("COLZ");
+        c.cd(4);
+        hSOIAmplitudesEEGPUvsCPU->Draw("COLZ");
+        c.cd(5);
+        hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
+        c.cd(6);
+        hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
+
+        c.SaveAs("ecal-amplitudes.pdf");
+
+        // second PDF: chi2, drawn on the same six pads
+        c.cd(1);
+        drawCpuGpuOverlay(hChi2EBCPU, hChi2EBGPU);
+        c.cd(2);
+        drawCpuGpuOverlay(hChi2EECPU, hChi2EEGPU);
+        c.cd(3);
+        hChi2EBGPUvsCPU->Draw("COLZ");
+        c.cd(4);
+        hChi2EEGPUvsCPU->Draw("COLZ");
+        c.cd(5);
+        hChi2EBdeltavsCPU->Draw("COLZ");
+        c.cd(6);
+        hChi2EEdeltavsCPU->Draw("COLZ");
+
+        c.SaveAs("ecal-chi2.pdf");
+    }
+
+    // close the input file, then write and close the output file
+    rf.Close();
+    rfout.Write();
+    rfout.Close();
+
+    return 0;
+}
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
index b997906006a22..5ff32c0bc2259 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
@@ -26,6 +26,8 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
 
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+
 struct EcalPulseShape;
 class EcalSampleMask;
 class EcalTimeBiasCorrections;
@@ -36,220 +38,238 @@ class EcalSamplesCorrelation;
 class EBDigiCollection;
 class EEDigiCollection;
 
-namespace ecal {
-  namespace multifit {
-
-    enum class TimeComputationState : char { NotFinished = 0, Finished = 1 };
-    enum class MinimizationState : char {
-      NotFinished = 0,
-      Finished = 1,
-      Precomputed = 2,
-    };
-
-    // event input data on cpu, just const refs
-    struct EventInputDataCPU {
-      EBDigiCollection const& ebDigis;
-      EEDigiCollection const& eeDigis;
-    };
-
-    //
-    struct EventInputDataGPU {
-      uint16_t* digis;
-      uint32_t* ids;
-
-      void allocate(uint32_t size) {
-        cudaCheck(cudaMalloc((void**)&digis, sizeof(uint16_t) * size * EcalDataFrame::MAXSAMPLES));
-        cudaCheck(cudaMalloc((void**)&ids, sizeof(uint32_t) * size));
-      }
-
-      void deallocate() {
-        cudaCheck(cudaFree(digis));
-        cudaCheck(cudaFree(ids));
-      }
-    };
-
-    // parameters have a fixed type
-    // Can we go by with single precision
-    struct ConfigurationParameters {
-      using type = double;
-      // device ptrs
-      type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr;
-
-      uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE;
-      // device ptrs
-      type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr;
-
-      type timeFitLimitsFirstEB, timeFitLimitsFirstEE;
-      type timeFitLimitsSecondEB, timeFitLimitsSecondEE;
-
-      type timeConstantTermEB, timeConstantTermEE;
-
-      type timeNconstEB, timeNconstEE;
-
-      type amplitudeThreshEE, amplitudeThreshEB;
-
-      type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB;
-      type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE;
-      type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE;
-      type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB;
-
-      std::array<uint32_t, 3> kernelMinimizeThreads;
-
-      bool shouldRunTimingComputation;
-    };
-
-    struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> {
-      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
-        cudaCheck(cudaMalloc((void**)&amplitudesAll, size * sizeof(SampleVector)));
-        cudaCheck(cudaMalloc((void**)&amplitude, size * sizeof(::ecal::reco::StorageScalarType)));
-        cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType)));
-        cudaCheck(cudaMalloc((void**)&pedestal, size * sizeof(::ecal::reco::StorageScalarType)));
+namespace ecal { namespace multifit {
+
+enum class TimeComputationState : char {  // element type of EventDataForScratchGPU::tcState
+    NotFinished = 0,
+    Finished = 1
+};
+enum class MinimizationState : char {  // NOTE(review): presumably the values kept in EventDataForScratchGPU::acState (a char buffer) — confirm
+    NotFinished = 0,
+    Finished = 1,
+    Precomputed = 2,
+};
+
+// input digi collections (EB and EE), held by const reference and not owned here
+struct EventInputDataGPU {
+    ecal::DigisCollection const& ebDigis;
+    ecal::DigisCollection const& eeDigis;
+};
+
+// Configuration parameters; every floating-point value uses the single fixed `type`.
+// TODO: can we get by with single precision?
+struct ConfigurationParameters {
+    using type = double;  // scalar type of all floating-point parameters below
+    // device pointers
+    type *amplitudeFitParametersEB=nullptr, *amplitudeFitParametersEE=nullptr;
+
+    uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE;  // NOTE(review): presumably element counts of timeFitParametersEB/EE — confirm
+    // device pointers
+    type *timeFitParametersEB=nullptr, *timeFitParametersEE=nullptr;
+
+    type timeFitLimitsFirstEB, timeFitLimitsFirstEE;
+    type timeFitLimitsSecondEB, timeFitLimitsSecondEE;
+
+    type timeConstantTermEB, timeConstantTermEE;
+
+    type timeNconstEB, timeNconstEE;
+
+    type amplitudeThreshEE, amplitudeThreshEB;
+
+    type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB;
+    type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE;
+    type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE;
+    type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB;
+
+    std::array<uint32_t, 3> kernelMinimizeThreads;
+
+    bool shouldRunTimingComputation;  // when false, EventOutputDataGPU and EventDataForScratchGPU neither allocate nor free their timing-only buffers
+};
+
+struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> 
+{
+    void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        cudaCheck( cudaMalloc((void**)&amplitudesAll,
+            size * sizeof(SampleVector)) );
+        cudaCheck( cudaMalloc((void**)&amplitude,
+            size * sizeof(::ecal::reco::StorageScalarType)) );
+        cudaCheck( cudaMalloc((void**)&chi2,
+            size * sizeof(::ecal::reco::StorageScalarType)) );
+        cudaCheck( cudaMalloc((void**)&pedestal,
+            size * sizeof(::ecal::reco::StorageScalarType)) );
 
         if (configParameters.shouldRunTimingComputation) {
-          cudaCheck(cudaMalloc((void**)&jitter, size * sizeof(::ecal::reco::StorageScalarType)));
-          cudaCheck(cudaMalloc((void**)&jitterError, size * sizeof(::ecal::reco::StorageScalarType)));
+            cudaCheck( cudaMalloc((void**)&jitter,
+                size * sizeof(::ecal::reco::StorageScalarType)) );
+            cudaCheck( cudaMalloc((void**)&jitterError,
+                size * sizeof(::ecal::reco::StorageScalarType)) );
         }
 
-        cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t)));
-        cudaCheck(cudaMalloc((void**)&flags, size * sizeof(uint32_t)));
-      }
-
-      void deallocate(ConfigurationParameters const& configParameters) {
-        cudaCheck(cudaFree(amplitudesAll));
-        cudaCheck(cudaFree(amplitude));
-        cudaCheck(cudaFree(chi2));
-        cudaCheck(cudaFree(pedestal));
+        cudaCheck( cudaMalloc((void**)&did,
+            size * sizeof(uint32_t)) );
+        cudaCheck( cudaMalloc((void**)&flags,
+            size * sizeof(uint32_t)) );
+    }
+
+    void deallocate(ConfigurationParameters const& configParameters) {
+        cudaCheck( cudaFree(amplitudesAll) );
+        cudaCheck( cudaFree(amplitude) );
+        cudaCheck( cudaFree(chi2) );
+        cudaCheck( cudaFree(pedestal) );
         if (configParameters.shouldRunTimingComputation) {
-          cudaCheck(cudaFree(jitter));
-          cudaCheck(cudaFree(jitterError));
+            cudaCheck( cudaFree(jitter) );
+            cudaCheck( cudaFree(jitterError) );
         }
-        cudaCheck(cudaFree(did));
-        cudaCheck(cudaFree(flags));
-      }
-    };
-
-    struct EventDataForScratchGPU {
-      SampleVector* samples = nullptr;
-      SampleGainVector* gainsNoise = nullptr;
-
-      SampleMatrix* noisecov = nullptr;
-      PulseMatrixType* pulse_matrix = nullptr;
-      FullSampleMatrix* pulse_covariances = nullptr;
-      BXVectorType* activeBXs = nullptr;
-      char* acState = nullptr;
-
-      bool *hasSwitchToGain6 = nullptr, *hasSwitchToGain1 = nullptr, *isSaturated = nullptr;
-
-      SampleVector::Scalar *sample_values, *sample_value_errors;
-      bool* useless_sample_values;
-      SampleVector::Scalar* chi2sNullHypot;
-      SampleVector::Scalar* sum0sNullHypot;
-      SampleVector::Scalar* sumAAsNullHypot;
-      char* pedestal_nums;
-      SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas;
-      SampleVector::Scalar *accTimeMax, *accTimeWgt;
-      SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError;
-      SampleVector::Scalar *timeMax, *timeError;
-      TimeComputationState* tcState;
-
-      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
-        cudaCheck(cudaMalloc((void**)&samples, size * sizeof(SampleVector)));
-        cudaCheck(cudaMalloc((void**)&gainsNoise, size * sizeof(SampleGainVector)));
-
-        cudaCheck(cudaMalloc((void**)&pulse_covariances, size * sizeof(FullSampleMatrix)));
-        cudaCheck(cudaMalloc((void**)&noisecov, size * sizeof(SampleMatrix)));
-        cudaCheck(cudaMalloc((void**)&pulse_matrix, size * sizeof(PulseMatrixType)));
-        cudaCheck(cudaMalloc((void**)&activeBXs, size * sizeof(BXVectorType)));
-        cudaCheck(cudaMalloc((void**)&acState, size * sizeof(char)));
-
-        cudaCheck(cudaMalloc((void**)&hasSwitchToGain6, size * sizeof(bool)));
-        cudaCheck(cudaMalloc((void**)&hasSwitchToGain1, size * sizeof(bool)));
-        cudaCheck(cudaMalloc((void**)&isSaturated, size * sizeof(bool)));
+        cudaCheck( cudaFree(did) );
+        cudaCheck( cudaFree(flags) );
+    }
+};
+
+struct EventDataForScratchGPU {
+    SampleVector *samples = nullptr;
+    SampleGainVector *gainsNoise = nullptr;
+
+    SampleMatrix* noisecov = nullptr;
+    PulseMatrixType *pulse_matrix = nullptr;
+    BXVectorType *activeBXs = nullptr;
+    char *acState = nullptr;
+
+    bool *hasSwitchToGain6=nullptr,
+         *hasSwitchToGain1=nullptr,
+         *isSaturated=nullptr;
+
+    SampleVector::Scalar *sample_values, *sample_value_errors;
+    bool *useless_sample_values;
+    SampleVector::Scalar* chi2sNullHypot;
+    SampleVector::Scalar* sum0sNullHypot;
+    SampleVector::Scalar* sumAAsNullHypot;
+    char* pedestal_nums;
+    SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas;
+    SampleVector::Scalar *accTimeMax, *accTimeWgt;
+    SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError;
+    SampleVector::Scalar *timeMax, *timeError;
+    TimeComputationState *tcState;
+
+    void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        cudaCheck( cudaMalloc((void**)&samples,
+            size * sizeof(SampleVector)) );
+        cudaCheck( cudaMalloc((void**)&gainsNoise,
+            size * sizeof(SampleGainVector)) );
+
+        cudaCheck( cudaMalloc((void**)&noisecov,
+            size * sizeof(SampleMatrix)) );
+        cudaCheck( cudaMalloc((void**)&pulse_matrix,
+            size * sizeof(PulseMatrixType)) );
+        cudaCheck( cudaMalloc((void**)&activeBXs,
+            size * sizeof(BXVectorType)) );
+        cudaCheck( cudaMalloc((void**)&acState,
+            size * sizeof(char)) );
+
+        cudaCheck( cudaMalloc((void**)&hasSwitchToGain6,
+            size * sizeof(bool)) );
+        cudaCheck( cudaMalloc((void**)&hasSwitchToGain1,
+            size * sizeof(bool)) );
+        cudaCheck( cudaMalloc((void**)&isSaturated,
+            size * sizeof(bool)) );
 
         if (configParameters.shouldRunTimingComputation) {
-          cudaCheck(cudaMalloc((void**)&sample_values, size * sizeof(SampleVector)));
-          cudaCheck(cudaMalloc((void**)&sample_value_errors, size * sizeof(SampleVector)));
-          cudaCheck(cudaMalloc((void**)&useless_sample_values, size * sizeof(bool) * EcalDataFrame::MAXSAMPLES));
-          cudaCheck(cudaMalloc((void**)&chi2sNullHypot, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&sum0sNullHypot, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&sumAAsNullHypot, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&pedestal_nums, size * sizeof(char)));
-
-          cudaCheck(cudaMalloc((void**)&tMaxAlphaBetas, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&tMaxErrorAlphaBetas, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&accTimeMax, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&accTimeWgt, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&ampMaxAlphaBeta, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&ampMaxError, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&timeMax, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&timeError, size * sizeof(SampleVector::Scalar)));
-          cudaCheck(cudaMalloc((void**)&tcState, size * sizeof(TimeComputationState)));
+            cudaCheck( cudaMalloc((void**)&sample_values,
+                size * sizeof(SampleVector)) );
+            cudaCheck( cudaMalloc((void**)&sample_value_errors,
+                size * sizeof(SampleVector)) );
+            cudaCheck( cudaMalloc((void**)&useless_sample_values,
+                size * sizeof(bool) * EcalDataFrame::MAXSAMPLES) );
+            cudaCheck( cudaMalloc((void**)&chi2sNullHypot,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&sum0sNullHypot,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&sumAAsNullHypot,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&pedestal_nums,
+                size * sizeof(char)) );
+
+            cudaCheck( cudaMalloc((void**)&tMaxAlphaBetas,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&tMaxErrorAlphaBetas,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&accTimeMax,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&accTimeWgt,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&ampMaxAlphaBeta,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&ampMaxError,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&timeMax,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&timeError,
+                size * sizeof(SampleVector::Scalar)) );
+            cudaCheck( cudaMalloc((void**)&tcState,
+                size * sizeof(TimeComputationState)) );
         }
-      }
+    }
 
-      void deallocate(ConfigurationParameters const& configParameters) {
-        cudaCheck(cudaFree(samples));
-        cudaCheck(cudaFree(gainsNoise));
+    void deallocate(ConfigurationParameters const& configParameters) {
+        cudaCheck( cudaFree(samples) );
+        cudaCheck( cudaFree(gainsNoise) );
 
-        cudaCheck(cudaFree(pulse_covariances));
-        cudaCheck(cudaFree(noisecov));
-        cudaCheck(cudaFree(pulse_matrix));
-        cudaCheck(cudaFree(activeBXs));
-        cudaCheck(cudaFree(acState));
+        cudaCheck( cudaFree(noisecov) );
+        cudaCheck( cudaFree(pulse_matrix) );
+        cudaCheck( cudaFree(activeBXs) );
+        cudaCheck( cudaFree(acState) );
 
-        cudaCheck(cudaFree(hasSwitchToGain6));
-        cudaCheck(cudaFree(hasSwitchToGain1));
-        cudaCheck(cudaFree(isSaturated));
+        cudaCheck( cudaFree(hasSwitchToGain6) );
+        cudaCheck( cudaFree(hasSwitchToGain1) );
+        cudaCheck( cudaFree(isSaturated) );
 
         if (configParameters.shouldRunTimingComputation) {
-          cudaCheck(cudaFree(sample_values));
-          cudaCheck(cudaFree(sample_value_errors));
-          cudaCheck(cudaFree(useless_sample_values));
-          cudaCheck(cudaFree(chi2sNullHypot));
-          cudaCheck(cudaFree(sum0sNullHypot));
-          cudaCheck(cudaFree(sumAAsNullHypot));
-          cudaCheck(cudaFree(pedestal_nums));
-
-          cudaCheck(cudaFree(tMaxAlphaBetas));
-          cudaCheck(cudaFree(tMaxErrorAlphaBetas));
-          cudaCheck(cudaFree(accTimeMax));
-          cudaCheck(cudaFree(accTimeWgt));
-          cudaCheck(cudaFree(ampMaxAlphaBeta));
-          cudaCheck(cudaFree(ampMaxError));
-          cudaCheck(cudaFree(timeMax));
-          cudaCheck(cudaFree(timeError));
-          cudaCheck(cudaFree(tcState));
+            cudaCheck( cudaFree(sample_values) );
+            cudaCheck( cudaFree(sample_value_errors) );
+            cudaCheck( cudaFree(useless_sample_values) );
+            cudaCheck( cudaFree(chi2sNullHypot) );
+            cudaCheck( cudaFree(sum0sNullHypot) );
+            cudaCheck( cudaFree(sumAAsNullHypot) );
+            cudaCheck( cudaFree(pedestal_nums) );
+
+            cudaCheck( cudaFree(tMaxAlphaBetas) );
+            cudaCheck( cudaFree(tMaxErrorAlphaBetas) );
+            cudaCheck( cudaFree(accTimeMax) );
+            cudaCheck( cudaFree(accTimeWgt) );
+            cudaCheck( cudaFree(ampMaxAlphaBeta) );
+            cudaCheck( cudaFree(ampMaxError) );
+            cudaCheck( cudaFree(timeMax) );
+            cudaCheck( cudaFree(timeError) );
+            cudaCheck( cudaFree(tcState) );
         }
-      }
-    };
-
-    // const refs products to conditions
-    struct ConditionsProducts {
-      EcalPedestalsGPU::Product const& pedestals;
-      EcalGainRatiosGPU::Product const& gainRatios;
-      EcalPulseShapesGPU::Product const& pulseShapes;
-      EcalPulseCovariancesGPU::Product const& pulseCovariances;
-      EcalSamplesCorrelationGPU::Product const& samplesCorrelation;
-      EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections;
-      EcalTimeCalibConstantsGPU::Product const& timeCalibConstants;
-      EcalSampleMask const& sampleMask;
-      EcalTimeOffsetConstant const& timeOffsetConstant;
-      uint32_t offsetForHashes;
-    };
-
-    //*/
-
-    struct xyz {
-      int x, y, z;
-    };
-
-    struct conf_data {
-      xyz threads;
-      bool runV1;
-      cudaStream_t cuStream;
-    };
-
-  }  // namespace multifit
-}  // namespace ecal
+    }
+};
+
+// const references to the conditions products used by the kernels; nothing is owned here
+struct ConditionsProducts {
+    EcalPedestalsGPU::Product const& pedestals;
+    EcalGainRatiosGPU::Product const& gainRatios;
+    EcalPulseShapesGPU::Product const& pulseShapes;
+    EcalPulseCovariancesGPU::Product const& pulseCovariances;
+    EcalSamplesCorrelationGPU::Product const& samplesCorrelation;
+    EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections;
+    EcalTimeCalibConstantsGPU::Product const& timeCalibConstants;
+    EcalSampleMask const& sampleMask;
+    EcalTimeOffsetConstant const& timeOffsetConstant;
+    uint32_t offsetForHashes;  // NOTE(review): presumably EcalTimeCalibConstantsGPU::getOffset(), i.e. the number of barrel items — confirm
+};
+
+//*/
+
+struct xyz {  // plain triple of ints; used as the type of conf_data::threads
+    int x,y,z;
+};
+
+struct conf_data {
+    xyz threads;  // NOTE(review): presumably a thread-block shape for kernel launches — confirm
+    bool runV1;
+    cudaStream_t cuStream;  // CUDA stream handle carried alongside the settings above
+};
+
+}}
 
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h
index e268e5d3d5c13..674695e472ec1 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h
@@ -8,38 +8,37 @@
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
-#include <cuda_runtime.h>
-
 class EcalGainRatiosGPU {
 public:
-  struct Product {
-    ~Product();
-    float *gain12Over6 = nullptr, *gain6Over1 = nullptr;
-  };
+    struct Product {
+        ~Product();
+        float *gain12Over6=nullptr, *gain6Over1=nullptr;
+    };
 
 #ifndef __CUDACC__
 
-  // rearrange pedestals
-  EcalGainRatiosGPU(EcalGainRatios const&);
+    // rearrange gain ratios
+    EcalGainRatiosGPU(EcalGainRatios const&);
 
-  // will call dealloation for Product thru ~Product
-  ~EcalGainRatiosGPU() = default;
+    // will call deallocation for Product through ~Product
+    ~EcalGainRatiosGPU() = default;
 
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
 
-  //
-  static std::string name() { return std::string{"ecalGainRatiosGPU"}; }
+    // 
+    static std::string name() { return std::string{"ecalGainRatiosGPU"}; }
 
 private:
-  // in the future, we need to arrange so to avoid this copy on the host
-  // store eb first then ee
-  std::vector<float, CUDAHostAllocator<float>> gain12Over6_;
-  std::vector<float, CUDAHostAllocator<float>> gain6Over1_;
+    // in the future, we need to arrange so to avoid this copy on the host
+    // store eb first then ee
+    std::vector<float, CUDAHostAllocator<float>> gain12Over6_;
+    std::vector<float, CUDAHostAllocator<float>> gain6Over1_;
 
-  cms::cuda::ESProduct<Product> product_;
+    cms::cuda::ESProduct<Product> product_;
 
 #endif
 };
 
+
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h
index 420697dea6bda..419b7273afa6d 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h
@@ -8,41 +8,39 @@
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
-#include <cuda_runtime.h>
-
 class EcalPedestalsGPU {
 public:
-  struct Product {
-    ~Product();
-    float *mean_x12 = nullptr, *mean_x6 = nullptr, *mean_x1 = nullptr;
-    float *rms_x12 = nullptr, *rms_x6 = nullptr, *rms_x1 = nullptr;
-  };
+    struct Product {
+        ~Product();
+        float *mean_x12=nullptr, *mean_x6=nullptr, *mean_x1=nullptr;
+        float *rms_x12=nullptr, *rms_x6=nullptr, *rms_x1=nullptr;
+    };
 
 #ifndef __CUDACC__
 
-  // rearrange pedestals
-  EcalPedestalsGPU(EcalPedestals const &);
+    // rearrange pedestals
+    EcalPedestalsGPU(EcalPedestals const&);
 
-  // will call dealloation for Product thru ~Product
-  ~EcalPedestalsGPU() = default;
+    // will call deallocation for Product through ~Product
+    ~EcalPedestalsGPU() = default;
 
-  // get device pointers
-  Product const &getProduct(cudaStream_t) const;
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
 
-  //
-  static std::string name() { return std::string{"ecalPedestalsGPU"}; }
+    // 
+    static std::string name() { return std::string{"ecalPedestalsGPU"}; }
 
 private:
-  // in the future, we need to arrange so to avoid this copy on the host
-  // store eb first then ee
-  std::vector<float, CUDAHostAllocator<float>> mean_x12_;
-  std::vector<float, CUDAHostAllocator<float>> rms_x12_;
-  std::vector<float, CUDAHostAllocator<float>> mean_x6_;
-  std::vector<float, CUDAHostAllocator<float>> rms_x6_;
-  std::vector<float, CUDAHostAllocator<float>> mean_x1_;
-  std::vector<float, CUDAHostAllocator<float>> rms_x1_;
-
-  cms::cuda::ESProduct<Product> product_;
+    // in the future, we need to arrange so to avoid this copy on the host
+    // store eb first then ee
+    std::vector<float, CUDAHostAllocator<float>> mean_x12_;
+    std::vector<float, CUDAHostAllocator<float>> rms_x12_;
+    std::vector<float, CUDAHostAllocator<float>> mean_x6_;
+    std::vector<float, CUDAHostAllocator<float>> rms_x6_;
+    std::vector<float, CUDAHostAllocator<float>> mean_x1_;
+    std::vector<float, CUDAHostAllocator<float>> rms_x1_;
+
+    cms::cuda::ESProduct<Product> product_;
 #endif
 };
 
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h
index b5b9271f6e65e..0a3df41e8b85e 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h
@@ -8,35 +8,34 @@
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
-#include <cuda_runtime.h>
-
 class EcalPulseCovariancesGPU {
 public:
-  struct Product {
-    ~Product();
-    EcalPulseCovariance* values = nullptr;
-  };
+    struct Product {
+        ~Product();
+        EcalPulseCovariance *values=nullptr;
+    };
 
 #ifndef __CUDACC__
-  // rearrange pedestals
-  EcalPulseCovariancesGPU(EcalPulseCovariances const&);
+    // construct from the EcalPulseCovariances conditions
+    EcalPulseCovariancesGPU(EcalPulseCovariances const&);
 
-  // will call dealloation for Product thru ~Product
-  ~EcalPulseCovariancesGPU() = default;
+    // will call deallocation for Product through ~Product
+    ~EcalPulseCovariancesGPU() = default;
 
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
 
-  //
-  static std::string name() { return std::string{"ecalPulseCovariancesGPU"}; }
+    // 
+    static std::string name() { return std::string{"ecalPulseCovariancesGPU"}; }
 
 private:
-  // reuse original vectors (although with default allocator)
-  std::vector<EcalPulseCovariance> const& valuesEB_;
-  std::vector<EcalPulseCovariance> const& valuesEE_;
+    // reuse original vectors (although with default allocator)
+    std::vector<EcalPulseCovariance> const& valuesEB_;
+    std::vector<EcalPulseCovariance> const& valuesEE_;
 
-  cms::cuda::ESProduct<Product> product_;
+    cms::cuda::ESProduct<Product> product_;
 #endif
 };
 
+
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h
index 88893b626ce05..4fddcf24aac32 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h
@@ -8,35 +8,34 @@
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
-#include <cuda_runtime.h>
-
 class EcalPulseShapesGPU {
 public:
-  struct Product {
-    ~Product();
-    EcalPulseShape* values = nullptr;
-  };
+    struct Product {
+        ~Product();
+        EcalPulseShape *values=nullptr;
+    };
 
 #ifndef __CUDACC__
-  // rearrange pedestals
-  EcalPulseShapesGPU(EcalPulseShapes const&);
+    // construct from the EcalPulseShapes conditions
+    EcalPulseShapesGPU(EcalPulseShapes const&);
 
-  // will call dealloation for Product thru ~Product
-  ~EcalPulseShapesGPU() = default;
+    // will call deallocation for Product through ~Product
+    ~EcalPulseShapesGPU() = default;
 
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
 
-  //
-  static std::string name() { return std::string{"ecalPulseShapesGPU"}; }
+    // 
+    static std::string name() { return std::string{"ecalPulseShapesGPU"}; }
 
 private:
-  // reuse original vectors (although with default allocator)
-  std::vector<EcalPulseShape> const& valuesEB_;
-  std::vector<EcalPulseShape> const& valuesEE_;
+    // reuse original vectors (although with default allocator)
+    std::vector<EcalPulseShape> const& valuesEB_;
+    std::vector<EcalPulseShape> const& valuesEE_;
 
-  cms::cuda::ESProduct<Product> product_;
+    cms::cuda::ESProduct<Product> product_;
 #endif
 };
 
+
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h
index dac1ee041bfc5..3ae409a18e74c 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h
@@ -8,39 +8,42 @@
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
-#include <cuda_runtime.h>
-
 class EcalSamplesCorrelationGPU {
 public:
-  struct Product {
-    ~Product();
-    double *EBG12SamplesCorrelation = nullptr, *EBG6SamplesCorrelation = nullptr, *EBG1SamplesCorrelation = nullptr;
-    double *EEG12SamplesCorrelation = nullptr, *EEG6SamplesCorrelation = nullptr, *EEG1SamplesCorrelation = nullptr;
-  };
+    struct Product {
+        ~Product();
+        double *EBG12SamplesCorrelation=nullptr,
+               *EBG6SamplesCorrelation=nullptr,
+               *EBG1SamplesCorrelation=nullptr;
+        double *EEG12SamplesCorrelation=nullptr,
+               *EEG6SamplesCorrelation=nullptr,
+               *EEG1SamplesCorrelation=nullptr;
+    };
 
 #ifndef __CUDACC__
-  // rearrange pedestals
-  EcalSamplesCorrelationGPU(EcalSamplesCorrelation const&);
+    // construct from the EcalSamplesCorrelation conditions
+    EcalSamplesCorrelationGPU(EcalSamplesCorrelation const&);
 
-  // will call dealloation for Product thru ~Product
-  ~EcalSamplesCorrelationGPU() = default;
+    // will call deallocation for Product through ~Product
+    ~EcalSamplesCorrelationGPU() = default;
 
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
 
-  //
-  static std::string name() { return std::string{"ecalSamplesCorrelationGPU"}; }
+    // 
+    static std::string name() { return std::string{"ecalSamplesCorrelationGPU"}; }
 
 private:
-  std::vector<double> const& EBG12SamplesCorrelation_;
-  std::vector<double> const& EBG6SamplesCorrelation_;
-  std::vector<double> const& EBG1SamplesCorrelation_;
-  std::vector<double> const& EEG12SamplesCorrelation_;
-  std::vector<double> const& EEG6SamplesCorrelation_;
-  std::vector<double> const& EEG1SamplesCorrelation_;
-
-  cms::cuda::ESProduct<Product> product_;
+    std::vector<double> const& EBG12SamplesCorrelation_;
+    std::vector<double> const& EBG6SamplesCorrelation_;
+    std::vector<double> const& EBG1SamplesCorrelation_;
+    std::vector<double> const& EEG12SamplesCorrelation_;
+    std::vector<double> const& EEG6SamplesCorrelation_;
+    std::vector<double> const& EEG1SamplesCorrelation_;
+
+    cms::cuda::ESProduct<Product> product_;
 #endif
 };
 
+
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h
index 70af33b52f216..cbabea3351eb8 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h
@@ -8,44 +8,45 @@
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
-#include <cuda_runtime.h>
-
 class EcalTimeBiasCorrectionsGPU {
 public:
-  struct Product {
-    ~Product();
-    float *EBTimeCorrAmplitudeBins, *EBTimeCorrShiftBins;
-    float *EETimeCorrAmplitudeBins, *EETimeCorrShiftBins;
-    int EBTimeCorrAmplitudeBinsSize, EETimeCorrAmplitudeBinsSize;
-  };
+    struct Product {
+        ~Product();
+        float *EBTimeCorrAmplitudeBins, *EBTimeCorrShiftBins;
+        float *EETimeCorrAmplitudeBins, *EETimeCorrShiftBins;
+        int EBTimeCorrAmplitudeBinsSize, EETimeCorrAmplitudeBinsSize;
+    };
 
-  // rearrange pedestals
-  EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const&);
+    // construct from the EcalTimeBiasCorrections conditions
+    EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const&);
 
 #ifndef __CUDACC__
 
-  // will call dealloation for Product thru ~Product
-  ~EcalTimeBiasCorrectionsGPU() = default;
+    // will call deallocation for Product through ~Product
+    ~EcalTimeBiasCorrectionsGPU() = default;
 
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
 
-  //
-  static std::string name() { return std::string{"ecalTimeBiasCorrectionsGPU"}; }
+    // 
+    static std::string name() { return std::string{"ecalTimeBiasCorrectionsGPU"}; }
 #endif
 
-  std::vector<float> const& EBTimeCorrAmplitudeBins() const { return EBTimeCorrAmplitudeBins_; }
-  std::vector<float> const& EETimeCorrAmplitudeBins() const { return EETimeCorrAmplitudeBins_; }
+    std::vector<float> const& EBTimeCorrAmplitudeBins() const
+    { return EBTimeCorrAmplitudeBins_; }
+    std::vector<float> const& EETimeCorrAmplitudeBins() const 
+    { return EETimeCorrAmplitudeBins_; }
 
 private:
-  std::vector<float> const& EBTimeCorrAmplitudeBins_;
-  std::vector<float> const& EBTimeCorrShiftBins_;
-  std::vector<float> const& EETimeCorrAmplitudeBins_;
-  std::vector<float> const& EETimeCorrShiftBins_;
+    std::vector<float> const& EBTimeCorrAmplitudeBins_;
+    std::vector<float> const& EBTimeCorrShiftBins_;
+    std::vector<float> const& EETimeCorrAmplitudeBins_;
+    std::vector<float> const& EETimeCorrShiftBins_;
 
 #ifndef __CUDACC__
-  cms::cuda::ESProduct<Product> product_;
+    cms::cuda::ESProduct<Product> product_;
 #endif
 };
 
+
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h
index fd640e7c989b3..f82f4d5a0530f 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h
@@ -8,38 +8,37 @@
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
-#include <cuda_runtime.h>
-
 class EcalTimeCalibConstantsGPU {
 public:
-  struct Product {
-    ~Product();
-    float* values = nullptr;
-  };
+    struct Product {
+        ~Product();
+        float *values=nullptr;
+    };
 
 #ifndef __CUDACC__
-  // rearrange pedestals
-  EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const&);
+    // construct from EcalTimeCalibConstants
+    EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const&);
 
-  // will call dealloation for Product thru ~Product
-  ~EcalTimeCalibConstantsGPU() = default;
+    // will call deallocation for Product through ~Product
+    ~EcalTimeCalibConstantsGPU() = default;
 
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
+    // get device pointers
+    Product const& getProduct(cudaStream_t) const;
 
-  // TODO: do this centrally
-  // get offset for hashes. equals number of barrel items
-  uint32_t getOffset() const { return valuesEB_.size(); }
+    // TODO: do this centrally
+    // get offset for hashes. equals number of barrel items
+    uint32_t getOffset() const { return valuesEB_.size(); }
 
-  //
-  static std::string name() { return std::string{"ecalTimeCalibConstantsGPU"}; }
+    // 
+    static std::string name() { return std::string{"ecalTimeCalibConstantsGPU"}; }
 
 private:
-  std::vector<float> const& valuesEB_;
-  std::vector<float> const& valuesEE_;
+    std::vector<float> const& valuesEB_;
+    std::vector<float> const& valuesEE_;
 
-  cms::cuda::ESProduct<Product> product_;
+    cms::cuda::ESProduct<Product> product_;
 #endif
 };
 
+
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h
index 424a6e612c2c1..04193663f1e37 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h
@@ -3,22 +3,18 @@
 
 #include <vector>
 
-#include <cuda_runtime.h>
+#include <cuda.h>
 
 #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
 
-namespace ecal {
-  namespace multifit {
+namespace ecal { namespace multifit {
 
-    void entryPoint(EventInputDataCPU const&,
-                    EventInputDataGPU&,
-                    EventOutputDataGPU&,
-                    EventDataForScratchGPU&,
-                    ConditionsProducts const&,
-                    ConfigurationParameters const&,
-                    cudaStream_t);
+void entryPoint(
+        EventInputDataGPU const&,
+        EventOutputDataGPU&, EventDataForScratchGPU&,
+        ConditionsProducts const&, ConfigurationParameters const&,
+        cudaStream_t);
 
-  }
-}  // namespace ecal
+}}
 
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h
index b162f9b1c9784..d769f65ed0735 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h
@@ -6,43 +6,40 @@
 
 #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
 
-namespace ecal {
-  namespace multifit {
-
-    constexpr int SampleVectorSize = 10;
-    constexpr int FullSampleVectorSize = 19;
-    constexpr int PulseVectorSize = 12;
-    constexpr int NGains = 3;
-
-    using data_type = ::ecal::reco::ComputationScalarType;
-
-    typedef Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize> PulseMatrixType;
-    typedef Eigen::Matrix<char, SampleVectorSize, 1> BXVectorType;
-    using SampleMatrixD = Eigen::Matrix<double, SampleVectorSize, SampleVectorSize>;
-
-    typedef Eigen::Matrix<data_type, SampleVectorSize, 1> SampleVector;
-    typedef Eigen::Matrix<data_type, FullSampleVectorSize, 1> FullSampleVector;
-    typedef Eigen::Matrix<data_type, Eigen::Dynamic, 1, 0, PulseVectorSize, 1> PulseVector;
-    typedef Eigen::Matrix<char, Eigen::Dynamic, 1, 0, PulseVectorSize, 1> BXVector;
-    typedef Eigen::Matrix<char, SampleVectorSize, 1> SampleGainVector;
-    typedef Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize> SampleMatrix;
-    typedef Eigen::Matrix<data_type, FullSampleVectorSize, FullSampleVectorSize> FullSampleMatrix;
-    typedef Eigen::Matrix<data_type, Eigen::Dynamic, Eigen::Dynamic, 0, PulseVectorSize, PulseVectorSize> PulseMatrix;
-    typedef Eigen::Matrix<data_type, SampleVectorSize, Eigen::Dynamic, 0, SampleVectorSize, PulseVectorSize>
-        SamplePulseMatrix;
-    typedef Eigen::LLT<SampleMatrix> SampleDecompLLT;
-    typedef Eigen::LLT<SampleMatrixD> SampleDecompLLTD;
-    typedef Eigen::LLT<PulseMatrix> PulseDecompLLT;
-    typedef Eigen::LDLT<PulseMatrix> PulseDecompLDLT;
-
-    typedef Eigen::Matrix<data_type, 1, 1> SingleMatrix;
-    typedef Eigen::Matrix<data_type, 1, 1> SingleVector;
-
-    typedef std::array<SampleMatrixD, NGains> SampleMatrixGainArray;
-
-    using PermutationMatrix = Eigen::PermutationMatrix<SampleMatrix::RowsAtCompileTime>;
-
-  }  // namespace multifit
-}  // namespace ecal
+namespace ecal { namespace multifit {
+
+constexpr int SampleVectorSize = 10;
+constexpr int FullSampleVectorSize = 19;
+constexpr int PulseVectorSize = 12;
+constexpr int NGains = 3;
+
+using data_type = ::ecal::reco::ComputationScalarType;
+
+typedef Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize> PulseMatrixType;
+typedef Eigen::Matrix<char, SampleVectorSize, 1> BXVectorType;
+using SampleMatrixD = Eigen::Matrix<double,SampleVectorSize,SampleVectorSize>;
+
+typedef Eigen::Matrix<data_type,SampleVectorSize,1> SampleVector;
+typedef Eigen::Matrix<data_type,FullSampleVectorSize,1> FullSampleVector;
+typedef Eigen::Matrix<data_type,Eigen::Dynamic,1,0,PulseVectorSize,1> PulseVector;
+typedef Eigen::Matrix<char,Eigen::Dynamic,1,0,PulseVectorSize,1> BXVector;
+typedef Eigen::Matrix<char, SampleVectorSize,1> SampleGainVector;
+typedef Eigen::Matrix<data_type,SampleVectorSize,SampleVectorSize> SampleMatrix;
+typedef Eigen::Matrix<data_type,FullSampleVectorSize,FullSampleVectorSize> FullSampleMatrix;
+typedef Eigen::Matrix<data_type,Eigen::Dynamic,Eigen::Dynamic,0,PulseVectorSize,PulseVectorSize> PulseMatrix;
+typedef Eigen::Matrix<data_type,SampleVectorSize,Eigen::Dynamic,0,SampleVectorSize,PulseVectorSize> SamplePulseMatrix;
+typedef Eigen::LLT<SampleMatrix> SampleDecompLLT;
+typedef Eigen::LLT<SampleMatrixD> SampleDecompLLTD;
+typedef Eigen::LLT<PulseMatrix> PulseDecompLLT;
+typedef Eigen::LDLT<PulseMatrix> PulseDecompLDLT;
+
+typedef Eigen::Matrix<data_type,1,1> SingleMatrix;
+typedef Eigen::Matrix<data_type,1,1> SingleVector;
+
+typedef std::array<SampleMatrixD,NGains> SampleMatrixGainArray;
+
+using PermutationMatrix = Eigen::PermutationMatrix<SampleMatrix::RowsAtCompileTime>;
+
+}}
 
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
index bc2b1300123dd..83a3e2b39ed0b 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
@@ -16,68 +16,95 @@
 #include "inplace_fnnls.h"
 #include "KernelHelpers.h"
 
-namespace ecal {
-  namespace multifit {
-
-    ///
-    /// assume kernel launch configuration is
-    /// (MAXSAMPLES * nchannels, blocks)
-    /// TODO: is there a point to split this kernel further to separate reductions
-    ///
-    __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
-                                                  uint16_t const* digis_in,
-                                                  uint32_t const* dids,
-                                                  SampleVector* amplitudes,
-                                                  SampleVector* amplitudesForMinimization,
-                                                  SampleGainVector* gainsNoise,
-                                                  float const* mean_x1,
-                                                  float const* mean_x12,
-                                                  float const* rms_x12,
-                                                  float const* mean_x6,
-                                                  float const* gain6Over1,
-                                                  float const* gain12Over6,
-                                                  bool* hasSwitchToGain6,
-                                                  bool* hasSwitchToGain1,
-                                                  bool* isSaturated,
-                                                  ::ecal::reco::StorageScalarType* energies,
-                                                  ::ecal::reco::StorageScalarType* chi2,
-                                                  ::ecal::reco::StorageScalarType* g_pedestal,
-                                                  uint32_t* flags,
-                                                  char* acState,
-                                                  BXVectorType* bxs,
-                                                  uint32_t const offsetForHashes,
-                                                  bool const gainSwitchUseMaxSampleEB,
-                                                  bool const gainSwitchUseMaxSampleEE,
-                                                  int const nchannels) {
-      constexpr bool dynamicPedestal = false;  //---- default to false, ok
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-      constexpr int sample_max = 5;
-      constexpr int full_pulse_max = 9;
-      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
-      int const nchannels_per_block = blockDim.x / nsamples;
-      int const total_threads = nchannels * nsamples;
-      int const ch = tx / nsamples;
-      int const sample = threadIdx.x % nsamples;
-
-      if (ch < nchannels) {
+namespace ecal { namespace multifit {
+
+///
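+/// assume kernel launch configuration is
+/// (MAXSAMPLES * nchannels, blocks), i.e. one thread per (channel, sample);
+/// blockDim.x is presumably a multiple of MAXSAMPLES (see ch/sample below)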
+__global__
+void kernel_prep_1d_and_initialize(
+                    EcalPulseShape const* shapes_in,
+                    uint16_t const* digis_in_eb,
+                    uint32_t const* dids_eb,
+                    uint16_t const* digis_in_ee,
+                    uint32_t const* dids_ee,
+                    SampleVector* amplitudes,
+                    SampleVector* amplitudesForMinimization,
+                    SampleGainVector* gainsNoise,
+                    float const* mean_x1,
+                    float const* mean_x12,
+                    float const* rms_x12,
+                    float const* mean_x6,
+                    float const* gain6Over1,
+                    float const* gain12Over6,
+                    bool* hasSwitchToGain6,
+                    bool* hasSwitchToGain1,
+                    bool* isSaturated,
+                    ::ecal::reco::StorageScalarType* energies,
+                    ::ecal::reco::StorageScalarType* chi2,
+                    ::ecal::reco::StorageScalarType* g_pedestal,
+                    uint32_t *dids_out,
+                    uint32_t *flags,
+                    char* acState,
+                    BXVectorType *bxs,
+                    uint32_t const offsetForHashes,
+                    uint32_t const offsetForInputs,
+                    bool const gainSwitchUseMaxSampleEB,
+                    bool const gainSwitchUseMaxSampleEE,
+                    int const nchannels) {
+    constexpr bool dynamicPedestal = false;  //---- default to false, ok
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+    constexpr int sample_max = 5;
+    constexpr int full_pulse_max = 9;
+    int const tx = threadIdx.x + blockIdx.x*blockDim.x;
+    int const nchannels_per_block = blockDim.x / nsamples;
+    int const total_threads = nchannels * nsamples;
+    int const ch = tx / nsamples;
+    // for accessing input arrays; NOTE(review): the literal 10 below presumably means nsamples
+    int const inputCh = ch >= offsetForInputs
+        ? ch - offsetForInputs
+        : ch;
+    int const inputTx = ch >= offsetForInputs
+        ? tx - offsetForInputs*10
+        : tx;
+    // eb is first and then ee
+    auto const* digis_in = ch >= offsetForInputs
+        ? digis_in_ee
+        : digis_in_eb;
+    auto const* dids = ch >= offsetForInputs
+        ? dids_ee
+        : dids_eb;
+    int const sample = threadIdx.x % nsamples;
+
+    if (ch < nchannels) {
         // array of 10 x channels per block
         // TODO: any other way of doing simple reduction
         // assume bool is 1 byte, should be quite safe
         extern __shared__ char shared_mem[];
-        bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
-        bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples;
-        bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples;
-        bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples;
-        bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples;
-        char* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples;
+        bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(
+            shared_mem);
+        bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + 
+            nchannels_per_block*nsamples;
+        bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + 
+            nchannels_per_block*nsamples;
+        bool* shr_isSaturated = shr_hasSwitchToGain0 + 
+            nchannels_per_block*nsamples;
+        bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + 
+            nchannels_per_block*nsamples;
+        char* shr_counts = reinterpret_cast<char*>(
+            shr_hasSwitchToGain0_tmp) + nchannels_per_block*nsamples;
 
         //
         // indices
         //
-        auto const did = DetId{dids[ch]};
+        auto const did = DetId{dids[inputCh]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
         // TODO offset for ee, 0 for eb
-        auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+        auto const hashedId = isBarrel
+            ? hashedIndexEB(did.rawId())
+            : offsetForHashes + hashedIndexEE(did.rawId());
+
 
         //
         // pulse shape template
@@ -86,15 +113,15 @@ namespace ecal {
             isample+=nsamples)
             shapes_out[ch](isample + 7) = shapes_in[hashedId].pdfval[isample];
             */
-
+        
         // will be used in the future for setting state
         auto const rmsForChecking = rms_x12[hashedId];
 
         //
         // amplitudes
         //
-        int const adc = ecal::mgpa::adc(digis_in[tx]);
-        int const gainId = ecal::mgpa::gainId(digis_in[tx]);
+        int const adc = ecal::mgpa::adc(digis_in[inputTx]);
+        int const gainId = ecal::mgpa::gainId(digis_in[inputTx]);
         SampleVector::Scalar amplitude = 0.;
         SampleVector::Scalar pedestal = 0.;
         SampleVector::Scalar gainratio = 0.;
@@ -106,12 +133,13 @@ namespace ecal {
         shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x];
         shr_counts[threadIdx.x] = 0;
         __syncthreads();
-
+        
         // non-divergent branch (except for the last 4 threads)
-        if (threadIdx.x <= blockDim.x - 5) {
-#pragma unroll
-          for (int i = 0; i < 5; i++)
-            shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i];
+        if (threadIdx.x<=blockDim.x-5) {
+            #pragma unroll
+            for (int i=0; i<5; i++)
+                shr_counts[threadIdx.x] += 
+                    shr_hasSwitchToGain0[threadIdx.x+i];
         }
         shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5;
 
@@ -120,89 +148,102 @@ namespace ecal {
         // TODO
         //
         if (sample < 5) {
-          shr_hasSwitchToGain6[threadIdx.x] =
-              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5];
-          shr_hasSwitchToGain1[threadIdx.x] =
-              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5];
-
-          // duplication of hasSwitchToGain0 in order not to
-          // introduce another syncthreads
-          shr_hasSwitchToGain0_tmp[threadIdx.x] =
-              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5];
+            shr_hasSwitchToGain6[threadIdx.x] = 
+                shr_hasSwitchToGain6[threadIdx.x] ||
+                shr_hasSwitchToGain6[threadIdx.x + 5];
+            shr_hasSwitchToGain1[threadIdx.x] =
+                shr_hasSwitchToGain1[threadIdx.x] ||
+                shr_hasSwitchToGain1[threadIdx.x + 5];
+            
+            // duplication of hasSwitchToGain0 in order not to
+            // introduce another syncthreads
+            shr_hasSwitchToGain0_tmp[threadIdx.x] = 
+                shr_hasSwitchToGain0_tmp[threadIdx.x] || 
+                shr_hasSwitchToGain0_tmp[threadIdx.x+5];
         }
         __syncthreads();
-
-        if (sample < 2) {
-          // note, both threads per channel take value [3] twice to avoid another if
-          shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] ||
-                                              shr_hasSwitchToGain6[threadIdx.x + 2] ||
-                                              shr_hasSwitchToGain6[threadIdx.x + 3];
-          shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] ||
-                                              shr_hasSwitchToGain1[threadIdx.x + 2] ||
-                                              shr_hasSwitchToGain1[threadIdx.x + 3];
-
-          shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] ||
-                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 2] ||
-                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 3];
-
-          // sample < 2 -> first 2 threads of each channel will be used here
-          // => 0 -> will compare 3 and 4 and put into 0
-          // => 1 -> will compare 4 and 5 and put into 1
-          shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4];
+        
+        if (sample<2) {
+            // note, both threads per channel take value [3] twice to avoid another if
+            shr_hasSwitchToGain6[threadIdx.x] = 
+                shr_hasSwitchToGain6[threadIdx.x] ||
+                shr_hasSwitchToGain6[threadIdx.x+2] || 
+                shr_hasSwitchToGain6[threadIdx.x+3];
+            shr_hasSwitchToGain1[threadIdx.x] =
+                shr_hasSwitchToGain1[threadIdx.x] ||
+                shr_hasSwitchToGain1[threadIdx.x+2] || 
+                shr_hasSwitchToGain1[threadIdx.x+3];
+
+            shr_hasSwitchToGain0_tmp[threadIdx.x] = 
+                shr_hasSwitchToGain0_tmp[threadIdx.x] ||
+                shr_hasSwitchToGain0_tmp[threadIdx.x+2] || 
+                shr_hasSwitchToGain0_tmp[threadIdx.x+3];
+
+            // sample < 2 -> first 2 threads of each channel will be used here
+            // => 0 -> will compare 3 and 4 and put into 0
+            // => 1 -> will compare 4 and 5 and put into 1
+            shr_isSaturated[threadIdx.x] = 
+                shr_isSaturated[threadIdx.x+3] || shr_isSaturated[threadIdx.x+4];
         }
         __syncthreads();
 
         bool check_hasSwitchToGain0 = false;
 
-        if (sample == 0) {
-          shr_hasSwitchToGain6[threadIdx.x] =
-              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1];
-          shr_hasSwitchToGain1[threadIdx.x] =
-              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1];
-          shr_hasSwitchToGain0_tmp[threadIdx.x] =
-              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1];
-
-          hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
-          hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
-
-          // set only for the threadIdx.x corresponding to sample==0
-          check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
-
-          shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1];
-          isSaturated[ch] = shr_isSaturated[threadIdx.x + 3];
+        if (sample==0) {
+            shr_hasSwitchToGain6[threadIdx.x] = 
+                shr_hasSwitchToGain6[threadIdx.x] || 
+                shr_hasSwitchToGain6[threadIdx.x+1];
+            shr_hasSwitchToGain1[threadIdx.x] = 
+                shr_hasSwitchToGain1[threadIdx.x] ||
+                shr_hasSwitchToGain1[threadIdx.x+1];
+            shr_hasSwitchToGain0_tmp[threadIdx.x] =
+                shr_hasSwitchToGain0_tmp[threadIdx.x] ||
+                shr_hasSwitchToGain0_tmp[threadIdx.x+1];
+
+            hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
+            hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
+
+            // set only for the threadIdx.x corresponding to sample==0
+            check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
+
+            shr_isSaturated[threadIdx.x+3] = 
+                shr_isSaturated[threadIdx.x] || 
+                shr_isSaturated[threadIdx.x+1];
+            isSaturated[ch] = shr_isSaturated[threadIdx.x+3];
         }
 
         // TODO: w/o this sync, there is a race
         // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample
         // check if we can remove it
         __syncthreads();
-
+        
         // TODO: divergent branch
-        if (gainId == 0 || gainId == 3) {
-          pedestal = mean_x1[hashedId];
-          gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
-          gainsNoise[ch](sample) = 2;
-        } else if (gainId == 1) {
-          pedestal = mean_x12[hashedId];
-          gainratio = 1.;
-          gainsNoise[ch](sample) = 0;
-        } else if (gainId == 2) {
-          pedestal = mean_x6[hashedId];
-          gainratio = gain12Over6[hashedId];
-          gainsNoise[ch](sample) = 1;
+        if (gainId==0 || gainId==3) {
+            pedestal = mean_x1[hashedId];
+            gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
+            gainsNoise[ch](sample) = 2;
+        } else if (gainId==1) {
+            pedestal = mean_x12[hashedId];
+            gainratio = 1.;
+            gainsNoise[ch](sample) = 0;
+        } else if (gainId==2) {
+            pedestal = mean_x6[hashedId];
+            gainratio = gain12Over6[hashedId];
+            gainsNoise[ch](sample)  = 1;
         }
-
+        
         // TODO: compile time constant -> branch should be non-divergent
         if (dynamicPedestal)
-          amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
+            amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
         else
-          amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
+            amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
         amplitudes[ch][sample] = amplitude;
 
 #ifdef ECAL_RECO_CUDA_DEBUG
-        printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio);
-        if (adc == 0)
-          printf("adc is zero\n");
+        printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude,
+            pedestal, gainratio);
+        if (adc==0)
+            printf("adc is zero\n");
 #endif
 
         //
@@ -211,289 +252,325 @@ namespace ecal {
         amplitudesForMinimization[ch](sample) = 0;
         bxs[ch](sample) = sample - 5;
 
-        // select the thread for the max sample
+        // select the thread for the max sample 
         //---> hardcoded above to be 5th sample, ok
         if (sample == sample_max) {
-          //
-          // initialization
-          //
-          acState[ch] = static_cast<char>(MinimizationState::NotFinished);
-          energies[ch] = 0;
-          chi2[ch] = 0;
-          g_pedestal[ch] = 0;
-          uint32_t flag = 0;
-
-          // start of this channel in shared mem
-          int const chStart = threadIdx.x - sample_max;
-          // thread for the max sample in shared mem
-          int const threadMax = threadIdx.x;
-          auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
-
-          // this flag setting is applied to all of the cases
-          if (shr_hasSwitchToGain6[chStart])
-            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
-          if (shr_hasSwitchToGain1[chStart])
-            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
-
-          // this corresponds to cpu branching on lastSampleBeforeSaturation
-          // likely false
-          if (check_hasSwitchToGain0) {
-            // assign for the case some sample having gainId == 0
-            //energies[ch] = amplitudes[ch][sample_max];
-            energies[ch] = amplitude;
-
-            // check if samples before sample_max have true
-            bool saturated_before_max = false;
-#pragma unroll
-            for (char ii = 0; ii < 5; ii++)
-              saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
-
-            // if saturation is in the max sample and not in the first 5
-            if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
-              energies[ch] = 49140;  // 4095 * 12
-                                     //---- AM FIXME : no pedestal subtraction???
-                                     //It should be "(4095. - pedestal) * gainratio"
-
-            // set state flag to terminate further processing of this channel
-            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
-            flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
+            //
+            // initialization
+            //
+            acState[ch] = static_cast<char>(MinimizationState::NotFinished);
+            energies[ch] = 0;
+            chi2[ch] = 0;
+            g_pedestal[ch] = 0;
+            uint32_t flag = 0;
+            dids_out[ch] = did.rawId();
+
+            // start of this channel in shared mem
+            int const chStart = threadIdx.x - sample_max;
+            // thread for the max sample in shared mem
+            int const threadMax = threadIdx.x;
+            auto const gainSwitchUseMaxSample = isBarrel
+                ? gainSwitchUseMaxSampleEB
+                : gainSwitchUseMaxSampleEE;
+            
+            // this flag setting is applied to all of the cases
+            if (shr_hasSwitchToGain6[chStart])
+                flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
+            if (shr_hasSwitchToGain1[chStart])
+                flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
+
+            // this corresponds to cpu branching on lastSampleBeforeSaturation
+            // likely false
+            if (check_hasSwitchToGain0) {
+                // assign for the case some sample having gainId == 0
+                //energies[ch] = amplitudes[ch][sample_max];
+                energies[ch] = amplitude;
+
+                // check if samples before sample_max have true
+                bool saturated_before_max = false;
+                #pragma unroll
+                for (char ii=0; ii<5; ii++)
+                    saturated_before_max = saturated_before_max ||
+                        shr_hasSwitchToGain0[chStart + ii];
+
+                // if saturation is in the max sample and not in the first 5
+                if (!saturated_before_max && 
+                    shr_hasSwitchToGain0[threadMax])
+                    energies[ch] = 49140; // 4095 * 12
+                    //---- AM FIXME : no pedestal subtraction???  
+                    //It should be "(4095. - pedestal) * gainratio"
+
+                // set state flag to terminate further processing of this channel
+                acState[ch] = static_cast<char>(MinimizationState::Precomputed); 
+                flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
+                flags[ch] = flag;
+                return;
+            }
+
+            // according to cpu version
+//            auto max_amplitude = amplitudes[ch][sample_max]; 
+            auto const max_amplitude = amplitude;
+            // according to cpu version
+            auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max-7]; 
+            // note, no syncing as the same thread will be accessing here
+            bool hasGainSwitch = shr_hasSwitchToGain6[chStart]
+                || shr_hasSwitchToGain1[chStart]
+                || shr_isSaturated[chStart+3];
+
+            // pedestal is final unconditionally
+            g_pedestal[ch] = pedestal;
+            if (hasGainSwitch && gainSwitchUseMaxSample) {
+                // thread for sample=0 will access the right guys
+                energies[ch] = max_amplitude / shape_value;
+                acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+                flags[ch] = flag;
+                return;
+            }
+            
+            // this happens because rms_x12 is sometimes 0...
+            // it still needs to be checked why this is the case
+            // general case here is that noisecov is a Zero matrix
+            if (rmsForChecking == 0) {
+                acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+                flags[ch] = flag;
+                return;
+            }
+
+            // for the case when no shortcuts were taken
             flags[ch] = flag;
-            return;
-          }
-
-          // according to cpu version
-          //            auto max_amplitude = amplitudes[ch][sample_max];
-          auto const max_amplitude = amplitude;
-          // according to cpu version
-          auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7];
-          // note, no syncing as the same thread will be accessing here
-          bool hasGainSwitch =
-              shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
-
-          // pedestal is final unconditionally
-          g_pedestal[ch] = pedestal;
-          if (hasGainSwitch && gainSwitchUseMaxSample) {
-            // thread for sample=0 will access the right guys
-            energies[ch] = max_amplitude / shape_value;
-            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
-            flags[ch] = flag;
-            return;
-          }
-
-          // this happens cause sometimes rms_x12 is 0...
-          // needs to be checkec why this is the case
-          // general case here is that noisecov is a Zero matrix
-          if (rmsForChecking == 0) {
-            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
-            flags[ch] = flag;
-            return;
-          }
-
-          // for the case when no shortcuts were taken
-          flags[ch] = flag;
         }
-      }
     }
+}
 
-    ///
-    /// assume kernel launch configuration is
-    /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
-    ///
-    __global__ void kernel_prep_2d(EcalPulseCovariance const* pulse_cov_in,
-                                   FullSampleMatrix* pulse_cov_out,
-                                   SampleGainVector const* gainNoise,
-                                   uint32_t const* dids,
-                                   float const* rms_x12,
-                                   float const* rms_x6,
-                                   float const* rms_x1,
-                                   float const* gain12Over6,
-                                   float const* gain6Over1,
-                                   double const* G12SamplesCorrelationEB,
-                                   double const* G6SamplesCorrelationEB,
-                                   double const* G1SamplesCorrelationEB,
-                                   double const* G12SamplesCorrelationEE,
-                                   double const* G6SamplesCorrelationEE,
-                                   double const* G1SamplesCorrelationEE,
-                                   SampleMatrix* noisecov,
-                                   PulseMatrixType* pulse_matrix,
-                                   EcalPulseShape const* pulse_shape,
-                                   bool const* hasSwitchToGain6,
-                                   bool const* hasSwitchToGain1,
-                                   bool const* isSaturated,
-                                   uint32_t const offsetForHashes) {
-      int ch = blockIdx.x;
-      int tx = threadIdx.x;
-      int ty = threadIdx.y;
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-      constexpr float addPedestalUncertainty = 0.f;
-      constexpr bool dynamicPedestal = false;
-      constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
-      constexpr int template_samples = EcalPulseShape::TEMPLATESAMPLES;
-
-      bool tmp0 = hasSwitchToGain6[ch];
-      bool tmp1 = hasSwitchToGain1[ch];
-      auto const did = DetId{dids[ch]};
-      auto const isBarrel = did.subdetId() == EcalBarrel;
-      auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
-      auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
-      auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
-      auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
-      bool tmp2 = isSaturated[ch];
-      bool hasGainSwitch = tmp0 || tmp1 || tmp2;
-      auto const vidx = ecal::abs(ty - tx);
-
-      // only ty == 0 and 1 will go for a second iteration
-      for (int iy = ty; iy < template_samples; iy += nsamples)
-        for (int ix = tx; ix < template_samples; ix += nsamples)
-          pulse_cov_out[ch](iy + 7, ix + 7) = pulse_cov_in[hashedId].covval[iy][ix];
-
-      // non-divergent branch for all threads per block
-      if (hasGainSwitch) {
+/// Fills noisecov[ch](ty, tx) and pulse_matrix[ch](ty, tx); each thread writes one element of each.
+/// assume kernel launch configuration is
+/// ([MAXSAMPLES, MAXSAMPLES], nchannels): blockIdx.x is the channel, (threadIdx.y, threadIdx.x) the element
+/// channels with ch >= offsetForInputs take their DetId from dids_ee[ch - offsetForInputs], the others from dids_eb[ch]
+__global__
+void kernel_prep_2d(SampleGainVector const* gainNoise,
+                    uint32_t const* dids_eb,
+                    uint32_t const* dids_ee,
+                    float const* rms_x12,
+                    float const* rms_x6,
+                    float const* rms_x1,
+                    float const* gain12Over6,
+                    float const* gain6Over1,
+                    double const* G12SamplesCorrelationEB,
+                    double const* G6SamplesCorrelationEB,
+                    double const* G1SamplesCorrelationEB,
+                    double const* G12SamplesCorrelationEE,
+                    double const* G6SamplesCorrelationEE,
+                    double const* G1SamplesCorrelationEE,
+                    SampleMatrix* noisecov,
+                    PulseMatrixType* pulse_matrix,
+                    EcalPulseShape const* pulse_shape,
+                    bool const* hasSwitchToGain6,
+                    bool const* hasSwitchToGain1,
+                    bool const* isSaturated,
+                    uint32_t const offsetForHashes,
+                    uint32_t const offsetForInputs) {
+    int const ch = blockIdx.x;
+    int const tx = threadIdx.x;
+    int const ty = threadIdx.y;
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+    constexpr float addPedestalUncertainty = 0.f;
+    constexpr bool dynamicPedestal = false;
+    constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
+    constexpr int template_samples = EcalPulseShape::TEMPLATESAMPLES;
+
+    // only the DetId arrays are indexed with inputCh; the per-channel buffers below keep using ch
+    int const inputCh = ch >= offsetForInputs
+        ? ch - offsetForInputs
+        : ch;
+    auto const* dids = ch >= offsetForInputs
+        ? dids_ee
+        : dids_eb;
+
+    bool tmp0 = hasSwitchToGain6[ch];
+    bool tmp1 = hasSwitchToGain1[ch];
+    auto const did = DetId{dids[inputCh]};
+    auto const isBarrel = did.subdetId() == EcalBarrel;
+    auto const hashedId = isBarrel
+        ? hashedIndexEB(did.rawId())
+        : offsetForHashes + hashedIndexEE(did.rawId());
+    auto const G12SamplesCorrelation = isBarrel
+        ? G12SamplesCorrelationEB
+        : G12SamplesCorrelationEE;
+    auto const* G6SamplesCorrelation = isBarrel
+        ? G6SamplesCorrelationEB
+        : G6SamplesCorrelationEE;
+    auto const* G1SamplesCorrelation = isBarrel
+        ? G1SamplesCorrelationEB
+        : G1SamplesCorrelationEE;
+    bool tmp2 = isSaturated[ch];
+    bool hasGainSwitch = tmp0 || tmp1 || tmp2;
+    auto const vidx = ecal::abs(ty - tx);
+
+    // non-divergent branch: hasGainSwitch depends only on ch, i.e. it is identical for all threads of the block
+    if (hasGainSwitch) {
         // TODO: did not include simplified noise model
         float noise_value = 0;
 
         // non-divergent branch - all threads per block
-        // TODO: all of these constants indicate that
-        // that these parts could be splitted into completely different
+        // TODO: all of these constants indicate that
+        // these parts could be split into completely different
         // kernels and run one of them only depending on the config
         if (simplifiedNoiseModelForGainSwitch) {
-          int isample_max = 5;  // according to cpu defs
-          int gainidx = gainNoise[ch][isample_max];
-
-          // non-divergent branches
-          if (gainidx == 0)
-            //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx);
-            noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx];
-          if (gainidx == 1)
-            //                noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch]
-            //                    *noisecorrs[1](ty, tx);
-            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
-                          G6SamplesCorrelation[vidx];
-          if (gainidx == 2)
-            //                noise_value = gain12Over6[ch]*gain12Over6[ch]
-            //                    * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch]
-            //                    * noisecorrs[2](ty, tx);
-            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] *
-                          rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx];
-          if (!dynamicPedestal && addPedestalUncertainty > 0.f)
-            noise_value += addPedestalUncertainty * addPedestalUncertainty;
+            int isample_max = 5; // according to cpu defs
+            int gainidx = gainNoise[ch][isample_max];
+
+            // non-divergent branches
+            if (gainidx==0)
+                //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx);
+                noise_value = rms_x12[hashedId]*rms_x12[hashedId]
+                    * G12SamplesCorrelation[vidx];
+            if (gainidx==1) 
+//                noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch]
+//                    *noisecorrs[1](ty, tx);
+                noise_value = gain12Over6[hashedId]*gain12Over6[hashedId] 
+                    * rms_x6[hashedId]*rms_x6[hashedId]
+                    * G6SamplesCorrelation[vidx];
+            if (gainidx==2)
+//                noise_value = gain12Over6[ch]*gain12Over6[ch]
+//                    * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch]
+//                    * noisecorrs[2](ty, tx);
+                noise_value = gain12Over6[hashedId]*gain12Over6[hashedId]
+                    * gain6Over1[hashedId]*gain6Over1[hashedId] 
+                    * rms_x1[hashedId]*rms_x1[hashedId]
+                    * G1SamplesCorrelation[vidx];
+            if (!dynamicPedestal && addPedestalUncertainty>0.f)
+                noise_value += addPedestalUncertainty*addPedestalUncertainty;
         } else {
-          int gainidx = 0;
-          char mask = gainidx;
-          int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
-          //            noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch]
-          //                *pedestal*noisecorrs[0](ty, tx);
-          noise_value +=
-              /* gainratio is 1*/ rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
-          // non-divergent branch
-          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
-            noise_value += /* gainratio is 1 */
-                addPedestalUncertainty * addPedestalUncertainty * pedestal;
-          }
-
-          //
-          gainidx = 1;
-          mask = gainidx;
-          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
-          //            noise_value += gain12Over6[ch]*gain12Over6[ch]
-          //                *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx);
-          noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
-                         pedestal * G6SamplesCorrelation[vidx];
-          // non-divergent branch
-          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
-            noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty *
-                           addPedestalUncertainty * pedestal;
-          }
-
-          //
-          gainidx = 2;
-          mask = gainidx;
-          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
-          float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
-          //            noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch]
-          //                *pedestal*noisecorrs[2](ty, tx);
-          noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx];
-          // non-divergent branch
-          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
-            noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal;
-          }
+            int gainidx=0;
+            char mask = gainidx;
+            int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+//            noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch]
+//                *pedestal*noisecorrs[0](ty, tx);
+            noise_value += /* gainratio is 1*/ rms_x12[hashedId]*rms_x12[hashedId]
+                * pedestal* G12SamplesCorrelation[vidx];
+            // non-divergent branch
+            if (!dynamicPedestal && addPedestalUncertainty>0.f) {
+                noise_value += /* gainratio is 1 */
+                    addPedestalUncertainty*addPedestalUncertainty*pedestal;
+            }
+
+            //
+            gainidx=1;
+            mask = gainidx;
+            pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+//            noise_value += gain12Over6[ch]*gain12Over6[ch]
+//                *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx);
+            noise_value += gain12Over6[hashedId]*gain12Over6[hashedId]
+                *rms_x6[hashedId]*rms_x6[hashedId]*pedestal
+                * G6SamplesCorrelation[vidx];
+            // non-divergent branch
+            if (!dynamicPedestal && addPedestalUncertainty>0.f) {
+                noise_value += gain12Over6[hashedId]*gain12Over6[hashedId]
+                    *addPedestalUncertainty*addPedestalUncertainty
+                    *pedestal;
+            }
+            
+            //
+            gainidx=2;
+            mask = gainidx;
+            pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+            float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
+//            noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch]
+//                *pedestal*noisecorrs[2](ty, tx);
+            noise_value += tmp*tmp * rms_x1[hashedId]*rms_x1[hashedId]
+                *pedestal* G1SamplesCorrelation[vidx];
+            // non-divergent branch
+            if (!dynamicPedestal && addPedestalUncertainty>0.f) {
+                noise_value += tmp*tmp * addPedestalUncertainty*addPedestalUncertainty
+                    * pedestal;
+            }
         }
 
         noisecov[ch](ty, tx) = noise_value;
-      } else {
+    } else {
         auto rms = rms_x12[hashedId];
-        float noise_value = rms * rms * G12SamplesCorrelation[vidx];
-        if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
-          //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
-          noise_value += addPedestalUncertainty * addPedestalUncertainty;
+        float noise_value = rms*rms * G12SamplesCorrelation[vidx];
+        if (!dynamicPedestal && addPedestalUncertainty>0.f) {
+            //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
+            noise_value += addPedestalUncertainty*addPedestalUncertainty;
         }
         noisecov[ch](ty, tx) = noise_value;
-      }
-
-      // pulse matrix
-      //    int const bx = tx - 5; // -5 -4 -3 ... 3 4
-      //    int bx = (*bxs)(tx);
-      //    int const offset = 7 - 3 - bx;
-      int const posToAccess = 9 - tx + ty;  // see cpu for reference
-      float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0;
-      pulse_matrix[ch](ty, tx) = value;
     }
 
-    __global__ void kernel_permute_results(SampleVector* amplitudes,
-                                           BXVectorType const* activeBXs,
-                                           ::ecal::reco::StorageScalarType* energies,
-                                           char const* acState,
-                                           int const nchannels) {
-      // constants
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
-      int const ch = tx / nsamples;
-      int const iii = tx % nsamples;  // this is to address activeBXs
-
-      if (ch >= nchannels)
+    // pulse matrix: shape value at position 9 - tx + ty, zero for positions below 7
+//    int const bx = tx - 5; // -5 -4 -3 ... 3 4
+//    int bx = (*bxs)(tx);
+//    int const offset = 7 - 3 - bx;
+    int const posToAccess = 9 - tx + ty; // see cpu for reference
+    float const value = posToAccess>=7 
+        ? pulse_shape[hashedId].pdfval[posToAccess-7]
+        : 0;
+    pulse_matrix[ch](ty, tx) = value;
+}
+
+/// Moves each fitted amplitude from its fit slot iii to sample activeBXs[ch](iii) + 5.
+/// One thread per (channel, slot); needs blockDim.x * sizeof(SampleVector::Scalar) bytes of dynamic shared memory.
+__global__
+void kernel_permute_results(
+        SampleVector *amplitudes,
+        BXVectorType const*activeBXs,
+        ::ecal::reco::StorageScalarType *energies,
+        char const* acState,
+        int const nchannels) {
+    // constants
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices: flat thread id -> (channel, slot within the channel)
+    int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int const ch = tx / nsamples;
+    int const iii = tx % nsamples; // this is to address activeBXs
+
+    if (ch >= nchannels) return;
+    // channels that have amplitude precomputed do not need results to be permuted
+    // (acState[ch] is read once and reused for the check)
+    auto const state = static_cast<MinimizationState>(acState[ch]);
+    if (state == MinimizationState::Precomputed)
         return;
 
-      // channels that have amplitude precomputed do not need results to be permuted
-      auto const state = static_cast<MinimizationState>(acState[ch]);
-      if (static_cast<MinimizationState>(acState[ch]) == MinimizationState::Precomputed)
-        return;
+    // stage this thread's amplitude in dynamic shared memory (each thread only touches values[threadIdx.x])
+    extern __shared__ char smem[];
+    SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(smem);
+    values[threadIdx.x] = amplitudes[ch](iii);
+    // block-wide barrier between the loads above and the stores below; NOTE(review): threads that returned early skip it
+    __syncthreads();
 
-      // configure shared memory and cp into it
-      extern __shared__ char smem[];
-      SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(smem);
-      values[threadIdx.x] = amplitudes[ch](iii);
-      __syncthreads();
+    // get the sample for this bx
+    auto const sample = static_cast<int>(activeBXs[ch](iii)) + 5;
 
-      // get the sample for this bx
-      auto const sample = static_cast<int>(activeBXs[ch](iii)) + 5;
+    // store back to global
+    amplitudes[ch](sample) = values[threadIdx.x];
 
-      // store back to global
-      amplitudes[ch](sample) = values[threadIdx.x];
-
-      // store sample 5 separately
-      // only for the case when minimization was performed
-      // not for cases with precomputed amplitudes
-      if (sample == 5)
+    // store sample 5 separately
+    // only for the case when minimization was performed
+    // not for cases with precomputed amplitudes
+    if (sample == 5)
         energies[ch] = values[threadIdx.x];
-    }
+}
 
 ///
 /// Build an Ecal RecHit.
 /// TODO: Use SoA data structures on the host directly
-/// the reason for removing this from minimize kernel is to isolate the minimize +
+/// the reason for removing this from minimize kernel is to isolate the minimize + 
 /// again, building an aos rec hit involves strides... -> bad memory access pattern
 ///
 #ifdef RUN_BUILD_AOS_RECHIT
-    __global__ void kernel_build_rechit(
-        float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels) {
-      int idx = threadIdx.x + blockDim.x * blockIdx.x;
-      if (idx < nchannels) {
-        rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], 0, 0, chi2s[idx], 0};
-      }
+__global__ void kernel_build_rechit(float const* energies,
+                                    float const* chi2s,
+                                    uint32_t* dids,
+                                    EcalUncalibratedRecHit* rechits,
+                                    int nchannels) {
+    // one thread per channel; threads past nchannels exit without writing
+    int const channel = blockIdx.x * blockDim.x + threadIdx.x;
+    if (channel >= nchannels) {
+        return;
     }
+    // assemble the AoS rec hit from the per-channel SoA values
+    rechits[channel] = EcalUncalibratedRecHit{dids[channel], energies[channel], 0, 0, chi2s[channel], 0};
+}
 #endif
 
-  }  // namespace multifit
-}  // namespace ecal
+}}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h
index 6a3bc9ac43795..4b01e056fe0a8 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h
@@ -6,89 +6,100 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
 
 class EcalPulseShape;
-// this flag setting is applied to all of the cases
+            // this flag setting is applied to all of the cases
 class EcalPulseCovariance;
 class EcalUncalibratedRecHit;
 
-namespace ecal {
-  namespace multifit {
+namespace ecal { namespace multifit {
 
-    ///
-    /// assume kernel launch configuration is
-    /// (MAXSAMPLES * nchannels, blocks)
-    /// TODO: is there a point to split this kernel further to separate reductions
-    ///
-    __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
-                                                  uint16_t const* digis_in,
-                                                  uint32_t const* dids,
-                                                  SampleVector* amplitudes,
-                                                  SampleVector* amplitudesForMinimization,
-                                                  SampleGainVector* gainsNoise,
-                                                  float const* mean_x1,
-                                                  float const* mean_x12,
-                                                  float const* rms_x12,
-                                                  float const* mean_x6,
-                                                  float const* gain6Over1,
-                                                  float const* gain12Over6,
-                                                  bool* hasSwitchToGain6,
-                                                  bool* hasSwitchToGain1,
-                                                  bool* isSaturated,
-                                                  ::ecal::reco::StorageScalarType* energies,
-                                                  ::ecal::reco::StorageScalarType* chi2,
-                                                  ::ecal::reco::StorageScalarType* pedestal,
-                                                  uint32_t* flags,
-                                                  char* acState,
-                                                  BXVectorType* bxs,
-                                                  uint32_t offsetForHashes,
-                                                  bool const gainSwitchUseMaxSampleEB,
-                                                  bool const gainSwitchUseMaxSampleEE,
-                                                  int const nchannels);
+///
+/// assume kernel launch configuration is 
+/// (MAXSAMPLES * nchannels, blocks)
+/// TODO: is there a point to split this kernel further to separate reductions
+/// 
+__global__
+void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+                    uint16_t const* digis_in_eb,
+                    uint32_t const* dids_eb,
+                    uint16_t const* digis_in_ee,
+                    uint32_t const* dids_ee,
+                    SampleVector* amplitudes,
+                    SampleVector* amplitudesForMinimization,
+                    SampleGainVector* gainsNoise,
+                    float const* mean_x1,
+                    float const* mean_x12,
+                    float const* rms_x12,
+                    float const* mean_x6,
+                    float const* gain6Over1,
+                    float const* gain12Over6,
+                    bool* hasSwitchToGain6,
+                    bool* hasSwitchToGain1,
+                    bool* isSaturated,
+                    ::ecal::reco::StorageScalarType* energies,
+                    ::ecal::reco::StorageScalarType* chi2,
+                    ::ecal::reco::StorageScalarType* pedestal,
+                    uint32_t *dids_out,
+                    uint32_t *flags,
+                    char* acState,
+                    BXVectorType *bxs,
+                    uint32_t const offsetForHashes,
+                    uint32_t const offsetForInputs,
+                    bool const gainSwitchUseMaxSampleEB,
+                    bool const gainSwitchUseMaxSampleEE,
+                    int const nchannels);
 
-    ///
-    /// assume kernel launch configuration is
-    /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
-    ///
-    __global__ void kernel_prep_2d(EcalPulseCovariance const* pulse_cov_in,
-                                   FullSampleMatrix* pulse_cov_out,
-                                   SampleGainVector const* gainNoise,
-                                   uint32_t const* dids,
-                                   float const* rms_x12,
-                                   float const* rms_x6,
-                                   float const* rms_x1,
-                                   float const* gain12Over6,
-                                   float const* gain6Over1,
-                                   double const* G12SamplesCorrelationEB,
-                                   double const* G6SamplesCorrelationEB,
-                                   double const* G1SamplesCorrelationEB,
-                                   double const* G12SamplesCorrelationEE,
-                                   double const* G6SamplesCorrelationEE,
-                                   double const* G1SamplesCorrelationEE,
-                                   SampleMatrix* noisecov,
-                                   PulseMatrixType* pulse_matrix,
-                                   EcalPulseShape const* pulse_shape,
-                                   bool const* hasSwitchToGain6,
-                                   bool const* hasSwitchToGain1,
-                                   bool const* isSaturated,
-                                   uint32_t const offsetForHashes);
+///
+/// assume kernel launch configuration is
+/// ([MAXSAMPLES, MAXSAMPLES], nchannels): threads per block, then number of blocks (one block per channel)
+///
+__global__
+void kernel_prep_2d(SampleGainVector const* gainNoise,
+                    uint32_t const* dids_eb,
+                    uint32_t const* dids_ee,
+                    float const* rms_x12,
+                    float const* rms_x6,
+                    float const* rms_x1,
+                    float const* gain12Over6,
+                    float const* gain6Over1,
+                    double const* G12SamplesCorrelationEB,
+                    double const* G6SamplesCorrelationEB,
+                    double const* G1SamplesCorrelationEB,
+                    double const* G12SamplesCorrelationEE,
+                    double const* G6SamplesCorrelationEE,
+                    double const* G1SamplesCorrelationEE,
+                    SampleMatrix* noisecov,
+                    PulseMatrixType* pulse_matrix,
+                    EcalPulseShape const* pulse_shape,
+                    bool const* hasSwitchToGain6,
+                    bool const* hasSwitchToGain1,
+                    bool const* isSaturated,
+                    uint32_t const offsetForHashes,
+                    uint32_t const offsetForInputs);
 
-    __global__ void kernel_permute_results(SampleVector* amplitudes,
-                                           BXVectorType const* activeBXs,
-                                           ::ecal::reco::StorageScalarType* energies,
-                                           char const* acState,
-                                           int const nchannels);
+__global__
+void kernel_permute_results(
+        SampleVector *amplitudes,
+        BXVectorType const* activeBXs,
+        ::ecal::reco::StorageScalarType *energies,
+        char const* acState,
+        int const nchannels);
 
 ///
 /// Build an Ecal RecHit.
 /// TODO: Use SoA data structures on the host directly
-/// the reason for removing this from minimize kernel is to isolate the minimize +
+/// the reason for removing this from minimize kernel is to isolate the minimize + 
 /// again, building an aos rec hit involves strides... -> bad memory access pattern
 ///
 #ifdef RUN_BUILD_AOS_RECHIT
-    __global__ void kernel_build_rechit(
-        float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels);
+__global__
+void kernel_build_rechit(
+    float const* energies,
+    float const* chi2s,
+    uint32_t* dids,
+    EcalUncalibratedRecHit* rechits,
+    int nchannels);
 #endif
 
-  }  // namespace multifit
-}  // namespace ecal
+}}
 
-#endif  // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationCommonKernels
+#endif // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationCommonKernels
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
new file mode 100644
index 0000000000000..fb6b396089151
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
@@ -0,0 +1,425 @@
+#include <iostream>
+#include <limits>
+
+#include "cuda.h"
+
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+
+#include "inplace_fnnls.h"
+#include "KernelHelpers.h"
+#include "AmplitudeComputationKernels.h"
+#include "AmplitudeComputationCommonKernels.h"
+
+namespace ecal { namespace multifit {
+
+/// Solve the leading N x N block of `mat` against the first N entries of
+/// `invec` with an LDLT factorisation and store the solution in the first N
+/// entries of `outvec`; the remaining entries of `outvec` are left as they are.
+template<int N>
+void eigen_solve_leading_block(SampleMatrix& mat,
+                               SampleVector& invec,
+                               SampleVector& outvec) {
+    // copy the corner into a fixed-size matrix before decomposing it
+    Eigen::Matrix<SampleMatrix::Scalar, N, N> corner = mat.topLeftCorner<N, N>();
+    outvec.head<N>() = corner.ldlt().solve(invec.head<N>());
+}
+
+///
+/// Solve the top-left NP x NP sub-system of `mat` for `invec`.
+///
+/// \param mat    matrix whose leading NP x NP block is used
+/// \param invec  right-hand side; only the first NP entries are read
+/// \param outvec receives the solution in its first NP entries
+/// \param NP     size of the sub-system, 1..10; any other value is a no-op
+///
+/// The fixed-size Eigen types need the size at compile time, hence the switch.
+void eigen_solve_submatrix(SampleMatrix& mat,
+                           SampleVector& invec,
+                           SampleVector& outvec, unsigned NP) {
+    switch (NP) { // pulse matrix is always square.
+    case 10:
+        eigen_solve_leading_block<10>(mat, invec, outvec);
+        break;
+    case 9:
+        eigen_solve_leading_block<9>(mat, invec, outvec);
+        break;
+    case 8:
+        eigen_solve_leading_block<8>(mat, invec, outvec);
+        break;
+    case 7:
+        eigen_solve_leading_block<7>(mat, invec, outvec);
+        break;
+    case 6:
+        eigen_solve_leading_block<6>(mat, invec, outvec);
+        break;
+    case 5:
+        eigen_solve_leading_block<5>(mat, invec, outvec);
+        break;
+    case 4:
+        eigen_solve_leading_block<4>(mat, invec, outvec);
+        break;
+    case 3:
+        eigen_solve_leading_block<3>(mat, invec, outvec);
+        break;
+    case 2:
+        eigen_solve_leading_block<2>(mat, invec, outvec);
+        break;
+    case 1:
+        eigen_solve_leading_block<1>(mat, invec, outvec);
+        break;
+    default:
+        return;
+    }
+}
+
+/// Add amplitude^2 * pulse covariance to `inverse_cov` for every non-zero
+/// amplitude. Only entries with row >= col are touched; always returns true.
+template<typename MatrixType>
+__device__ __forceinline__
+bool update_covariance(
+        EcalPulseCovariance const& pulse_covariance,
+        MatrixType& inverse_cov,
+        SampleVector const& amplitudes) {
+    constexpr int nsamples = SampleVector::RowsAtCompileTime;
+    constexpr int npulses = BXVectorType::RowsAtCompileTime;
+
+    // signed index: avoids unsigned wraparound in `ipulse - 5` below
+    #pragma unroll
+    for (int ipulse=0; ipulse<npulses; ipulse++) {
+        auto const amplitude = amplitudes.coeff(ipulse);
+        if (amplitude == 0)
+            continue;
+
+        // FIXME: ipulse - 5 -> ipulse - firstOffset
+        int const bx = ipulse - 5;
+        int const first_sample_t = std::max(0, bx+3);  // first sample updated
+        int const offset = -3 - bx;  // sample index -> covval index
+        auto const value_sq = amplitude * amplitude;
+
+        for (int col=first_sample_t; col<nsamples; col++) {
+            for (int row=col; row<nsamples; row++) {
+                inverse_cov(row, col) += value_sq *
+                    __ldg(&pulse_covariance.covval[row + offset][col + offset]);
+            }
+        }
+    }
+
+    return true;
+}
+
+/// One thread per channel (idx < nchannels); needs dynamic shared memory of
+/// 2 * blockDim.x * MapSymM<DataType, NPULSES>::total * sizeof(DataType) bytes.
+/// TODO: trivial impl for now, there must be a way to improve
+///
+/// Conventions:
+///   - amplitudes -> solution vector, what we are fitting for
+///   - samples -> raw detector responses
+///   - passive constraint - satisfied constraint
+///   - active constraint - unsatisfied (yet) constraint
+///
+__global__
+void kernel_minimize(
+        uint32_t const* dids_eb,
+        uint32_t const* dids_ee,
+        SampleMatrix const* __restrict__ noisecov,
+        EcalPulseCovariance const* __restrict__ pulse_covariance,
+        BXVectorType *bxs, // not referenced in this kernel body
+        SampleVector const* __restrict__ samples,
+        SampleVector* amplitudes,
+        PulseMatrixType const* __restrict__ pulse_matrix, 
+        ::ecal::reco::StorageScalarType* chi2s,
+        ::ecal::reco::StorageScalarType* energies,
+        char *acState,
+        int nchannels,
+        int max_iterations,
+        uint32_t const offsetForHashes,
+        uint32_t const offsetForInputs) {
+    // FIXME: ecal has 10 samples and 10 pulses....
+    // but this needs to be properly treated and renamed everywhere
+    constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
+    constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime;
+    static_assert(NSAMPLES == NPULSES);
+
+    using DataType = SampleVector::Scalar;
+
+    extern __shared__ char shrmem[]; // size is set by the launch configuration
+    DataType *shrMatrixLForFnnlsStorage = 
+        reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * threadIdx.x;
+    DataType *shrAtAStorage = 
+        reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * (
+            threadIdx.x + blockDim.x); // second per-thread slice, placed after all first slices
+
+    // FIXME: remove either idx or ch -> they are the same thing
+    int idx = threadIdx.x + blockDim.x*blockIdx.x;
+    auto const ch = idx;
+    if (idx < nchannels) {
+        if (static_cast<MinimizationState>(acState[idx]) == 
+            MinimizationState::Precomputed)
+            return; // skip channels already marked Precomputed
+
+        // pick the EB or EE id array for this channel and compute its hashed index
+        int const inputCh = ch >= offsetForInputs
+            ? ch - offsetForInputs
+            : ch;
+        auto const* dids = ch >= offsetForInputs
+            ? dids_ee
+            : dids_eb;
+        auto const did = DetId{dids[inputCh]};
+        auto const isBarrel = did.subdetId() == EcalBarrel;
+        auto const hashedId = isBarrel
+            ? hashedIndexEB(did.rawId())
+            : offsetForHashes + hashedIndexEE(did.rawId());
+
+        // inits
+        int iter = 0;
+        int npassive = 0;
+
+        ColumnVector<NPULSES, int> pulseOffsets;
+        #pragma unroll
+        for (int i=0; i<NPULSES; ++i)
+            pulseOffsets(i) = i;
+
+        ColumnVector<NPULSES, DataType> resultAmplitudes;
+        #pragma unroll
+        for (int counter=0; counter<NPULSES; counter++)
+            resultAmplitudes(counter) = 0;
+
+        // inits
+        //SampleDecompLLT covariance_decomposition;
+        //SampleMatrix inverse_cov;
+        SampleVector::Scalar chi2 = 0, chi2_now = 0;
+
+        // loop until convergence or until max_iterations is reached
+        while (true) {
+            if (iter >= max_iterations)
+                break;
+
+            //inverse_cov = noisecov[idx];
+            // the covariance is stored in this thread's first shared-memory slice
+            DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
+            MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
+            int counter = 0;
+            #pragma unroll
+            for (int col=0; col<NSAMPLES; col++)
+                #pragma unroll
+                for (int row=col; row<NSAMPLES; row++)
+                    covMatrixStorage[counter++] = __ldg(
+                        &noisecov[idx].coeffRef(row, col));
+
+            update_covariance(
+                pulse_covariance[hashedId],
+                covMatrix,
+                resultAmplitudes);
+
+            // compute actual covariance decomposition
+            //covariance_decomposition.compute(inverse_cov);
+            //auto const& matrixL = covariance_decomposition.matrixL();
+            DataType matrixLStorage[MapSymM<DataType, NSAMPLES>::total];
+            MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
+            compute_decomposition_unrolled(matrixL, covMatrix);
+
+            // L * A = P
+            ColMajorMatrix<NSAMPLES, NPULSES> A;
+            solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
+
+            // L b = s
+            float reg_b[NSAMPLES];
+            solve_forward_subst_vector(reg_b, samples[idx], matrixL);
+
+            // FIXME: shared mem
+            //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
+            MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
+            //SampleMatrix AtA;
+            SampleVector Atb;
+            #pragma unroll
+            for (int icol=0; icol<NPULSES; icol++) {
+                float reg_ai[NSAMPLES];
+
+                // load column icol
+                #pragma unroll
+                for (int counter=0; counter<NSAMPLES; counter++)
+                    reg_ai[counter] = A(counter, icol);
+
+                // compute diagonal
+                float sum = 0.f;
+                #pragma unroll
+                for (int counter=0; counter<NSAMPLES; counter++)
+                    sum += reg_ai[counter] * reg_ai[counter];
+
+                // store
+                AtA(icol, icol) = sum;
+
+                // go thru the other columns
+                #pragma unroll
+                for (int j=icol+1; j<NPULSES; j++) {
+                    // load column j
+                    float reg_aj[NSAMPLES];
+                    #pragma unroll
+                    for (int counter=0; counter<NSAMPLES; counter++)
+                        reg_aj[counter] = A(counter, j);
+
+                    // accum
+                    float sum = 0.f;
+                    #pragma unroll
+                    for (int counter=0; counter<NSAMPLES; counter++)
+                        sum += reg_aj[counter] * reg_ai[counter];
+
+                    // store
+                    //AtA(icol, j) = sum;
+                    AtA(j, icol) = sum;
+                }
+
+                // Atb accum
+                float sum_atb = 0.f;
+                #pragma unroll
+                for (int counter=0; counter<NSAMPLES; counter++)
+                    sum_atb += reg_ai[counter] * reg_b[counter];
+
+                // store atb
+                Atb(icol) = sum_atb;
+            }
+            
+            // FIXME: shared mem
+            // same shared-memory slice that held covMatrix above is handed to fnnls here
+            MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
+
+            fnnls(
+                AtA,
+                Atb,
+                //amplitudes[idx],
+                resultAmplitudes,
+                npassive,
+                pulseOffsets,
+                matrixLForFnnls,
+                1e-11,
+                500
+                );
+                
+            {    
+                DataType accum[NSAMPLES];
+                // load accum
+                #pragma unroll
+                for (int counter=0; counter<NSAMPLES; counter++)
+                    accum[counter] = -samples[idx](counter);
+
+                // iterate
+                for (int icol=0; icol<NPULSES; icol++) {
+                    DataType pm_col[NSAMPLES];
+
+                    // preload a column of pulse matrix
+                    #pragma unroll
+                    for (int counter=0; counter<NSAMPLES; counter++)
+                        pm_col[counter] = __ldg(
+                            &pulse_matrix[idx].coeffRef(counter, icol));
+
+                    // accum
+                    #pragma unroll
+                    for (int counter=0; counter<NSAMPLES; counter++)
+                        accum[counter] += resultAmplitudes[icol] * pm_col[counter];
+                }
+
+                DataType reg_L[NSAMPLES];
+                DataType accumSum = 0;
+
+                // preload a column and load column 0 of cholesky
+                #pragma unroll
+                for (int i=0; i<NSAMPLES; i++)
+                    reg_L[i] = matrixL(i, 0);
+
+                // compute x0 and store it
+                auto x_prev = accum[0] / reg_L[0];
+                accumSum += x_prev * x_prev;
+
+                // iterate
+                #pragma unroll
+                for (int iL=1; iL<NSAMPLES; iL++) {
+                    // update accum
+                    #pragma unroll
+                    for (int counter=iL; counter<NSAMPLES; counter++)
+                        accum[counter] -= x_prev * reg_L[counter];
+
+                    // load the next column of cholesky
+                    #pragma unroll
+                    for (int counter=iL; counter<NSAMPLES; counter++)
+                        reg_L[counter] = matrixL(counter, iL);
+
+                    // compute the next x for M(iL, icol)
+                    x_prev = accum[iL] / reg_L[iL];
+
+                    // store the result value
+                    accumSum += x_prev * x_prev;
+                }
+
+                chi2_now = accumSum;
+            }
+
+            auto deltachi2 = chi2_now - chi2;
+            chi2 = chi2_now;
+
+            if (ecal::abs(deltachi2) < 1e-3)
+                break; // chi2 changed by less than 1e-3 since the previous iteration
+
+            //---- AM: TEST
+            //---- it was 3 lines above, now here as in the CPU version
+            ++iter;
+        }
+
+        // store to global output values
+        // FIXME: amplitudes are used in global directly
+        chi2s[idx] = chi2;
+        energies[idx] = resultAmplitudes(5); // NOTE(review): index 5 is presumably the in-time pulse - confirm
+        #pragma unroll
+        for (int counter=0; counter<NPULSES; counter++)
+            amplitudes[idx](counter) = resultAmplitudes(counter);
+    }
+}
+
+namespace v1 {
+
+/// Launch kernel_minimize over all EB + EE digis on `cudaStream`.
+/// No synchronisation is done here; only the launch status is checked.
+void minimization_procedure(
+        EventInputDataGPU const& eventInputGPU,
+        EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch,
+        ConditionsProducts const& conditions,
+        ConfigurationParameters const& configParameters,
+        cudaStream_t cudaStream) {
+    using Scalar = SampleVector::Scalar;
+    constexpr int maxIterations = 50;
+
+    // EB channels come first, EE channels follow
+    uint32_t const nEBChannels = eventInputGPU.ebDigis.ndigis;
+    unsigned int const nChannels = eventInputGPU.ebDigis.ndigis
+        + eventInputGPU.eeDigis.ndigis;
+
+    // one thread per channel, always at least one block
+    unsigned int const nThreads = configParameters.kernelMinimizeThreads[0];
+    unsigned int const nBlocks = nThreads > nChannels
+        ? 1
+        : (nChannels + nThreads - 1) / nThreads;
+
+    // two MapSymM storage slices per thread (see kernel_minimize)
+    auto const sharedBytes = 2 * nThreads *
+        MapSymM<Scalar, SampleVector::RowsAtCompileTime>::total * sizeof(Scalar);
+
+    kernel_minimize<<<nBlocks, nThreads, sharedBytes, cudaStream>>>(
+        eventInputGPU.ebDigis.ids, eventInputGPU.eeDigis.ids,
+        scratch.noisecov, conditions.pulseCovariances.values,
+        scratch.activeBXs, scratch.samples,
+        (SampleVector*)eventOutputGPU.amplitudesAll,
+        scratch.pulse_matrix,
+        eventOutputGPU.chi2, eventOutputGPU.amplitude,
+        scratch.acState, nChannels, maxIterations,
+        conditions.offsetForHashes, nEBChannels);
+    cudaCheck(cudaGetLastError());
+}
+
+}
+
+}}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h
new file mode 100644
index 0000000000000..f54fef09b1f17
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h
@@ -0,0 +1,27 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernels
+#define RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernels
+
+#include "RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+
+class EcalPulseShape;
+class EcalPulseCovariance;
+class EcalUncalibratedRecHit;
+
+namespace ecal { namespace multifit {
+
+namespace v1 {
+
+void minimization_procedure(  // launches kernel_minimize for all EB + EE channels
+        EventInputDataGPU const& eventInputGPU,  // EB and EE digis (ids, ndigis)
+        EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch,  // outputs and scratch buffers
+        ConditionsProducts const& conditions,  // pulseCovariances, offsetForHashes
+        ConfigurationParameters const& configParameters,  // kernelMinimizeThreads[0]: threads per block
+        cudaStream_t cudaStream);  // stream the kernel is launched on
+
+}
+
+}}
+
+#endif // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernels
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu
deleted file mode 100644
index 880e729c2c72d..0000000000000
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu
+++ /dev/null
@@ -1,372 +0,0 @@
-#include <iostream>
-#include <limits>
-
-#include "cuda.h"
-
-#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
-#include "DataFormats/Math/interface/approx_exp.h"
-#include "DataFormats/Math/interface/approx_log.h"
-
-#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
-#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
-
-#include "inplace_fnnls.h"
-#include "AmplitudeComputationKernelsV1.h"
-#include "AmplitudeComputationCommonKernels.h"
-
-namespace ecal {
-  namespace multifit {
-
-    void eigen_solve_submatrix(SampleMatrix& mat, SampleVector& invec, SampleVector& outvec, unsigned NP) {
-      using namespace Eigen;
-      switch (NP) {  // pulse matrix is always square.
-        case 10: {
-          Matrix<SampleMatrix::Scalar, 10, 10> temp = mat.topLeftCorner<10, 10>();
-          outvec.head<10>() = temp.ldlt().solve(invec.head<10>());
-          break;
-        }
-        case 9: {
-          Matrix<SampleMatrix::Scalar, 9, 9> temp = mat.topLeftCorner<9, 9>();
-          outvec.head<9>() = temp.ldlt().solve(invec.head<9>());
-          break;
-        }
-        case 8: {
-          Matrix<SampleMatrix::Scalar, 8, 8> temp = mat.topLeftCorner<8, 8>();
-          outvec.head<8>() = temp.ldlt().solve(invec.head<8>());
-          break;
-        }
-        case 7: {
-          Matrix<SampleMatrix::Scalar, 7, 7> temp = mat.topLeftCorner<7, 7>();
-          outvec.head<7>() = temp.ldlt().solve(invec.head<7>());
-          break;
-        }
-        case 6: {
-          Matrix<SampleMatrix::Scalar, 6, 6> temp = mat.topLeftCorner<6, 6>();
-          outvec.head<6>() = temp.ldlt().solve(invec.head<6>());
-          break;
-        }
-        case 5: {
-          Matrix<SampleMatrix::Scalar, 5, 5> temp = mat.topLeftCorner<5, 5>();
-          outvec.head<5>() = temp.ldlt().solve(invec.head<5>());
-          break;
-        }
-        case 4: {
-          Matrix<SampleMatrix::Scalar, 4, 4> temp = mat.topLeftCorner<4, 4>();
-          outvec.head<4>() = temp.ldlt().solve(invec.head<4>());
-          break;
-        }
-        case 3: {
-          Matrix<SampleMatrix::Scalar, 3, 3> temp = mat.topLeftCorner<3, 3>();
-          outvec.head<3>() = temp.ldlt().solve(invec.head<3>());
-          break;
-        }
-        case 2: {
-          Matrix<SampleMatrix::Scalar, 2, 2> temp = mat.topLeftCorner<2, 2>();
-          outvec.head<2>() = temp.ldlt().solve(invec.head<2>());
-          break;
-        }
-        case 1: {
-          Matrix<SampleMatrix::Scalar, 1, 1> temp = mat.topLeftCorner<1, 1>();
-          outvec.head<1>() = temp.ldlt().solve(invec.head<1>());
-          break;
-        }
-        default:
-          return;
-      }
-    }
-
-#define PRINT_MATRIX_10x10(M)                                                                                        \
-  printf(                                                                                                            \
-      "%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f " \
-      "%f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f "    \
-      "%f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n",           \
-      M(0, 0),                                                                                                       \
-      M(1, 0),                                                                                                       \
-      M(2, 0),                                                                                                       \
-      M(3, 0),                                                                                                       \
-      M(4, 0),                                                                                                       \
-      M(5, 0),                                                                                                       \
-      M(6, 0),                                                                                                       \
-      M(7, 0),                                                                                                       \
-      M(8, 0),                                                                                                       \
-      M(9, 0),                                                                                                       \
-      M(0, 1),                                                                                                       \
-      M(1, 1),                                                                                                       \
-      M(2, 1),                                                                                                       \
-      M(3, 1),                                                                                                       \
-      M(4, 1),                                                                                                       \
-      M(5, 1),                                                                                                       \
-      M(6, 1),                                                                                                       \
-      M(7, 1),                                                                                                       \
-      M(8, 1),                                                                                                       \
-      M(9, 1),                                                                                                       \
-      M(0, 2),                                                                                                       \
-      M(1, 2),                                                                                                       \
-      M(2, 2),                                                                                                       \
-      M(3, 2),                                                                                                       \
-      M(4, 2),                                                                                                       \
-      M(5, 2),                                                                                                       \
-      M(6, 2),                                                                                                       \
-      M(7, 2),                                                                                                       \
-      M(8, 2),                                                                                                       \
-      M(9, 2),                                                                                                       \
-      M(0, 3),                                                                                                       \
-      M(1, 3),                                                                                                       \
-      M(2, 3),                                                                                                       \
-      M(3, 3),                                                                                                       \
-      M(4, 3),                                                                                                       \
-      M(5, 3),                                                                                                       \
-      M(6, 3),                                                                                                       \
-      M(7, 3),                                                                                                       \
-      M(8, 3),                                                                                                       \
-      M(9, 3),                                                                                                       \
-      M(0, 4),                                                                                                       \
-      M(1, 4),                                                                                                       \
-      M(2, 4),                                                                                                       \
-      M(3, 4),                                                                                                       \
-      M(4, 4),                                                                                                       \
-      M(5, 4),                                                                                                       \
-      M(6, 4),                                                                                                       \
-      M(7, 4),                                                                                                       \
-      M(8, 4),                                                                                                       \
-      M(9, 4),                                                                                                       \
-      M(0, 5),                                                                                                       \
-      M(1, 5),                                                                                                       \
-      M(2, 5),                                                                                                       \
-      M(3, 5),                                                                                                       \
-      M(4, 5),                                                                                                       \
-      M(5, 5),                                                                                                       \
-      M(6, 5),                                                                                                       \
-      M(7, 5),                                                                                                       \
-      M(8, 5),                                                                                                       \
-      M(9, 5),                                                                                                       \
-      M(0, 6),                                                                                                       \
-      M(1, 6),                                                                                                       \
-      M(2, 6),                                                                                                       \
-      M(3, 6),                                                                                                       \
-      M(4, 6),                                                                                                       \
-      M(5, 6),                                                                                                       \
-      M(6, 6),                                                                                                       \
-      M(7, 6),                                                                                                       \
-      M(8, 6),                                                                                                       \
-      M(9, 6),                                                                                                       \
-      M(0, 7),                                                                                                       \
-      M(1, 7),                                                                                                       \
-      M(2, 7),                                                                                                       \
-      M(3, 7),                                                                                                       \
-      M(4, 7),                                                                                                       \
-      M(5, 7),                                                                                                       \
-      M(6, 7),                                                                                                       \
-      M(7, 7),                                                                                                       \
-      M(8, 7),                                                                                                       \
-      M(9, 7),                                                                                                       \
-      M(0, 8),                                                                                                       \
-      M(1, 8),                                                                                                       \
-      M(2, 8),                                                                                                       \
-      M(3, 8),                                                                                                       \
-      M(4, 8),                                                                                                       \
-      M(5, 8),                                                                                                       \
-      M(6, 8),                                                                                                       \
-      M(7, 8),                                                                                                       \
-      M(8, 8),                                                                                                       \
-      M(9, 8),                                                                                                       \
-      M(0, 9),                                                                                                       \
-      M(1, 9),                                                                                                       \
-      M(2, 9),                                                                                                       \
-      M(3, 9),                                                                                                       \
-      M(4, 9),                                                                                                       \
-      M(5, 9),                                                                                                       \
-      M(6, 9),                                                                                                       \
-      M(7, 9),                                                                                                       \
-      M(8, 9),                                                                                                       \
-      M(9, 9))
-
-    __device__ __forceinline__ bool update_covariance(SampleMatrix const& noisecov,
-                                                      FullSampleMatrix const& full_pulse_cov,
-                                                      SampleMatrix& inverse_cov,
-                                                      BXVectorType const& bxs,
-                                                      SampleDecompLLT& covariance_decomposition,
-                                                      SampleVector const& amplitudes) {
-      constexpr int nsamples = SampleVector::RowsAtCompileTime;
-      constexpr int npulses = BXVectorType::RowsAtCompileTime;
-
-      inverse_cov = noisecov;
-
-      for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {
-        if (amplitudes.coeff(ipulse) == 0)
-          continue;
-
-        int bx = bxs.coeff(ipulse);
-        int first_sample_t = std::max(0, bx + 3);
-        int offset = 7 - 3 - bx;
-
-        auto const value = amplitudes.coeff(ipulse);
-        auto const value_sq = value * value;
-
-        unsigned int nsample_pulse = nsamples - first_sample_t;
-        inverse_cov.block(first_sample_t, first_sample_t, nsample_pulse, nsample_pulse) +=
-            value_sq *
-            full_pulse_cov.block(first_sample_t + offset, first_sample_t + offset, nsample_pulse, nsample_pulse);
-      }
-
-      return true;
-    }
-
-    __device__ __forceinline__ SampleVector::Scalar compute_chi2(SampleDecompLLT& covariance_decomposition,
-                                                                 PulseMatrixType const& pulse_matrix,
-                                                                 SampleVector const& amplitudes,
-                                                                 SampleVector const& samples) {
-      return covariance_decomposition.matrixL().solve(pulse_matrix * amplitudes - samples).squaredNorm();
-    }
-
-    ///
-    /// launch ctx parameters are (nchannels / block, blocks)
-    /// TODO: trivial impl for now, there must be a way to improve
-    ///
-    /// Conventions:
-    ///   - amplitudes -> solution vector, what we are fitting for
-    ///   - samples -> raw detector responses
-    ///   - passive constraint - satisfied constraint
-    ///   - active constraint - unsatisfied (yet) constraint
-    ///
-    __global__ void kernel_minimize(SampleMatrix const* noisecov,
-                                    FullSampleMatrix const* full_pulse_cov,
-                                    BXVectorType* bxs,
-                                    SampleVector const* samples,
-                                    SampleVector* amplitudes,
-                                    PulseMatrixType* pulse_matrix,
-                                    ::ecal::reco::StorageScalarType* chi2s,
-                                    char* acState,
-                                    int nchannels,
-                                    int max_iterations) {
-      int idx = threadIdx.x + blockDim.x * blockIdx.x;
-      if (idx < nchannels) {
-        if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
-          return;
-
-        // inits
-        int iter = 0;
-        int npassive = 0;
-
-        // inits
-        SampleDecompLLT covariance_decomposition;
-        SampleMatrix inverse_cov;
-        SampleVector::Scalar chi2 = 0, chi2_now = 0;
-
-#ifdef ECAL_MULTIFIT_KERNEL_MINIMIZE_V1
-//    PRINT_MATRIX_10x10(noisecov[idx]);
-#endif
-
-        // loop until ocnverge
-        while (true) {
-          if (iter >= max_iterations)
-            break;
-
-          update_covariance(
-              noisecov[idx], full_pulse_cov[idx], inverse_cov, bxs[idx], covariance_decomposition, amplitudes[idx]);
-
-          // compute actual covariance decomposition
-          covariance_decomposition.compute(inverse_cov);
-
-          // prepare input matrices for fnnls
-          SampleMatrix A = covariance_decomposition.matrixL().solve(pulse_matrix[idx]);
-          SampleVector b = covariance_decomposition.matrixL().solve(samples[idx]);
-
-          inplace_fnnls(A, b, amplitudes[idx], npassive, bxs[idx], pulse_matrix[idx]);
-
-          chi2_now = compute_chi2(covariance_decomposition, pulse_matrix[idx], amplitudes[idx], samples[idx]);
-          auto deltachi2 = chi2_now - chi2;
-
-#ifdef ECAL_MULTIFIT_KERNEL_MINIMIZE_V1
-          if (iter > 10) {
-            printf("idx = %d iter = %d chi2 = %f chi2old = %f\n", idx, iter, chi2_now, chi2);
-
-            printf("noisecov(0, i): %f %f %f %f %f %f %f %f %f %f\n",
-                   noisecov[idx](0, 0),
-                   noisecov[idx](0, 1),
-                   noisecov[idx](0, 2),
-                   noisecov[idx](0, 3),
-                   noisecov[idx](0, 4),
-                   noisecov[idx](0, 5),
-                   noisecov[idx](0, 6),
-                   noisecov[idx](0, 7),
-                   noisecov[idx](0, 8),
-                   noisecov[idx](0, 9));
-
-            printf("ampls: %f %f %f %f %f %f %f %f %f %f\n",
-                   amplitudes[idx](0),
-                   amplitudes[idx](1),
-                   amplitudes[idx](2),
-                   amplitudes[idx](3),
-                   amplitudes[idx](4),
-                   amplitudes[idx](5),
-                   amplitudes[idx](6),
-                   amplitudes[idx](7),
-                   amplitudes[idx](8),
-                   amplitudes[idx](9));
-          }
-#endif
-
-          chi2 = chi2_now;
-
-          if (ecal::abs(deltachi2) < 1e-3)
-            break;
-
-          //---- AM: TEST
-          //---- it was 3 lines above, now here as in the CPU version
-          ++iter;
-        }
-
-        // the rest will be set later
-        chi2s[idx] = chi2;
-      }
-    }
-
-    namespace v1 {
-
-      void minimization_procedure(EventInputDataCPU const& eventInputCPU,
-                                  EventInputDataGPU& eventInputGPU,
-                                  EventOutputDataGPU& eventOutputGPU,
-                                  EventDataForScratchGPU& scratch,
-                                  ConditionsProducts const& conditions,
-                                  ConfigurationParameters const& configParameters,
-                                  cudaStream_t cudaStream) {
-        unsigned int totalChannels = eventInputCPU.ebDigis.size() + eventInputCPU.eeDigis.size();
-        //    unsigned int threads_min = conf.threads.x;
-        // TODO: configure from python
-        unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
-        unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
-        kernel_minimize<<<blocks_min, threads_min, 0, cudaStream>>>(scratch.noisecov,
-                                                                    scratch.pulse_covariances,
-                                                                    scratch.activeBXs,
-                                                                    scratch.samples,
-                                                                    (SampleVector*)eventOutputGPU.amplitudesAll,
-                                                                    scratch.pulse_matrix,
-                                                                    eventOutputGPU.chi2,
-                                                                    scratch.acState,
-                                                                    totalChannels,
-                                                                    50);
-        cudaCheck(cudaGetLastError());
-
-        //
-        // permute computed amplitudes
-        // and assign the final uncalibared energy value
-        //
-        unsigned int threadsPermute = 32 * EcalDataFrame::MAXSAMPLES;  // 32 * 10
-        unsigned int blocksPermute =
-            threadsPermute > 10 * totalChannels ? 1 : (10 * totalChannels + threadsPermute - 1) / threadsPermute;
-        int bytesPermute = threadsPermute * sizeof(SampleVector::Scalar);
-        kernel_permute_results<<<blocksPermute, threadsPermute, bytesPermute, cudaStream>>>(
-            (SampleVector*)eventOutputGPU.amplitudesAll,
-            scratch.activeBXs,
-            eventOutputGPU.amplitude,
-            scratch.acState,
-            totalChannels);
-        cudaCheck(cudaGetLastError());
-      }
-
-    }  // namespace v1
-
-  }  // namespace multifit
-}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h
deleted file mode 100644
index f3c075e2a2e38..0000000000000
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernelsV1
-#define RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernelsV1
-
-#include "RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
-
-class EcalPulseShape;
-class EcalPulseCovariance;
-class EcalUncalibratedRecHit;
-
-namespace ecal {
-  namespace multifit {
-
-    namespace v1 {
-
-      void minimization_procedure(EventInputDataCPU const& eventInputCPU,
-                                  EventInputDataGPU& eventInputGPU,
-                                  EventOutputDataGPU& eventOutputGPU,
-                                  EventDataForScratchGPU& scratch,
-                                  ConditionsProducts const& conditions,
-                                  ConfigurationParameters const& configParameters,
-                                  cudaStream_t cudaStream);
-
-    }
-
-    ///
-    /// TODO: trivial impl for now, there must be a way to improve
-    ///
-    /// Conventions:
-    ///   - amplitudes -> solution vector, what we are fitting for
-    ///   - samples -> raw detector responses
-    ///   - passive constraint - satisfied constraint
-    ///   - active constraint - unsatisfied (yet) constraint
-    ///
-    __global__ void kernel_minimize(SampleMatrix const* noisecov,
-                                    FullSampleMatrix const* full_pulse_cov,
-                                    BXVectorType* bxs,
-                                    SampleVector const* samples,
-                                    SampleVector* amplitudes,
-                                    PulseMatrixType* pulse_matrix,
-                                    ::ecal::reco::StorageScalarType* chi2s,
-                                    char* acState,
-                                    int nchannels,
-                                    int max_iterations);
-
-  }  // namespace multifit
-}  // namespace ecal
-
-#endif  // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernelsV1
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
index d5980d8a757aa..bcb199b133c0d 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
@@ -3,50 +3,57 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values)
-    : gain12Over6_(values.size()), gain6Over1_(values.size()) {
-  // fill in eb
-  auto const& barrelValues = values.barrelItems();
-  for (unsigned int i = 0; i < barrelValues.size(); i++) {
-    gain12Over6_[i] = barrelValues[i].gain12Over6();
-    gain6Over1_[i] = barrelValues[i].gain6Over1();
-  }
-
-  // fill in ee
-  auto const& endcapValues = values.endcapItems();
-  auto const offset = barrelValues.size();
-  for (unsigned int i = 0; i < endcapValues.size(); i++) {
-    gain12Over6_[offset + i] = endcapValues[i].gain12Over6();
-    gain6Over1_[offset + i] = endcapValues[i].gain6Over1();
-  }
+EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values)
+    : gain12Over6_(values.size()), gain6Over1_(values.size())
+{
+    // host staging buffers are laid out as [barrel (EB) | endcap (EE)]
+    auto const& ebItems = values.barrelItems();
+    auto const& eeItems = values.endcapItems();
+    auto const nEB = ebItems.size();
+
+    // EB channels go to the front of both buffers
+    for (unsigned int ich = 0; ich < nEB; ++ich) {
+        gain12Over6_[ich] = ebItems[ich].gain12Over6();
+        gain6Over1_[ich] = ebItems[ich].gain6Over1();
+    }
+    // EE channels follow, shifted by the number of EB channels
+    for (unsigned int ich = 0; ich < eeItems.size(); ++ich) {
+        gain12Over6_[nEB + ich] = eeItems[ich].gain12Over6();
+        gain6Over1_[nEB + ich] = eeItems[ich].gain6Over1();
+    }
 }
 
 EcalGainRatiosGPU::Product::~Product() {
-  // deallocation
-  cudaCheck(cudaFree(gain12Over6));
-  cudaCheck(cudaFree(gain6Over1));
+    // free the device arrays that getProduct() allocates with cudaMalloc
+    cudaCheck( cudaFree(gain12Over6) );
+    cudaCheck( cudaFree(gain6Over1) );
 }
 
-EcalGainRatiosGPU::Product const& EcalGainRatiosGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(
-      cudaStream, [this](EcalGainRatiosGPU::Product& product, cudaStream_t cudaStream) {
-        // malloc
-        cudaCheck(cudaMalloc((void**)&product.gain12Over6, this->gain12Over6_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.gain6Over1, this->gain6Over1_.size() * sizeof(float)));
-        // transfer
-        cudaCheck(cudaMemcpyAsync(product.gain12Over6,
-                                  this->gain12Over6_.data(),
-                                  this->gain12Over6_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.gain6Over1,
-                                  this->gain6Over1_.data(),
-                                  this->gain6Over1_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-      });
+EcalGainRatiosGPU::Product const& EcalGainRatiosGPU::getProduct(
+        cudaStream_t cudaStream) const
+{
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+        [this](EcalGainRatiosGPU::Product& product, cudaStream_t cudaStream) {
+            // allocate one device array per host vector (size() floats each)
+            cudaCheck( cudaMalloc((void**)&product.gain12Over6,
+                                  this->gain12Over6_.size() * sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.gain6Over1,
+                                  this->gain6Over1_.size() * sizeof(float)) );
+            // enqueue the host-to-device copies on the stream passed to this callback
+            cudaCheck( cudaMemcpyAsync(product.gain12Over6,
+                                       this->gain12Over6_.data(),
+                                       this->gain12Over6_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.gain6Over1,
+                                       this->gain6Over1_.data(),
+                                       this->gain6Over1_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+        }
+    );
 
-  return product;
+    return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalGainRatiosGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
index 9e3284cd9c7c8..401ad8c454737 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
@@ -3,92 +3,103 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals)
-    : mean_x12_(pedestals.size()),
-      rms_x12_(pedestals.size()),
-      mean_x6_(pedestals.size()),
-      rms_x6_(pedestals.size()),
-      mean_x1_(pedestals.size()),
-      rms_x1_(pedestals.size()) {
-  // fill in eb
-  auto const& barrelValues = pedestals.barrelItems();
-  for (unsigned int i = 0; i < barrelValues.size(); i++) {
-    mean_x12_[i] = barrelValues[i].mean_x12;
-    rms_x12_[i] = barrelValues[i].rms_x12;
-    mean_x6_[i] = barrelValues[i].mean_x6;
-    rms_x6_[i] = barrelValues[i].rms_x6;
-    mean_x1_[i] = barrelValues[i].mean_x1;
-    rms_x1_[i] = barrelValues[i].rms_x1;
-  }
+EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals)
+    : mean_x12_(pedestals.size()),
+      rms_x12_(pedestals.size()),
+      mean_x6_(pedestals.size()),
+      rms_x6_(pedestals.size()),
+      mean_x1_(pedestals.size()),
+      rms_x1_(pedestals.size())
+{
 
-  // fill in ee
-  auto const& endcapValues = pedestals.endcapItems();
-  auto const offset = barrelValues.size();
-  for (unsigned int i = 0; i < endcapValues.size(); i++) {
-    mean_x12_[offset + i] = endcapValues[i].mean_x12;
-    rms_x12_[offset + i] = endcapValues[i].rms_x12;
-    mean_x6_[offset + i] = endcapValues[i].mean_x6;
-    rms_x6_[offset + i] = endcapValues[i].rms_x6;
-    mean_x1_[offset + i] = endcapValues[i].mean_x1;
-    rms_x1_[offset + i] = endcapValues[i].rms_x1;
-  }
+    // every staging vector is laid out as [barrel (EB) | endcap (EE)]
+    auto const& ebItems = pedestals.barrelItems();
+    auto const& eeItems = pedestals.endcapItems();
+    auto const nEB = ebItems.size();
+    // EB entries fill the front of each vector
+    for (unsigned int ich = 0; ich < nEB; ++ich) {
+        mean_x12_[ich] = ebItems[ich].mean_x12;
+        rms_x12_[ich] = ebItems[ich].rms_x12;
+        mean_x6_[ich] = ebItems[ich].mean_x6;
+        rms_x6_[ich] = ebItems[ich].rms_x6;
+        mean_x1_[ich] = ebItems[ich].mean_x1;
+        rms_x1_[ich] = ebItems[ich].rms_x1;
+    }
+    // EE entries follow, shifted by the number of EB entries
+    for (unsigned int ich = 0; ich < eeItems.size(); ++ich) {
+        mean_x12_[nEB + ich] = eeItems[ich].mean_x12;
+        rms_x12_[nEB + ich] = eeItems[ich].rms_x12;
+        mean_x6_[nEB + ich] = eeItems[ich].mean_x6;
+        rms_x6_[nEB + ich] = eeItems[ich].rms_x6;
+        mean_x1_[nEB + ich] = eeItems[ich].mean_x1;
+        rms_x1_[nEB + ich] = eeItems[ich].rms_x1;
+    }
 }
 
 EcalPedestalsGPU::Product::~Product() {
-  // deallocation
-  cudaCheck(cudaFree(mean_x12));
-  cudaCheck(cudaFree(rms_x12));
-  cudaCheck(cudaFree(mean_x6));
-  cudaCheck(cudaFree(rms_x6));
-  cudaCheck(cudaFree(mean_x1));
-  cudaCheck(cudaFree(rms_x1));
+    // free the six device arrays that getProduct() allocates with cudaMalloc
+    cudaCheck( cudaFree(mean_x12) );
+    cudaCheck( cudaFree(rms_x12) );
+    cudaCheck( cudaFree(mean_x6) );
+    cudaCheck( cudaFree(rms_x6) );
+    cudaCheck( cudaFree(mean_x1) );
+    cudaCheck( cudaFree(rms_x1) );
 }
 
-EcalPedestalsGPU::Product const& EcalPedestalsGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(
-      cudaStream, [this](EcalPedestalsGPU::Product& product, cudaStream_t cudaStream) {
-        // malloc
-        cudaCheck(cudaMalloc((void**)&product.mean_x12, this->mean_x12_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.rms_x12, this->mean_x12_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.mean_x6, this->mean_x12_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.rms_x6, this->mean_x12_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.mean_x1, this->mean_x12_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.rms_x1, this->mean_x12_.size() * sizeof(float)));
+// Returns the product from dataForCurrentDeviceAsync; the callback allocates and uploads six arrays.
+EcalPedestalsGPU::Product const& EcalPedestalsGPU::getProduct(cudaStream_t cudaStream) const
+{
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+        [this](EcalPedestalsGPU::Product& product, cudaStream_t cudaStream) {
+            // allocate each device array with the size of the host vector copied into it
+            cudaCheck( cudaMalloc((void**)&product.mean_x12,
+                                  this->mean_x12_.size() * sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.rms_x12,
+                                  this->rms_x12_.size() * sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.mean_x6,
+                                  this->mean_x6_.size() * sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.rms_x6,
+                                  this->rms_x6_.size() * sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.mean_x1,
+                                  this->mean_x1_.size() * sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.rms_x1,
+                                  this->rms_x1_.size() * sizeof(float)) );
 
-        // transfer
-        cudaCheck(cudaMemcpyAsync(product.mean_x12,
-                                  this->mean_x12_.data(),
-                                  this->mean_x12_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.rms_x12,
-                                  this->rms_x12_.data(),
-                                  this->rms_x12_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.mean_x6,
-                                  this->mean_x6_.data(),
-                                  this->mean_x6_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.rms_x6,
-                                  this->rms_x6_.data(),
-                                  this->rms_x6_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.mean_x1,
-                                  this->mean_x1_.data(),
-                                  this->mean_x1_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.rms_x1,
-                                  this->rms_x1_.data(),
-                                  this->rms_x1_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-      });
+            // enqueue the host-to-device copies on the stream passed to this callback
+            cudaCheck( cudaMemcpyAsync(product.mean_x12,
+                                       this->mean_x12_.data(),
+                                       this->mean_x12_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.rms_x12,
+                                       this->rms_x12_.data(),
+                                       this->rms_x12_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.mean_x6,
+                                       this->mean_x6_.data(),
+                                       this->mean_x6_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.rms_x6,
+                                       this->rms_x6_.data(),
+                                       this->rms_x6_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.mean_x1,
+                                       this->mean_x1_.data(),
+                                       this->mean_x1_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.rms_x1,
+                                       this->rms_x1_.data(),
+                                       this->rms_x1_.size() * sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+        }
+    );
 
-  return product;
+    return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalPedestalsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
index bbeda99652e22..121a5b9e684f7 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
@@ -3,40 +3,48 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values)
-    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+// initializes valuesEB_ / valuesEE_ from the barrel and endcap items of `values`
+EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values)
+    : valuesEB_{values.barrelItems()},
+      valuesEE_{values.endcapItems()} {}
 
 EcalPulseCovariancesGPU::Product::~Product() {
-  // deallocation
-  cudaCheck(cudaFree(values));
+    // free the device buffer that getProduct() allocates with cudaMalloc
+    cudaCheck( cudaFree(values) );
 }
 
-EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(
-      cudaStream, [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) {
-        // malloc
-        cudaCheck(cudaMalloc((void**)&product.values,
-                             (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseCovariance)));
-
-        // offset in terms of sizeof(EcalPulseCovariance)
-        uint32_t offset = this->valuesEB_.size();
-
-        // transfer eb
-        cudaCheck(cudaMemcpyAsync(product.values,
-                                  this->valuesEB_.data(),
-                                  this->valuesEB_.size() * sizeof(EcalPulseCovariance),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-
-        // transfer ee starting at values + offset
-        cudaCheck(cudaMemcpyAsync(product.values + offset,
-                                  this->valuesEE_.data(),
-                                  this->valuesEE_.size() * sizeof(EcalPulseCovariance),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-      });
-
-  return product;
+EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct(
+        cudaStream_t cudaStream) const
+{
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+        [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) {
+            // single device buffer sized for all EB entries plus all EE entries
+            cudaCheck( cudaMalloc((void**)&product.values,
+                                  (this->valuesEE_.size() + this->valuesEB_.size()) 
+                                  * sizeof(EcalPulseCovariance)) );
+           
+            // number of EB entries = position of the first EE entry, in units of sizeof(EcalPulseCovariance)
+            uint32_t offset = this->valuesEB_.size();
+
+            // upload the EB entries to the start of the buffer
+            cudaCheck( cudaMemcpyAsync(product.values,
+                                       this->valuesEB_.data(),
+                                       this->valuesEB_.size() * 
+                                       sizeof(EcalPulseCovariance),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+
+            // upload the EE entries behind them, starting at values + offset
+            cudaCheck( cudaMemcpyAsync(product.values + offset,
+                                       this->valuesEE_.data(),
+                                       this->valuesEE_.size() * 
+                                       sizeof(EcalPulseCovariance),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+        }
+    );
+
+    return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalPulseCovariancesGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
index aee122a01627d..8e8f00795d225 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
@@ -3,40 +3,48 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values)
-    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+// initializes valuesEB_ / valuesEE_ from the barrel and endcap items of `values`
+EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values)
+    : valuesEB_{values.barrelItems()},
+      valuesEE_{values.endcapItems()} {}
 
 EcalPulseShapesGPU::Product::~Product() {
-  // deallocation
-  cudaCheck(cudaFree(values));
+    // free the device buffer that getProduct() allocates with cudaMalloc
+    cudaCheck( cudaFree(values) );
 }
 
-EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(
-      cudaStream, [this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) {
-        // malloc
-        cudaCheck(cudaMalloc((void**)&product.values,
-                             (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseShape)));
-
-        // offset in terms of sizeof(EcalPulseShape) - plain c array
-        uint32_t offset = this->valuesEB_.size();
-
-        // transfer eb
-        cudaCheck(cudaMemcpyAsync(product.values,
-                                  this->valuesEB_.data(),
-                                  this->valuesEB_.size() * sizeof(EcalPulseShape),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-
-        // transfer ee starting at values + offset
-        cudaCheck(cudaMemcpyAsync(product.values + offset,
-                                  this->valuesEE_.data(),
-                                  this->valuesEE_.size() * sizeof(EcalPulseShape),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-      });
-
-  return product;
+EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct(
+        cudaStream_t cudaStream) const
+{
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+        [this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) {
+            // single device buffer sized for all EB entries plus all EE entries
+            cudaCheck( cudaMalloc((void**)&product.values,
+                                  (this->valuesEE_.size() + this->valuesEB_.size()) 
+                                  * sizeof(EcalPulseShape)) );
+           
+            // number of EB entries = position of the first EE entry, in units of sizeof(EcalPulseShape)
+            uint32_t offset = this->valuesEB_.size();
+
+            // upload the EB entries to the start of the buffer
+            cudaCheck( cudaMemcpyAsync(product.values,
+                                       this->valuesEB_.data(),
+                                       this->valuesEB_.size() * 
+                                       sizeof(EcalPulseShape),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+
+            // upload the EE entries behind them, starting at values + offset
+            cudaCheck( cudaMemcpyAsync(product.values + offset,
+                                       this->valuesEE_.data(),
+                                       this->valuesEE_.size() * 
+                                       sizeof(EcalPulseShape),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+        }
+    );
+
+    return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalPulseShapesGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
index 2a98067f51d9e..7294c759aaa0d 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
@@ -3,74 +3,91 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU(EcalSamplesCorrelation const& values)
-    : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation},
-      EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation},
-      EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation},
-      EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation},
-      EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation},
-      EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation} {}
+EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU(
+        EcalSamplesCorrelation const& values) 
+    : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation}
+    , EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation}
+    , EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation}
+    , EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation}
+    , EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation}
+    , EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation}
+{}
 
 EcalSamplesCorrelationGPU::Product::~Product() {
-  // deallocation
-  cudaCheck(cudaFree(EBG12SamplesCorrelation));
-  cudaCheck(cudaFree(EBG6SamplesCorrelation));
-  cudaCheck(cudaFree(EBG1SamplesCorrelation));
-  cudaCheck(cudaFree(EEG12SamplesCorrelation));
-  cudaCheck(cudaFree(EEG6SamplesCorrelation));
-  cudaCheck(cudaFree(EEG1SamplesCorrelation));
+    // release the six device buffers that getProduct() allocates with cudaMalloc
+    cudaCheck( cudaFree(EBG12SamplesCorrelation) );
+    cudaCheck( cudaFree(EBG6SamplesCorrelation) );
+    cudaCheck( cudaFree(EBG1SamplesCorrelation) );
+    cudaCheck( cudaFree(EEG12SamplesCorrelation) );
+    cudaCheck( cudaFree(EEG6SamplesCorrelation) );
+    cudaCheck( cudaFree(EEG1SamplesCorrelation) );
 }
 
-EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(
-      cudaStream, [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) {
-        // malloc
-        cudaCheck(cudaMalloc((void**)&product.EBG12SamplesCorrelation,
-                             this->EBG12SamplesCorrelation_.size() * sizeof(double)));
-        cudaCheck(
-            cudaMalloc((void**)&product.EBG6SamplesCorrelation, this->EBG6SamplesCorrelation_.size() * sizeof(double)));
-        cudaCheck(
-            cudaMalloc((void**)&product.EBG1SamplesCorrelation, this->EBG1SamplesCorrelation_.size() * sizeof(double)));
-        cudaCheck(cudaMalloc((void**)&product.EEG12SamplesCorrelation,
-                             this->EEG12SamplesCorrelation_.size() * sizeof(double)));
-        cudaCheck(
-            cudaMalloc((void**)&product.EEG6SamplesCorrelation, this->EEG6SamplesCorrelation_.size() * sizeof(double)));
-        cudaCheck(
-            cudaMalloc((void**)&product.EEG1SamplesCorrelation, this->EEG1SamplesCorrelation_.size() * sizeof(double)));
-        // transfer
-        cudaCheck(cudaMemcpyAsync(product.EBG12SamplesCorrelation,
-                                  this->EBG12SamplesCorrelation_.data(),
-                                  this->EBG12SamplesCorrelation_.size() * sizeof(double),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EBG6SamplesCorrelation,
-                                  this->EBG6SamplesCorrelation_.data(),
-                                  this->EBG6SamplesCorrelation_.size() * sizeof(double),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EBG1SamplesCorrelation,
-                                  this->EBG1SamplesCorrelation_.data(),
-                                  this->EBG1SamplesCorrelation_.size() * sizeof(double),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EEG12SamplesCorrelation,
-                                  this->EEG12SamplesCorrelation_.data(),
-                                  this->EEG12SamplesCorrelation_.size() * sizeof(double),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EEG6SamplesCorrelation,
-                                  this->EEG6SamplesCorrelation_.data(),
-                                  this->EEG6SamplesCorrelation_.size() * sizeof(double),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EEG1SamplesCorrelation,
-                                  this->EEG1SamplesCorrelation_.data(),
-                                  this->EEG1SamplesCorrelation_.size() * sizeof(double),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-      });
+EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct(
+        cudaStream_t cudaStream) const
+{
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+        [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) {
+            // allocate a separate device buffer of doubles for each of the six correlation vectors
+            cudaCheck( cudaMalloc((void**)&product.EBG12SamplesCorrelation,
+                                  this->EBG12SamplesCorrelation_.size() * 
+                                  sizeof(double)) );
+            cudaCheck( cudaMalloc((void**)&product.EBG6SamplesCorrelation,
+                                  this->EBG6SamplesCorrelation_.size() * 
+                                  sizeof(double)) );
+            cudaCheck( cudaMalloc((void**)&product.EBG1SamplesCorrelation,
+                                  this->EBG1SamplesCorrelation_.size() * 
+                                  sizeof(double)) );
+            cudaCheck( cudaMalloc((void**)&product.EEG12SamplesCorrelation,
+                                  this->EEG12SamplesCorrelation_.size() * 
+                                  sizeof(double)) );
+            cudaCheck( cudaMalloc((void**)&product.EEG6SamplesCorrelation,
+                                  this->EEG6SamplesCorrelation_.size() * 
+                                  sizeof(double)) );
+            cudaCheck( cudaMalloc((void**)&product.EEG1SamplesCorrelation,
+                                  this->EEG1SamplesCorrelation_.size() * 
+                                  sizeof(double)) );
+            // enqueue the host-to-device copy of each vector on cudaStream
+            cudaCheck( cudaMemcpyAsync(product.EBG12SamplesCorrelation,
+                                       this->EBG12SamplesCorrelation_.data(),
+                                       this->EBG12SamplesCorrelation_.size() * 
+                                       sizeof(double),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EBG6SamplesCorrelation,
+                                       this->EBG6SamplesCorrelation_.data(),
+                                       this->EBG6SamplesCorrelation_.size() * 
+                                       sizeof(double),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EBG1SamplesCorrelation,
+                                       this->EBG1SamplesCorrelation_.data(),
+                                       this->EBG1SamplesCorrelation_.size() * 
+                                       sizeof(double),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EEG12SamplesCorrelation,
+                                       this->EEG12SamplesCorrelation_.data(),
+                                       this->EEG12SamplesCorrelation_.size() * 
+                                       sizeof(double),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EEG6SamplesCorrelation,
+                                       this->EEG6SamplesCorrelation_.data(),
+                                       this->EEG6SamplesCorrelation_.size() * 
+                                       sizeof(double),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EEG1SamplesCorrelation,
+                                       this->EEG1SamplesCorrelation_.data(),
+                                       this->EEG1SamplesCorrelation_.size() * 
+                                       sizeof(double),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+        }
+    );
 
-  return product;
+    return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalSamplesCorrelationGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
index 9ab0a6302a9c4..277661b030c68 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
@@ -3,59 +3,76 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const& values)
-    : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins},
-      EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins},
-      EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins},
-      EETimeCorrShiftBins_{values.EETimeCorrShiftBins} {}
+EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU(
+        EcalTimeBiasCorrections const& values) 
+    : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins}
+    , EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins}
+    , EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins}
+    , EETimeCorrShiftBins_{values.EETimeCorrShiftBins}
+{}
 
 EcalTimeBiasCorrectionsGPU::Product::~Product() {
-  // deallocation
-  cudaCheck(cudaFree(EBTimeCorrAmplitudeBins));
-  cudaCheck(cudaFree(EBTimeCorrShiftBins));
-  cudaCheck(cudaFree(EETimeCorrAmplitudeBins));
-  cudaCheck(cudaFree(EETimeCorrShiftBins));
+    // release the four device buffers that getProduct() allocates with cudaMalloc
+    cudaCheck( cudaFree(EBTimeCorrAmplitudeBins) );
+    cudaCheck( cudaFree(EBTimeCorrShiftBins) );
+    cudaCheck( cudaFree(EETimeCorrAmplitudeBins) );
+    cudaCheck( cudaFree(EETimeCorrShiftBins) );
 }
 
-EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(
-      cudaStream, [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) {
-        // to get the size of vectors later on
-        // should be removed and host conditions' objects used directly
-        product.EBTimeCorrAmplitudeBinsSize = this->EBTimeCorrAmplitudeBins_.size();
-        product.EETimeCorrAmplitudeBinsSize = this->EETimeCorrAmplitudeBins_.size();
+EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct(
+        cudaStream_t cudaStream) const
+{
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+        [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) {
+            // keep the host-side amplitude-bin counts in the product so the vector sizes are available later on
+            // TODO: drop these copies and read the sizes from the host conditions objects directly
+            product.EBTimeCorrAmplitudeBinsSize = 
+                this->EBTimeCorrAmplitudeBins_.size();
+            product.EETimeCorrAmplitudeBinsSize = 
+                this->EETimeCorrAmplitudeBins_.size();
 
-        // malloc
-        cudaCheck(cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins,
-                             this->EBTimeCorrAmplitudeBins_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.EBTimeCorrShiftBins, this->EBTimeCorrShiftBins_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.EETimeCorrAmplitudeBins,
-                             this->EETimeCorrAmplitudeBins_.size() * sizeof(float)));
-        cudaCheck(cudaMalloc((void**)&product.EETimeCorrShiftBins, this->EETimeCorrShiftBins_.size() * sizeof(float)));
-        // transfer
-        cudaCheck(cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins,
-                                  this->EBTimeCorrAmplitudeBins_.data(),
-                                  this->EBTimeCorrAmplitudeBins_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EBTimeCorrShiftBins,
-                                  this->EBTimeCorrShiftBins_.data(),
-                                  this->EBTimeCorrShiftBins_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EETimeCorrAmplitudeBins,
-                                  this->EETimeCorrAmplitudeBins_.data(),
-                                  this->EETimeCorrAmplitudeBins_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.EETimeCorrShiftBins,
-                                  this->EETimeCorrShiftBins_.data(),
-                                  this->EETimeCorrShiftBins_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-      });
+            // allocate a separate device buffer of floats for each of the four bin vectors
+            cudaCheck( cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins,
+                                  this->EBTimeCorrAmplitudeBins_.size() * 
+                                  sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.EBTimeCorrShiftBins,
+                                  this->EBTimeCorrShiftBins_.size() * 
+                                  sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.EETimeCorrAmplitudeBins,
+                                  this->EETimeCorrAmplitudeBins_.size() * 
+                                  sizeof(float)) );
+            cudaCheck( cudaMalloc((void**)&product.EETimeCorrShiftBins,
+                                  this->EETimeCorrShiftBins_.size() * 
+                                  sizeof(float)) );
+            // enqueue the host-to-device copy of each vector on cudaStream
+            cudaCheck( cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins,
+                                       this->EBTimeCorrAmplitudeBins_.data(),
+                                       this->EBTimeCorrAmplitudeBins_.size() * 
+                                       sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EBTimeCorrShiftBins,
+                                       this->EBTimeCorrShiftBins_.data(),
+                                       this->EBTimeCorrShiftBins_.size() * 
+                                       sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EETimeCorrAmplitudeBins,
+                                       this->EETimeCorrAmplitudeBins_.data(),
+                                       this->EETimeCorrAmplitudeBins_.size() * 
+                                       sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.EETimeCorrShiftBins,
+                                       this->EETimeCorrShiftBins_.data(),
+                                       this->EETimeCorrShiftBins_.size() * 
+                                       sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+        }
+    );
 
-  return product;
+    return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalTimeBiasCorrectionsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
index d724a33f1d4e1..1da155b2539f2 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
@@ -3,38 +3,47 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const& values)
-    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
+EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU(
+        EcalTimeCalibConstants const& values) 
+    : valuesEB_{values.barrelItems()}
+    , valuesEE_{values.endcapItems()}
+{}
 
 EcalTimeCalibConstantsGPU::Product::~Product() {
-  // deallocation
-  cudaCheck(cudaFree(values));
+    // release the combined EB+EE device buffer that getProduct() allocates with cudaMalloc
+    cudaCheck( cudaFree(values) );
 }
 
-EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(
-      cudaStream, [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
-        // malloc
-        cudaCheck(
-            cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float)));
-
-        // offset in floats, not bytes
-        auto const offset = this->valuesEB_.size();
-
-        // transfer
-        cudaCheck(cudaMemcpyAsync(product.values,
-                                  this->valuesEB_.data(),
-                                  this->valuesEB_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-        cudaCheck(cudaMemcpyAsync(product.values + offset,
-                                  this->valuesEE_.data(),
-                                  this->valuesEE_.size() * sizeof(float),
-                                  cudaMemcpyHostToDevice,
-                                  cudaStream));
-      });
-
-  return product;
+EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct(
+        cudaStream_t cudaStream) const
+{
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+        [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
+            // allocate one device buffer of floats sized for the EB and EE constants together
+            cudaCheck( cudaMalloc((void**)&product.values,
+                                  (this->valuesEB_.size() + this->valuesEE_.size()) * 
+                                  sizeof(float)) );
+
+            // EE values go right after the EB ones; the offset counts floats, not bytes
+            auto const offset = this->valuesEB_.size();
+
+            // enqueue the two host-to-device copies on cudaStream: EB first, then EE at values + offset
+            cudaCheck( cudaMemcpyAsync(product.values,
+                                       this->valuesEB_.data(),
+                                       this->valuesEB_.size() * 
+                                       sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+            cudaCheck( cudaMemcpyAsync(product.values + offset,
+                                       this->valuesEE_.data(),
+                                       this->valuesEE_.size() * 
+                                       sizeof(float),
+                                       cudaMemcpyHostToDevice,
+                                       cudaStream) );
+        }
+    );
+
+    return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalTimeCalibConstantsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
index 79b70716a675b..b67bb74235e4a 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
@@ -18,143 +18,114 @@
 #include "cuda.h"
 
 #include "AmplitudeComputationCommonKernels.h"
-#include "AmplitudeComputationKernelsV1.h"
+#include "AmplitudeComputationKernels.h"
 #include "TimeComputationKernels.h"
 
 //#define DEBUG
 
 //#define ECAL_RECO_CUDA_DEBUG
 
-namespace ecal {
-  namespace multifit {
+namespace ecal { namespace multifit {
+   
+void entryPoint(
+        EventInputDataGPU const& eventInputGPU,
+        EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch,
+        ConditionsProducts const& conditions, 
+        ConfigurationParameters const& configParameters,
+        cudaStream_t cudaStream) {
+    using digis_type = std::vector<uint16_t>;
+    using dids_type = std::vector<uint32_t>;
+    // according to the cpu setup  //----> hardcoded
+    bool const gainSwitchUseMaxSampleEB = true;
+    // according to the cpu setup  //----> hardcoded
+    bool const gainSwitchUseMaxSampleEE = false;
+    
+    uint32_t const offsetForHashes = conditions.offsetForHashes;
+    uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
+    unsigned int totalChannels = eventInputGPU.ebDigis.ndigis +
+        eventInputGPU.eeDigis.ndigis;
+    
+    // 
+    // 1d preparation kernel
+    //
+    unsigned int nchannels_per_block = 32;
+    unsigned int threads_1d = 10 * nchannels_per_block;
+    unsigned int blocks_1d = threads_1d > 10*totalChannels 
+        ? 1 : (totalChannels*10 + threads_1d - 1) / threads_1d;
+    int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES * (
+        sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char)
+        + sizeof(bool)
+    );
+    kernel_prep_1d_and_initialize<<<blocks_1d, threads_1d, 
+                                    shared_bytes, cudaStream>>>(
+        conditions.pulseShapes.values, 
+        eventInputGPU.ebDigis.data, 
+        eventInputGPU.ebDigis.ids,
+        eventInputGPU.eeDigis.data,
+        eventInputGPU.eeDigis.ids,
+        scratch.samples,
+        (SampleVector*)eventOutputGPU.amplitudesAll,
+        scratch.gainsNoise,
+        conditions.pedestals.mean_x1,
+        conditions.pedestals.mean_x12,
+        conditions.pedestals.rms_x12,
+        conditions.pedestals.mean_x6,
+        conditions.gainRatios.gain6Over1,
+        conditions.gainRatios.gain12Over6,
+        scratch.hasSwitchToGain6,
+        scratch.hasSwitchToGain1,
+        scratch.isSaturated,
+        eventOutputGPU.amplitude,
+        eventOutputGPU.chi2,
+        eventOutputGPU.pedestal,
+        eventOutputGPU.did,
+        eventOutputGPU.flags,
+        scratch.acState,
+        scratch.activeBXs,
+        offsetForHashes,
+        offsetForInputs,
+        gainSwitchUseMaxSampleEB,
+        gainSwitchUseMaxSampleEE,
+        totalChannels);
+    cudaCheck(cudaGetLastError());
 
-    void entryPoint(EventInputDataCPU const& eventInputCPU,
-                    EventInputDataGPU& eventInputGPU,
-                    EventOutputDataGPU& eventOutputGPU,
-                    EventDataForScratchGPU& scratch,
-                    ConditionsProducts const& conditions,
-                    ConfigurationParameters const& configParameters,
-                    cudaStream_t cudaStream) {
-      using digis_type = std::vector<uint16_t>;
-      using dids_type = std::vector<uint32_t>;
-      // accodring to the cpu setup  //----> hardcoded
-      bool const gainSwitchUseMaxSampleEB = true;
-      // accodring to the cpu setup  //----> hardcoded
-      bool const gainSwitchUseMaxSampleEE = false;
+    //
+    // 2d preparation kernel
+    //
+    int blocks_2d = totalChannels;
+    dim3 threads_2d{10, 10};
+    kernel_prep_2d<<<blocks_2d, threads_2d, 0, cudaStream>>>(
+        scratch.gainsNoise,
+        eventInputGPU.ebDigis.ids,
+        eventInputGPU.eeDigis.ids,
+        conditions.pedestals.rms_x12,
+        conditions.pedestals.rms_x6,
+        conditions.pedestals.rms_x1,
+        conditions.gainRatios.gain12Over6,
+        conditions.gainRatios.gain6Over1,
+        conditions.samplesCorrelation.EBG12SamplesCorrelation,
+        conditions.samplesCorrelation.EBG6SamplesCorrelation,
+        conditions.samplesCorrelation.EBG1SamplesCorrelation,
+        conditions.samplesCorrelation.EEG12SamplesCorrelation,
+        conditions.samplesCorrelation.EEG6SamplesCorrelation,
+        conditions.samplesCorrelation.EEG1SamplesCorrelation,
+        scratch.noisecov,
+        scratch.pulse_matrix,
+        conditions.pulseShapes.values,
+        scratch.hasSwitchToGain6,
+        scratch.hasSwitchToGain1,
+        scratch.isSaturated,
+        offsetForHashes,
+        offsetForInputs);
+    cudaCheck(cudaGetLastError());
+    
+    // run minimization kernels
+    v1::minimization_procedure(
+        eventInputGPU, eventOutputGPU,
+        scratch, conditions, configParameters, cudaStream);
 
-      uint32_t const offsetForHashes = conditions.offsetForHashes;
-      unsigned int totalChannels = eventInputCPU.ebDigis.size() + eventInputCPU.eeDigis.size();
-
-      // temporary for recording
-      /*cudaEvent_t start_event;
-    cudaEvent_t end_event;
-    cudaCheck( cudaEventCreate(&start_event) );
-    cudaCheck( cudaEventCreate(&end_event) );
-
-    cudaCheck (cudaEventRecord(start_event, 0) );
-    */
-
-      //
-      // in what follows we copy eb then ee.
-      // offset by size
-      //
-
-      //
-      // copy event data: digis + ids, not really async as vectors have default
-      // allocators
-      //
-      cudaCheck(cudaMemcpyAsync(eventInputGPU.digis,
-                                eventInputCPU.ebDigis.data().data(),
-                                eventInputCPU.ebDigis.data().size() * sizeof(digis_type::value_type),
-                                cudaMemcpyHostToDevice,
-                                cudaStream));
-      cudaCheck(cudaMemcpyAsync(eventInputGPU.digis + eventInputCPU.ebDigis.data().size(),
-                                eventInputCPU.eeDigis.data().data(),
-                                eventInputCPU.eeDigis.data().size() * sizeof(digis_type::value_type),
-                                cudaMemcpyHostToDevice,
-                                cudaStream));
-
-      cudaCheck(cudaMemcpyAsync(eventInputGPU.ids,
-                                eventInputCPU.ebDigis.ids().data(),
-                                eventInputCPU.ebDigis.ids().size() * sizeof(dids_type::value_type),
-                                cudaMemcpyHostToDevice,
-                                cudaStream));
-      cudaCheck(cudaMemcpyAsync(eventInputGPU.ids + eventInputCPU.ebDigis.ids().size(),
-                                eventInputCPU.eeDigis.ids().data(),
-                                eventInputCPU.eeDigis.ids().size() * sizeof(dids_type::value_type),
-                                cudaMemcpyHostToDevice,
-                                cudaStream));
-
-      //
-      // 1d preparation kernel
-      //
-      unsigned int nchannels_per_block = 32;
-      unsigned int threads_1d = 10 * nchannels_per_block;
-      unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 1 : (totalChannels * 10 + threads_1d - 1) / threads_1d;
-      int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES *
-                         (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool));
-      kernel_prep_1d_and_initialize<<<blocks_1d, threads_1d, shared_bytes, cudaStream>>>(
-          conditions.pulseShapes.values,
-          eventInputGPU.digis,
-          eventInputGPU.ids,
-          scratch.samples,
-          (SampleVector*)eventOutputGPU.amplitudesAll,
-          scratch.gainsNoise,
-          conditions.pedestals.mean_x1,
-          conditions.pedestals.mean_x12,
-          conditions.pedestals.rms_x12,
-          conditions.pedestals.mean_x6,
-          conditions.gainRatios.gain6Over1,
-          conditions.gainRatios.gain12Over6,
-          scratch.hasSwitchToGain6,
-          scratch.hasSwitchToGain1,
-          scratch.isSaturated,
-          eventOutputGPU.amplitude,
-          eventOutputGPU.chi2,
-          eventOutputGPU.pedestal,
-          eventOutputGPU.flags,
-          scratch.acState,
-          scratch.activeBXs,
-          offsetForHashes,
-          gainSwitchUseMaxSampleEB,
-          gainSwitchUseMaxSampleEE,
-          totalChannels);
-      cudaCheck(cudaGetLastError());
-
-      //
-      // 2d preparation kernel
-      //
-      int blocks_2d = totalChannels;
-      dim3 threads_2d{10, 10};
-      kernel_prep_2d<<<blocks_2d, threads_2d, 0, cudaStream>>>(conditions.pulseCovariances.values,
-                                                               scratch.pulse_covariances,
-                                                               scratch.gainsNoise,
-                                                               eventInputGPU.ids,
-                                                               conditions.pedestals.rms_x12,
-                                                               conditions.pedestals.rms_x6,
-                                                               conditions.pedestals.rms_x1,
-                                                               conditions.gainRatios.gain12Over6,
-                                                               conditions.gainRatios.gain6Over1,
-                                                               conditions.samplesCorrelation.EBG12SamplesCorrelation,
-                                                               conditions.samplesCorrelation.EBG6SamplesCorrelation,
-                                                               conditions.samplesCorrelation.EBG1SamplesCorrelation,
-                                                               conditions.samplesCorrelation.EEG12SamplesCorrelation,
-                                                               conditions.samplesCorrelation.EEG6SamplesCorrelation,
-                                                               conditions.samplesCorrelation.EEG1SamplesCorrelation,
-                                                               scratch.noisecov,
-                                                               scratch.pulse_matrix,
-                                                               conditions.pulseShapes.values,
-                                                               scratch.hasSwitchToGain6,
-                                                               scratch.hasSwitchToGain1,
-                                                               scratch.isSaturated,
-                                                               offsetForHashes);
-      cudaCheck(cudaGetLastError());
-
-      // run minimization kernels
-      v1::minimization_procedure(
-          eventInputCPU, eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
-
-      if (configParameters.shouldRunTimingComputation) {
+    if (configParameters.shouldRunTimingComputation) {
+        
         //
         // TODO: this guy can run concurrently with other kernels,
         // there is no dependence on the order of execution
@@ -162,9 +133,12 @@ namespace ecal {
         unsigned int threads_time_init = threads_1d;
         unsigned int blocks_time_init = blocks_1d;
         int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar);
-        kernel_time_computation_init<<<blocks_time_init, threads_time_init, sharedBytesInit, cudaStream>>>(
-            eventInputGPU.digis,
-            eventInputGPU.ids,
+        kernel_time_computation_init<<<blocks_time_init, threads_time_init,
+                                       sharedBytesInit, cudaStream>>>(
+            eventInputGPU.ebDigis.data, 
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.data,
+            eventInputGPU.eeDigis.ids,
             conditions.pedestals.rms_x12,
             conditions.pedestals.rms_x6,
             conditions.pedestals.rms_x1,
@@ -179,57 +153,69 @@ namespace ecal {
             scratch.useless_sample_values,
             scratch.pedestal_nums,
             offsetForHashes,
+            offsetForInputs,
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
             conditions.sampleMask.getEcalSampleMaskRecordEE(),
-            totalChannels);
+            totalChannels
+        );
         cudaCheck(cudaGetLastError());
 
-        //
-        // TODO: small kernel only for EB. It needs to be checked if
+        // 
+        // TODO: small kernel only for EB. It needs to be checked if 
         /// fusing such small kernels is beneficial in here
         //
         // we are running only over EB digis
         // therefore we need to create threads/blocks only for that
         unsigned int const threadsFixMGPA = threads_1d;
-        unsigned int const blocksFixMGPA =
-            threadsFixMGPA > 10 * eventInputCPU.ebDigis.size()
+        unsigned int const blocksFixMGPA = 
+            threadsFixMGPA > 10 * eventInputGPU.ebDigis.ndigis
                 ? 1
-                : (10 * eventInputCPU.ebDigis.size() + threadsFixMGPA - 1) / threadsFixMGPA;
-        kernel_time_compute_fixMGPAslew<<<blocksFixMGPA, threadsFixMGPA, 0, cudaStream>>>(
-            eventInputGPU.digis,
+                : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) 
+                    / threadsFixMGPA;
+        kernel_time_compute_fixMGPAslew<<<blocksFixMGPA, threadsFixMGPA, 
+                                          0, cudaStream>>>(
+            eventInputGPU.ebDigis.data,
+            eventInputGPU.eeDigis.data,
             scratch.sample_values,
             scratch.sample_value_errors,
             scratch.useless_sample_values,
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
-            totalChannels);
+            totalChannels,
+            offsetForInputs
+        );
         cudaCheck(cudaGetLastError());
 
         //
+        // 
         //
-        //
-        int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar);
+        int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block *
+            4 * sizeof(SampleVector::Scalar);
         auto const threads_nullhypot = threads_1d;
         auto const blocks_nullhypot = blocks_1d;
-        kernel_time_compute_nullhypot<<<blocks_nullhypot, threads_nullhypot, sharedBytes, cudaStream>>>(
+        kernel_time_compute_nullhypot<<<blocks_nullhypot, threads_nullhypot, 
+                                        sharedBytes, cudaStream>>>(
             scratch.sample_values,
             scratch.sample_value_errors,
             scratch.useless_sample_values,
             scratch.chi2sNullHypot,
             scratch.sum0sNullHypot,
             scratch.sumAAsNullHypot,
-            totalChannels);
+            totalChannels
+        );
         cudaCheck(cudaGetLastError());
 
         unsigned int nchannels_per_block_makeratio = 10;
         unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
         unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels
-                                            ? 1
-                                            : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
+            ? 1
+            : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
         int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar);
-        kernel_time_compute_makeratio<<<blocks_makeratio, threads_makeratio, sharedBytesMakeRatio, cudaStream>>>(
+        kernel_time_compute_makeratio<<<blocks_makeratio, threads_makeratio,
+                                        sharedBytesMakeRatio, cudaStream>>>(
             scratch.sample_values,
             scratch.sample_value_errors,
-            eventInputGPU.ids,
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.ids,
             scratch.useless_sample_values,
             scratch.pedestal_nums,
             configParameters.amplitudeFitParametersEB,
@@ -243,13 +229,15 @@ namespace ecal {
             scratch.accTimeMax,
             scratch.accTimeWgt,
             scratch.tcState,
-            configParameters.timeFitParametersSizeEB,
+            configParameters.timeFitParametersSizeEB, 
             configParameters.timeFitParametersSizeEE,
             configParameters.timeFitLimitsFirstEB,
             configParameters.timeFitLimitsFirstEE,
             configParameters.timeFitLimitsSecondEB,
             configParameters.timeFitLimitsSecondEE,
-            totalChannels);
+            totalChannels,
+            offsetForInputs
+        );
         cudaCheck(cudaGetLastError());
 
         //
@@ -257,41 +245,48 @@ namespace ecal {
         //
         auto const threads_findamplchi2 = threads_1d;
         auto const blocks_findamplchi2 = blocks_1d;
-        int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar);
+        int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * 
+            sizeof(SampleVector::Scalar);
         kernel_time_compute_findamplchi2_and_finish<<<blocks_findamplchi2,
-                                                      threads_findamplchi2,
-                                                      sharedBytesFindAmplChi2,
-                                                      cudaStream>>>(scratch.sample_values,
-                                                                    scratch.sample_value_errors,
-                                                                    eventInputGPU.ids,
-                                                                    scratch.useless_sample_values,
-                                                                    scratch.tMaxAlphaBetas,
-                                                                    scratch.tMaxErrorAlphaBetas,
-                                                                    scratch.accTimeMax,
-                                                                    scratch.accTimeWgt,
-                                                                    configParameters.amplitudeFitParametersEB,
-                                                                    configParameters.amplitudeFitParametersEE,
-                                                                    scratch.sumAAsNullHypot,
-                                                                    scratch.sum0sNullHypot,
-                                                                    scratch.chi2sNullHypot,
-                                                                    scratch.tcState,
-                                                                    scratch.ampMaxAlphaBeta,
-                                                                    scratch.ampMaxError,
-                                                                    scratch.timeMax,
-                                                                    scratch.timeError,
-                                                                    totalChannels);
+                                           threads_findamplchi2,
+                                           sharedBytesFindAmplChi2, cudaStream>>>(
+            scratch.sample_values,
+            scratch.sample_value_errors,
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.ids,
+            scratch.useless_sample_values,
+            scratch.tMaxAlphaBetas,
+            scratch.tMaxErrorAlphaBetas,
+            scratch.accTimeMax,
+            scratch.accTimeWgt,
+            configParameters.amplitudeFitParametersEB,
+            configParameters.amplitudeFitParametersEE,
+            scratch.sumAAsNullHypot,
+            scratch.sum0sNullHypot,
+            scratch.chi2sNullHypot,
+            scratch.tcState,
+            scratch.ampMaxAlphaBeta,
+            scratch.ampMaxError,
+            scratch.timeMax,
+            scratch.timeError,
+            totalChannels,
+            offsetForInputs
+        );
         cudaCheck(cudaGetLastError());
-
+        
         //
         //
         //
         auto const threads_timecorr = 32;
-        auto const blocks_timecorr =
-            threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr;
-        kernel_time_correction_and_finalize<<<blocks_timecorr, threads_timecorr, 0, cudaStream>>>(
+        auto const blocks_timecorr = threads_timecorr > totalChannels
+            ? 1 : (totalChannels + threads_timecorr-1) / threads_timecorr;
+        kernel_time_correction_and_finalize<<<blocks_timecorr, threads_timecorr,
+                                              0, cudaStream>>>(
             eventOutputGPU.amplitude,
-            eventInputGPU.digis,
-            eventInputGPU.ids,
+            eventInputGPU.ebDigis.data,
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.data,
+            eventInputGPU.eeDigis.ids,
             conditions.timeBiasCorrections.EBTimeCorrAmplitudeBins,
             conditions.timeBiasCorrections.EETimeCorrAmplitudeBins,
             conditions.timeBiasCorrections.EBTimeCorrShiftBins,
@@ -322,18 +317,19 @@ namespace ecal {
             configParameters.outOfTimeThreshG61mEB,
             configParameters.outOfTimeThreshG61mEE,
             offsetForHashes,
-            totalChannels);
+            offsetForInputs,
+            totalChannels
+        );
         cudaCheck(cudaGetLastError());
-      }
+    }
 
-      /*
+        /*
     cudaEventRecord(end_event, 0);
     cudaEventSynchronize(end_event);
     float ms;
     cudaEventElapsedTime(&ms, start_event, end_event);
     std::cout << "elapsed time = " << ms << std::endl;
     */
-    }
+}
 
-  }  // namespace multifit
-}  // namespace ecal
+}}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
index b85f002464f65..6b60f4fc35560 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
@@ -3,74 +3,88 @@
 #include "DataFormats/EcalDetId/interface/EBDetId.h"
 #include "DataFormats/EcalDetId/interface/EEDetId.h"
 
-namespace ecal {
-  namespace multifit {
-
-    namespace internal {
-
-      namespace barrel {
-
-        __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; }
-
-        __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; }
-
-        __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; }
-
-      }  // namespace barrel
-
-    }  // namespace internal
-
-    __device__ uint32_t hashedIndexEB(uint32_t id) {
-      using namespace internal::barrel;
-      return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
-    }
-
-    namespace internal {
-
-      namespace endcap {
-
-        __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; }
-
-        __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; }
-
-        __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; }
-
-        // these constants come from EE Det Id
-        __constant__ const unsigned short kxf[] = {
-            41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21,
-            51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51,
-            6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,
-            51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62,
-            1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60, 1,  59, 1,  58, 4,  56, 4,  51, 4,
-            51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51,
-            9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21,
-            51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
-
-        __constant__ const unsigned short kdi[] = {
-            0,    10,   20,   30,   40,   50,   60,   75,   90,   105,  120,  145,  170,  195,  220,  245,  270,
-            300,  330,  360,  390,  420,  450,  480,  510,  540,  570,  605,  640,  675,  710,  747,  784,  821,
-            858,  895,  932,  969,  1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500,
-            1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265,
-            2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030,
-            3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701,
-            3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378,
-            4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149,
-            5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908,
-            5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577,
-            6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104,
-            7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
-
-      }  // namespace endcap
-
-    }  // namespace internal
-
-    __device__ uint32_t hashedIndexEE(uint32_t id) {
-      using namespace internal::endcap;
-
-      const uint32_t jx(ix(id));
-      const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50);
-      return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]);
-    }
-
-  }  // namespace multifit
-}  // namespace ecal
+namespace ecal { namespace multifit {
+
+namespace internal {
+
+// bit-field readers for a raw 32-bit barrel id
+namespace barrel {
+
+// true when bit 16 (mask 0x10000) of the id is set
+__device__ __forceinline__
+bool positiveZ(uint32_t id) { return (id & 0x10000u) != 0u; }
+
+// 7-bit field occupying bits 9..15 of the id
+__device__ __forceinline__
+uint32_t ietaAbs(uint32_t id) { return (id & 0xFE00u) >> 9; }
+
+// lowest 9 bits of the id
+__device__ __forceinline__
+uint32_t iphi(uint32_t id) { return id & 0x1FFu; }
+
+}  // namespace barrel
+}  // namespace internal
+
+// Dense EB index: (row derived from the sign bit and |ieta|) * MAX_IPHI + iphi - 1.
+__device__ uint32_t hashedIndexEB(uint32_t id) {
+    using namespace internal::barrel;
+    uint32_t const etaRow = positiveZ(id) ? EBDetId::MAX_IETA + ietaAbs(id) - 1
+                                          : EBDetId::MAX_IETA - ietaAbs(id);
+    return etaRow * EBDetId::MAX_IPHI + iphi(id) - 1;
+}
+
+namespace internal {
+
+namespace endcap {
+
+// 7-bit field occupying bits 7..13 of the raw id
+__device__ __forceinline__
+uint32_t ix(uint32_t id) { return (id & 0x3F80u) >> 7; }
+
+// lowest 7 bits of the raw id
+__device__ __forceinline__
+uint32_t iy(uint32_t id) { return id & 0x7Fu; }
+
+// true when bit 14 (mask 0x4000) of the id is set
+__device__ __forceinline__
+bool positiveZ(uint32_t id) { return (id & 0x4000u) != 0u; }
+
+// these constants come from EE Det Id; hashedIndexEE indexes them with jd = 2*(iy-1) + (ix-1)/50
+__constant__  // kxf[jd] is the value hashedIndexEE subtracts from ix
+const unsigned short kxf[] = {
+  41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 16, 51, 16,
+  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 6,  51, 6,  51, 6,  51, 6,  51,
+  6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,  51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,
+  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60,
+  1,  59, 1,  58, 4,  56, 4,  51, 4,  51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,
+  51, 6,  51, 6,  51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51,
+  21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
+
+__constant__  // kdi[jd] is the offset hashedIndexEE adds for table entry jd = 2*(iy-1) + (ix-1)/50
+const unsigned short kdi[] = {
+  0,    10,   20,   30,   40,   50,   60,   75,   90,   105,  120,  145,  170,  195,  220,  245,  270,  300,  330,
+  360,  390,  420,  450,  480,  510,  540,  570,  605,  640,  675,  710,  747,  784,  821,  858,  895,  932,  969,
+  1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, 1545, 1590, 1635, 1680, 1725, 1770,
+  1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635,
+  2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428,
+  3467, 3506, 3545, 3584, 3623, 3662, 3701, 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172,
+  4212, 4253, 4294, 4336, 4378, 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014,
+  5059, 5104, 5149, 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866,
+  5908, 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, 6614,
+  6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, 7129, 7154, 7179,
+  7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
+
+}
+
+}
+
+// Dense EE index: side offset + kdi[row] + ix - kxf[row], with row = 2*(iy-1) + (ix-1)/50.
+__device__ uint32_t hashedIndexEE(uint32_t id) {
+    using namespace internal::endcap;
+    uint32_t const x = ix(id);
+    uint32_t const row = 2 * (iy(id) - 1) + (x - 1) / 50;
+    uint32_t const sideOffset = positiveZ(id) ? EEDetId::kEEhalf : 0;
+    return sideOffset + kdi[row] + x - kxf[row];
+}
+
+}}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
index b148ab91915d1..888bdc103b0d4 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
@@ -1,14 +1,464 @@
 #ifndef RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
 #define RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
 
-namespace ecal {
-  namespace multifit {
+#include <Eigen/Dense>
 
-    __device__ uint32_t hashedIndexEB(uint32_t id);
+namespace ecal { namespace multifit {
 
-    __device__ uint32_t hashedIndexEE(uint32_t id);
+template<int NROWS, int NCOLS>  // fixed-size NROWS x NCOLS float matrix, Eigen::ColMajor storage order
+using ColMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::ColMajor>;
 
-  }  // namespace multifit
-}  // namespace ecal
+template<int NROWS, int NCOLS>  // fixed-size NROWS x NCOLS float matrix, Eigen::RowMajor storage order
+using RowMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::RowMajor>;
 
-#endif  // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
+template<int SIZE, typename T = float>  // SIZE x 1 matrix of T (float unless stated otherwise)
+using ColumnVector = Eigen::Matrix<T, SIZE, 1>;
+
+template<int SIZE, typename T = float>  // 1 x SIZE matrix of T (float unless stated otherwise)
+using RowVector = Eigen::Matrix<T, 1, SIZE>;
+
+__device__  // declaration only: maps a raw 32-bit id to an index; defined in KernelHelpers.cu
+uint32_t hashedIndexEB(uint32_t id);
+
+__device__  // declaration only: maps a raw 32-bit id to an index; defined in KernelHelpers.cu
+uint32_t hashedIndexEE(uint32_t id);
+
+
+// Non-owning view that addresses a packed buffer of Stride * (Stride + 1) / 2
+// elements as a Stride x Stride symmetric matrix; only positions with
+// row >= col have a storage slot of their own.
+// FIXME: provide specialization for Row Major layout
+template <typename T, int Stride, int Order = Eigen::ColMajor>
+struct MapSymM {
+    using type = T;
+    using base_type = typename std::remove_const<type>::type;
+
+    // element count of the packed storage and the matrix dimension
+    static constexpr int total = Stride * (Stride + 1) / 2;
+    static constexpr int stride = Stride;
+    T* data;
+
+    __forceinline__ __device__
+    MapSymM(T *data) : data{data} {}
+
+    // Read-only access. Column c of the packed storage begins at
+    // total - (Stride - c) * (Stride - c + 1) / 2, diagonal element first.
+    __forceinline__ __device__
+    T const& operator()(int const row, int const col) const {
+        int const remaining = Stride - col;
+        return data[total - remaining * (remaining + 1) / 2 + row - col];
+    }
+
+    // Writable access; with the default U this overload exists only for non-const T.
+    template<typename U = T>
+    __forceinline__ __device__
+    typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
+    operator()(int const row, int const col) {
+        int const remaining = Stride - col;
+        return data[total - remaining * (remaining + 1) / 2 + row - col];
+    }
+};
+
+// FIXME: either use/modify/improve eigen or make this more generic
+// Read-only 2d view of a per-channel pulse buffer: element (row, col) is
+// data[2 - col + row], and a negative position reads as 0, so the indexing
+// stays hidden from the caller.
+template <typename T>
+struct MapMForPM {
+    using type = T;
+    using base_type = typename std::remove_cv<type>::type;
+
+    type* data;
+    __forceinline__ __device__
+    MapMForPM(type* data) : data{data} {}
+
+    __forceinline__ __device__
+    base_type operator()(int const row, int const col) const {
+        int const shifted = row - col + 2;
+        // positions before the start of the buffer contribute nothing
+        if (shifted < 0) return 0;
+        return data[shifted];
+    }
+};
+
+// Simple Cholesky factorisation (M = L * L^T) over the full compile-time
+// dimension MatrixType1::stride; only entries of L with row >= col are written.
+template<typename MatrixType1, typename MatrixType2>
+__forceinline__ __device__ 
+void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
+    using Scalar = typename MatrixType1::base_type;
+
+    // the first diagonal entry has no earlier columns to subtract
+    L(0, 0) = std::sqrt(M(0, 0));
+
+    #pragma unroll
+    for (int row = 1; row < MatrixType1::stride; ++row) {
+        Scalar diagAcc{0};  // sum of squares of L(row, 0..row-1)
+        for (int col = 0; col < row; ++col) {
+            Scalar dot{0};
+            auto const target = M(row, col);
+            for (int k = 0; k < col; ++k)
+                dot += L(row, k) * L(col, k);
+
+            auto const entry = (target - dot) / L(col, col);
+            L(row, col) = entry;
+            diagAcc += entry * entry;
+        }
+        // whatever is left of M(row, row) ends up on the diagonal
+        L(row, row) = std::sqrt(M(row, row) - diagAcc);
+    }
+}
+
+// Cholesky factorisation of the leading N x N part of M into the lower triangle of L.
+template<typename MatrixType1, typename MatrixType2>
+__forceinline__ __device__ 
+void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
+    using Scalar = typename MatrixType1::base_type;
+    L(0, 0) = std::sqrt(M(0, 0));
+
+    for (int row = 1; row < N; ++row) {
+        // running sum of squares of the off-diagonal entries of this row
+        Scalar rowNorm2{0};
+        for (int col = 0; col < row; ++col) {
+            auto const target = M(row, col);
+            // product of rows `row` and `col` of L over the columns already filled
+            Scalar overlap{0};
+            for (int k = 0; k < col; ++k)
+                overlap += L(row, k) * L(col, k);
+
+            auto const entry = (target - overlap) / L(col, col);
+            L(row, col) = entry;
+            rowNorm2 += entry * entry;
+        }
+
+        L(row, row) = std::sqrt(M(row, row) - rowNorm2);
+    }
+}
+// Factorises the N x N sub-problem selected by pulseOffsets into L and, in the same pass, forward-substitutes Atb into b (L * b = Atb).
+template<typename MatrixType1, typename MatrixType2, typename VectorType>
+__forceinline__ __device__ 
+void compute_decomposition_forwardsubst_with_offsets(
+        MatrixType1& L, MatrixType2 const& M,
+        float b[MatrixType1::stride],
+        VectorType const& Atb,
+        int const N,
+        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
+    auto const real_0 = pulseOffsets(0);  // position of entry 0 inside M / Atb
+    auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0));
+    L(0, 0) = sqrtm_0_0;
+    using T = typename MatrixType1::base_type;
+    b[0] = Atb(real_0) / sqrtm_0_0; 
+    // remaining rows: L and b are indexed 0..N-1 directly, M and Atb through pulseOffsets
+    for (int i=1; i<N; i++) {
+        auto const i_real = pulseOffsets(i);
+        T sumsq{0};  // sum of squares of L(i, 0..i-1)
+        T total = 0;  // sum of L(i, j) * b[j], used for b[i] below
+        auto const atb = Atb(i_real);
+        for (int j=0; j<i; j++) {
+            auto const j_real = pulseOffsets(j);
+            T sumsq2{0};
+            auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));  // M is always read with row >= col
+            for (int k=0; k<j; ++k)
+                sumsq2 += L(i, k) * L(j, k);
+
+            auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+            L(i, j) = value_i_j;
+
+            sumsq += value_i_j * value_i_j;
+            total += value_i_j * b[j];
+        }
+
+        auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
+        L(i, i) = l_i_i;
+        b[i] = (atb - total) / l_i_i;
+    }
+}
+// Extends an existing factorisation by one row: computes only row N-1 of L and b[N-1], reading rows 0..N-2 of L and b as already filled.
+template<typename MatrixType1, typename MatrixType2, typename VectorType>
+__forceinline__ __device__ 
+void update_decomposition_forwardsubst_with_offsets(
+        MatrixType1& L, MatrixType2 const& M,
+        float b[MatrixType1::stride],
+        VectorType const& Atb,
+        int const N,
+        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
+    using T = typename MatrixType1::base_type;
+    auto const i = N-1;  // the only row written by this call
+    auto const i_real = pulseOffsets(i);  // position of that row inside M / Atb
+    T sumsq {0};  // sum of squares of L(i, 0..i-1)
+    T total = 0;  // sum of L(i, j) * b[j], used for b[i] below
+    for (int j=0; j<i; j++) {
+        auto const j_real = pulseOffsets(j);
+        T sumsq2{0};
+        auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));  // M is always read with row >= col
+        for (int k=0; k<j; ++k)
+            sumsq2 += L(i, k) * L(j, k);
+
+        auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+        L(i, j) = value_i_j;
+        sumsq += value_i_j * value_i_j;
+
+        total += value_i_j * b[j];
+    }
+
+    auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
+    L(i, i) = l_i_i;
+    b[i] = (Atb(i_real) - total) / l_i_i;
+}
+// Column by column, solves matrixL * A(:, icol) = pulseMatrixView(:, icol) by forward substitution and stores the result in A.
+template<typename MatrixType1, typename MatrixType2, typename MatrixType3>
+__device__
+void solve_forward_subst_matrix(
+        MatrixType1 &A, 
+        MatrixType2 const& pulseMatrixView, 
+        MatrixType3 const& matrixL) {
+    // FIXME: this assumes pulses are on columns and samples on rows
+    constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
+    constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
+
+    #pragma unroll
+    for (int icol=0; icol<NPULSES; icol++) {
+        float reg_b[NSAMPLES];  // right-hand side of the current column, updated in place
+        float reg_L[NSAMPLES];  // the column of matrixL currently in use
+
+        // preload a column of the pulse matrix (via __ldg) and load column 0 of cholesky
+        #pragma unroll
+        for (int i=0; i<NSAMPLES; i++) {
+            reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
+            reg_L[i] = matrixL(i, 0);
+        }
+
+        // compute x0 and store it
+        auto x_prev = reg_b[0] / reg_L[0];
+        A(0, icol) = x_prev;
+
+        // iterate over the remaining rows, eliminating one solved unknown per step
+        #pragma unroll
+        for (int iL=1; iL<NSAMPLES; iL++) {
+            // subtract the contribution of the previously solved unknown from the pending rows
+            #pragma unroll
+            for (int counter=iL; counter<NSAMPLES; counter++)
+                reg_b[counter] -= x_prev * reg_L[counter];
+
+            // load the next column of cholesky
+            #pragma unroll
+            for (int counter=iL; counter<NSAMPLES; counter++)
+                reg_L[counter] = matrixL(counter, iL);
+
+            // compute the next x, i.e. the value of A(iL, icol)
+            x_prev = reg_b[iL] / reg_L[iL];
+
+            // store the result value
+            A(iL, icol) = x_prev;
+        }
+    }
+}
+// Solves matrixL * x = inputAmplitudesView by forward substitution and writes x into reg_b.
+template<typename MatrixType1, typename MatrixType2>
+__device__
+void solve_forward_subst_vector(
+        float reg_b[MatrixType1::RowsAtCompileTime], 
+        MatrixType1 inputAmplitudesView, 
+        MatrixType2 matrixL) {
+    constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
+
+    float reg_b_tmp[NSAMPLES];  // working copy of the right-hand side, updated in place
+    float reg_L[NSAMPLES];  // the column of matrixL currently in use
+
+    // preload the input vector and load column 0 of cholesky
+    #pragma unroll
+    for (int i=0; i<NSAMPLES; i++) {
+        reg_b_tmp[i] = inputAmplitudesView(i);
+        reg_L[i] = matrixL(i, 0);
+    }
+
+    // compute x0 and store it
+    auto x_prev = reg_b_tmp[0] / reg_L[0];
+    reg_b[0] = x_prev;
+
+    // iterate over the remaining rows, eliminating one solved unknown per step
+    #pragma unroll
+    for (int iL=1; iL<NSAMPLES; iL++) {
+        // subtract the contribution of the previously solved unknown from the pending rows
+        #pragma unroll
+        for (int counter=iL; counter<NSAMPLES; counter++)
+            reg_b_tmp[counter] -= x_prev * reg_L[counter];
+
+        // load the next column of cholesky
+        #pragma unroll
+        for (int counter=iL; counter<NSAMPLES; counter++)
+            reg_L[counter] = matrixL(counter, iL);
+
+        // compute the next x, i.e. the value of reg_b[iL]
+        x_prev = reg_b_tmp[iL] / reg_L[iL];
+
+        // store the result value
+        reg_b[iL] = x_prev;
+    }
+}
+// Active-set least-squares solve on (AtA, Atb) that keeps solution entries non-negative; pulseOffsets is permuted in place so its first npassive entries name the unconstrained pulses.
+// TODO: add active bxs
+template<typename MatrixType, typename VectorType>
+__device__
+void fnnls(
+        MatrixType const& AtA,  // read as AtA(row, col) with row >= col only
+        VectorType const& Atb,
+        VectorType& solution,  // in/out: updated in place
+        int& npassive,  // in/out: number of leading pulseOffsets entries in the passive set
+        ColumnVector<VectorType::RowsAtCompileTime, int> &pulseOffsets,
+        MapSymM<float, VectorType::RowsAtCompileTime> &matrixL,  // caller-provided storage for the decomposition
+        double const eps,  // convergence threshold on the largest gradient entry
+        int const maxIterations) {
+    // constants
+    constexpr auto NPULSES = VectorType::RowsAtCompileTime;
+
+    // to keep track of where to terminate if converged
+    Eigen::Index w_max_idx_prev = 0;
+    float w_max_prev = 0;
+    auto eps_to_use = eps;
+    bool recompute = false;
+
+    // used throughout
+    VectorType s;
+    float reg_b[NPULSES];
+    //float matrixLStorage[MapSymM<float, NPULSES>::total];
+    //MapSymM<float, NPULSES> matrixL{matrixLStorage};
+
+    int iter = 0;
+    while (true) {
+        if (iter > 0 || npassive==0) {
+            auto const nactive = NPULSES - npassive;
+            // exit if there are no more pulses to constrain
+            if (nactive==0) break;
+
+            // compute the gradient
+            //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
+            Eigen::Index w_max_idx = 0;  // defined even if no candidate passes the w > w_max test (e.g. every w is NaN)
+            float w_max = -std::numeric_limits<float>::max();
+            for (int icol=npassive; icol<NPULSES; icol++) {
+                auto const icol_real = pulseOffsets(icol);
+                auto const atb = Atb(icol_real);
+                float sum = 0;
+                #pragma unroll
+                for (int counter=0; counter<NPULSES; counter++)
+                    sum += counter > icol_real
+                        ? AtA(counter, icol_real) * solution(counter)
+                        : AtA(icol_real, counter) * solution(counter);
+
+                auto const w = atb - sum;
+                if (w > w_max) {
+                    w_max = w;
+                    w_max_idx = icol - npassive;
+                }
+            }
+
+            // check for convergence: gradient below threshold, or the same candidate as last time
+            if (w_max<eps_to_use || 
+                (w_max_idx==w_max_idx_prev && w_max==w_max_prev))
+                break;
+
+            if (iter >= maxIterations) break;
+
+            w_max_prev = w_max;
+            w_max_idx_prev = w_max_idx;
+
+            // move index to the right part of the vector
+            w_max_idx += npassive;
+
+            Eigen::numext::swap(pulseOffsets.coeffRef(npassive),
+                pulseOffsets.coeffRef(w_max_idx));
+            ++npassive;
+        }
+
+        // inner loop
+        while (true) {
+            if (npassive == 0) break;
+
+            //s.head(npassive)
+            //auto const& matrixL = 
+            //    AtA.topLeftCorner(npassive, npassive)
+            //        .llt().matrixL();
+            //.solve(Atb.head(npassive));
+            if (recompute || iter==0)
+                compute_decomposition_forwardsubst_with_offsets(
+                    matrixL, AtA, reg_b, Atb, 
+                    npassive, pulseOffsets);
+            else
+                update_decomposition_forwardsubst_with_offsets(
+                    matrixL, AtA, reg_b, Atb,
+                    npassive, pulseOffsets);
+
+            // run backward substitution
+            s(npassive-1) = reg_b[npassive-1] / matrixL(npassive-1, npassive-1);
+            for (int i=npassive-2; i>=0; --i) {
+                float total=0;
+                for (int j=i+1; j<npassive; j++)
+                    total += matrixL(j, i) * s(j);
+
+                s(i) = (reg_b[i] - total) / matrixL(i, i);
+            }
+
+            // done if solution values are all positive
+            bool hasNegative = false;
+            bool hasNans = false;
+            for (int counter=0; counter<npassive; counter++) {
+                auto const s_ii = s(counter);
+                hasNegative |= s_ii <= 0;
+                hasNans |= isnan(s_ii);
+            }
+
+            // FIXME: temporary solution. my cholesky impl is unstable yielding nans
+            // this check removes nans - do not accept solution unless all values 
+            // are stable
+            if (hasNans) break;
+            if (!hasNegative) {
+                for (int i=0; i<npassive; i++) {
+                    auto const i_real = pulseOffsets(i);
+                    solution(i_real) = s(i);
+                }
+                //solution.head(npassive) = s.head(npassive);
+                recompute = false;
+                break;
+            }
+
+            // there were negative values -> have to recompute the whole decomp
+            recompute = true;
+
+            auto alpha = std::numeric_limits<float>::max();
+            Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
+            for (int i=0; i<npassive; i++) {
+                if (s[i] <= 0.f) {
+                    auto const i_real = pulseOffsets(i);
+                    auto const ratio = solution[i_real] / (solution[i_real] - s[i]);
+                    if (ratio < alpha) {
+                        alpha = ratio;
+                        alpha_idx = i;
+                        alpha_idx_real = i_real;
+                    }
+                }
+            }
+
+            // update solution
+            for (int i=0; i<npassive; i++) {
+                auto const i_real = pulseOffsets(i);
+                solution(i_real) += alpha * (s(i) - solution(i_real));
+            }
+            //solution.head(npassive) += alpha * 
+            //    (s.head(npassive) - solution.head(npassive));
+            solution[alpha_idx_real] = 0;
+            --npassive;
+
+            Eigen::numext::swap(pulseOffsets.coeffRef(npassive),
+                pulseOffsets.coeffRef(alpha_idx));
+        }
+
+        // as in cpu: the threshold is doubled every 16 iterations
+        ++iter;
+        if (iter % 16 == 0)
+            eps_to_use *= 2;
+    }
+}
+
+}}
+
+#endif // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
diff --git a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
index 3726ea43d95db..4c538a2e352ad 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
@@ -17,45 +17,49 @@
 
 //#define ECAL_RECO_CUDA_DEBUG
 
-namespace ecal {
-  namespace multifit {
-
-    __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) {
-      return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
-    }
-
-    __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
-                                                  SampleVector::Scalar const* sample_value_errors,
-                                                  bool const* useless_sample_values,
-                                                  SampleVector::Scalar* chi2s,
-                                                  SampleVector::Scalar* sum0s,
-                                                  SampleVector::Scalar* sumAAs,
-                                                  int const nchannels) {
-      using ScalarType = SampleVector::Scalar;
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int tx = threadIdx.x + blockDim.x * blockIdx.x;
-      int ltx = threadIdx.x;
-      int ch = tx / nsamples;
-      int nchannels_per_block = blockDim.x / nsamples;
-
-      // TODO: make sure that this branch plays nicely with __syncthreads inside
-      // can there be a deadlock even if the thread is inactive
-      if (ch < nchannels) {
-        //
+namespace ecal { namespace multifit {
+
+__device__  // Device helper: tests whether the bit assigned to `sample` is set in `sample_mask`.
+__forceinline__
+bool use_sample(unsigned int sample_mask, unsigned int sample) {  // returns true when the selected bit is non-zero
+    return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));  // sample 0 -> bit MAXSAMPLES-1, each later sample one bit lower
+}
+
+__global__
+void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
+                                   SampleVector::Scalar const* sample_value_errors,
+                                   bool const* useless_sample_values,
+                                   SampleVector::Scalar* chi2s,
+                                   SampleVector::Scalar* sum0s,
+                                   SampleVector::Scalar* sumAAs,
+                                   int const nchannels) {
+    using ScalarType = SampleVector::Scalar;
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices
+    int tx = threadIdx.x + blockDim.x*blockIdx.x;
+    int ltx = threadIdx.x;
+    int ch = tx / nsamples;
+    int nchannels_per_block = blockDim.x / nsamples;
+
+    // TODO: make sure that this branch plays nicely with __syncthreads inside
+    // can there be a deadlock even if the thread is inactive
+    if (ch < nchannels) {
+        // 
         int sample = tx % nsamples;
 
         // shared mem inits
         extern __shared__ char sdata[];
         char* s_sum0 = sdata;
-        SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(s_sum0 + nchannels_per_block * nsamples);
-        SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples;
-        SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples;
+        SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(
+            s_sum0 + nchannels_per_block*nsamples);
+        SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block*nsamples;
+        SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block*nsamples;
 
         // TODO make sure no div by 0
-        auto const inv_error =
-            useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
+        auto const inv_error = useless_sample_values[tx] 
+            ? 0.0 
+            : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
         auto const sample_value = sample_values[tx];
         s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
         s_sum1[ltx] = inv_error;
@@ -64,190 +68,215 @@ namespace ecal {
         __syncthreads();
 
         // 5 threads for [0, 4] samples
-        if (sample < 5) {
-          s_sum0[ltx] += s_sum0[ltx + 5];
-          s_sum1[ltx] += s_sum1[ltx + 5];
-          s_sumA[ltx] += s_sumA[ltx + 5];
-          s_sumAA[ltx] += s_sumAA[ltx + 5];
+        if (sample<5) {
+            s_sum0[ltx] += s_sum0[ltx+5];
+            s_sum1[ltx] += s_sum1[ltx+5];
+            s_sumA[ltx] += s_sumA[ltx+5];
+            s_sumAA[ltx] += s_sumAA[ltx+5];
         }
         __syncthreads();
 
-        if (sample < 2) {
-          // note double counting of sample 3
-          s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
-          s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
-          s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
-          s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
+        if (sample<2) {
+            // note double counting of sample 3
+            s_sum0[ltx] += s_sum0[ltx+2] + s_sum0[ltx+3];
+            s_sum1[ltx] += s_sum1[ltx+2] + s_sum1[ltx+3];
+            s_sumA[ltx] += s_sumA[ltx+2] + s_sumA[ltx+3];
+            s_sumAA[ltx] += s_sumAA[ltx+2] + s_sumAA[ltx+3];
         }
         __syncthreads();
 
         if (sample == 0) {
-          // note, subtract to remove the double counting of sample == 3
-          //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3];
-          //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3];
-          //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3];
-          //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3];
-          auto const sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
-          auto const sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
-          auto const sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
-          auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
-          auto const chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
-          chi2s[ch] = chi2;
-          sum0s[ch] = sum0;
-          sumAAs[ch] = sumAA;
+            // note, subtract to remove the double counting of sample == 3
+            //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3];
+            //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3];
+            //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3];
+            //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3];
+            auto const sum0 = s_sum0[ltx] + s_sum0[ltx+1] - s_sum0[ltx+3];
+            auto const sum1 = s_sum1[ltx] + s_sum1[ltx+1] - s_sum1[ltx+3];
+            auto const sumA = s_sumA[ltx] + s_sumA[ltx+1] - s_sumA[ltx+3];
+            auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx+1] - s_sumAA[ltx+3];
+            auto const chi2 = sum0>0 
+                ? (sumAA - sumA * sumA / sum1) / sum0
+                : static_cast<ScalarType>(0);
+            chi2s[ch] = chi2;
+            sum0s[ch] = sum0;
+            sumAAs[ch] = sumAA;
 
 #ifdef DEBUG_TC_NULLHYPOT
-          if (ch == 0) {
-            printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
-          }
+            if (ch == 0) {
+                printf("chi2 = %f sum0 = %d sumAA = %f\n",
+                    chi2, static_cast<int>(sum0), sumAA);
+            }
 #endif
         }
-      }
     }
-
-    constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
-    constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
-
-    //#define DEBUG_TC_MAKERATIO
-    //
-    // launch ctx parameters are
-    // 45 threads per channel, X channels per block, Y blocks
-    // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
-    // TODO: it might be much beter to use 32 threads per channel instead of 45
-    // to simplify the synchronization
-    //
-    __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
-                                                  SampleVector::Scalar const* sample_value_errors,
-                                                  uint32_t const* dids,
-                                                  bool const* useless_sample_values,
-                                                  char const* pedestal_nums,
-                                                  ConfigurationParameters::type const* amplitudeFitParametersEB,
-                                                  ConfigurationParameters::type const* amplitudeFitParametersEE,
-                                                  ConfigurationParameters::type const* timeFitParametersEB,
-                                                  ConfigurationParameters::type const* timeFitParametersEE,
-                                                  SampleVector::Scalar const* sumAAsNullHypot,
-                                                  SampleVector::Scalar const* sum0sNullHypot,
-                                                  SampleVector::Scalar* tMaxAlphaBetas,
-                                                  SampleVector::Scalar* tMaxErrorAlphaBetas,
-                                                  SampleVector::Scalar* g_accTimeMax,
-                                                  SampleVector::Scalar* g_accTimeWgt,
-                                                  TimeComputationState* g_state,
-                                                  unsigned int const timeFitParameters_sizeEB,
-                                                  unsigned int const timeFitParameters_sizeEE,
-                                                  ConfigurationParameters::type const timeFitLimits_firstEB,
-                                                  ConfigurationParameters::type const timeFitLimits_firstEE,
-                                                  ConfigurationParameters::type const timeFitLimits_secondEB,
-                                                  ConfigurationParameters::type const timeFitLimits_secondEE,
-                                                  int const nchannels) {
-      using ScalarType = SampleVector::Scalar;
-
-      // constants
-      constexpr int nthreads_per_channel = 45;  // n=10, n(n-1)/2
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int const gtx = threadIdx.x + blockDim.x * blockIdx.x;
-      int const ch = gtx / nthreads_per_channel;
-      int const lch = threadIdx.x / nthreads_per_channel;
-      int const ltx = threadIdx.x % nthreads_per_channel;
-      int const ch_start = ch * nsamples;
-      int const lch_start = lch * nthreads_per_channel;
-      int const nchannels_per_block = blockDim.x / nthreads_per_channel;
-
-      // rmeove inactive threads
-      // TODO: need to understand if this is 100% safe in presence of syncthreads
-      if (ch >= nchannels)
-        return;
-
-      auto const did = DetId{dids[ch]};
-      auto const isBarrel = did.subdetId() == EcalBarrel;
-      auto const* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
-      auto const* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE;
-      auto const timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE;
-      auto const timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
-      auto const timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
-
-      extern __shared__ char smem[];
-      ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
-      ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
-      ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
-      ScalarType* shrTimeMax = shr_time_max + blockDim.x;
-      ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
-
-      // map tx -> (sample_i, sample_j)
-      int sample_i, sample_j = 0;
-      if (ltx >= 0 && ltx <= 8) {
+}
+
+constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }  // thin wrapper forwarding to unsafe_expf<6>; NOTE(review): meaning of the template argument 6 is defined by unsafe_expf - confirm there
+constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }  // thin wrapper forwarding to unsafe_logf<7>; NOTE(review): meaning of the template argument 7 is defined by unsafe_logf - confirm there
+
+//#define DEBUG_TC_MAKERATIO
+//
+// launch ctx parameters are 
+// 45 threads per channel, X channels per block, Y blocks
+// 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+// TODO: it might be much better to use 32 threads per channel instead of 45
+// to simplify the synchronization
+//
+__global__
+void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+                                   SampleVector::Scalar const* sample_value_errors,
+                                   uint32_t const* dids_eb,
+                                   uint32_t const* dids_ee,
+                                   bool const* useless_sample_values,
+                                   char const* pedestal_nums,
+                                   ConfigurationParameters::type const* amplitudeFitParametersEB,
+                                   ConfigurationParameters::type const* amplitudeFitParametersEE,
+                                   ConfigurationParameters::type const* timeFitParametersEB,
+                                   ConfigurationParameters::type const* timeFitParametersEE,
+                                   SampleVector::Scalar const* sumAAsNullHypot,
+                                   SampleVector::Scalar const* sum0sNullHypot,
+                                   SampleVector::Scalar* tMaxAlphaBetas,
+                                   SampleVector::Scalar* tMaxErrorAlphaBetas,
+                                   SampleVector::Scalar* g_accTimeMax,
+                                   SampleVector::Scalar* g_accTimeWgt,
+                                   TimeComputationState* g_state,
+                                   unsigned int const timeFitParameters_sizeEB,
+                                   unsigned int const timeFitParameters_sizeEE,
+                                   ConfigurationParameters::type const timeFitLimits_firstEB,
+                                   ConfigurationParameters::type const timeFitLimits_firstEE,
+                                   ConfigurationParameters::type const timeFitLimits_secondEB,
+                                   ConfigurationParameters::type const timeFitLimits_secondEE,
+                                   int const nchannels,
+                                   uint32_t const offsetForInputs) {
+    using ScalarType = SampleVector::Scalar;
+
+    // constants
+    constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices
+    int const gtx = threadIdx.x + blockDim.x*blockIdx.x;
+    int const ch = gtx / nthreads_per_channel;
+    int const lch = threadIdx.x / nthreads_per_channel;
+    int const ltx = threadIdx.x % nthreads_per_channel;
+    int const ch_start = ch*nsamples;
+    int const lch_start = lch*nthreads_per_channel;
+    int const nchannels_per_block = blockDim.x / nthreads_per_channel;
+    auto const* dids = ch >= offsetForInputs
+        ? dids_ee
+        : dids_eb;
+    int const inputCh = ch >= offsetForInputs
+        ? ch - offsetForInputs
+        : ch;
+    
+    // remove inactive threads
+    // TODO: need to understand if this is 100% safe in presence of syncthreads
+    if (ch >= nchannels) return;
+
+    auto const did = DetId{dids[inputCh]};
+    auto const isBarrel = did.subdetId() == EcalBarrel;
+    auto const* amplitudeFitParameters = isBarrel
+        ? amplitudeFitParametersEB
+        : amplitudeFitParametersEE;
+    auto const* timeFitParameters = isBarrel
+        ? timeFitParametersEB
+        : timeFitParametersEE;
+    auto const timeFitParameters_size = isBarrel
+        ? timeFitParameters_sizeEB
+        : timeFitParameters_sizeEE;
+    auto const timeFitLimits_first = isBarrel
+        ? timeFitLimits_firstEB
+        : timeFitLimits_firstEE;
+    auto const timeFitLimits_second = isBarrel
+        ? timeFitLimits_secondEB
+        : timeFitLimits_secondEE;
+
+    extern __shared__ char smem[];
+    ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
+    ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
+    ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
+    ScalarType* shrTimeMax = shr_time_max + blockDim.x;
+    ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
+
+    // map tx -> (sample_i, sample_j)
+    int sample_i, sample_j = 0;
+    if (ltx>=0 && ltx<=8) {
         sample_i = 0;
-        sample_j = 1 + ltx;
-      } else if (ltx <= 16) {
+        sample_j = 1+ltx;
+    } else if (ltx<=16) {
         sample_i = 1;
-        sample_j = 2 + ltx - 9;
-      } else if (ltx <= 23) {
+        sample_j = 2+ltx-9;
+    } else if (ltx<=23) {
         sample_i = 2;
         sample_j = 3 + ltx - 17;
-      } else if (ltx <= 29) {
+    } else if (ltx<=29) {
         sample_i = 3;
         sample_j = 4 + ltx - 24;
-      } else if (ltx <= 34) {
+    } else if (ltx<=34) {
         sample_i = 4;
         sample_j = 5 + ltx - 30;
-      } else if (ltx <= 38) {
+    } else if (ltx<=38) {
         sample_i = 5;
         sample_j = 6 + ltx - 35;
-      } else if (ltx <= 41) {
+    } else if (ltx<=41) {
         sample_i = 6;
         sample_j = 7 + ltx - 39;
-      } else if (ltx <= 43) {
+    } else if (ltx<=43) {
         sample_i = 7;
         sample_j = 8 + ltx - 42;
-      } else if (ltx <= 44) {
+    } else if (ltx <= 44) {
         sample_i = 8;
         sample_j = 9;
-      } else
+    } else
         assert(false);
 
-      auto const tx_i = ch_start + sample_i;
-      auto const tx_j = ch_start + sample_j;
+    auto const tx_i = ch_start + sample_i;
+    auto const tx_j = ch_start + sample_j;
 
-      //
-      // note, given the way we partition the block, with 45 threads per channel
-      // we will end up with inactive threads which need to be dragged along
-      // through the synching point
-      //
-      /*
+    //
+    // note, given the way we partition the block, with 45 threads per channel
+    // we will end up with inactive threads which need to be dragged along
+    // through the synching point
+    // 
+    /*
     bool const condToExit = ch >= nchannels
         ? true
         : useless_sample_values[tx_i] 
           || useless_sample_values[tx_j]
           || sample_values[tx_i]<=1 || sample_values[tx_j]<=1;
           */
-      bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
-                                         sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
-
-      //
-      // see cpu implementation for explanation
-      //
-      ScalarType chi2 = std::numeric_limits<ScalarType>::max();
-      ScalarType tmax = 0;
-      ScalarType tmaxerr = 0;
-      shrTimeMax[threadIdx.x] = 0;
-      shrTimeWgt[threadIdx.x] = 0;
-      bool internalCondForSkipping1 = true;
-      bool internalCondForSkipping2 = true;
-      if (!condForUselessSamples) {
+    bool const condForUselessSamples = useless_sample_values[tx_i] 
+        || useless_sample_values[tx_j]
+        || sample_values[tx_i]<=1 || sample_values[tx_j]<=1;
+
+    //
+    // see cpu implementation for explanation
+    // 
+    ScalarType chi2 = std::numeric_limits<ScalarType>::max();
+    ScalarType tmax = 0;
+    ScalarType tmaxerr = 0;
+    shrTimeMax[threadIdx.x] = 0;
+    shrTimeWgt[threadIdx.x] = 0;
+    bool internalCondForSkipping1 = true;
+    bool internalCondForSkipping2 = true;
+    if (!condForUselessSamples) {
         auto const rtmp = sample_values[tx_i] / sample_values[tx_j];
         auto const invampl_i = 1.0 / sample_values[tx_i];
-        auto const relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
+        auto const relErr2_i = sample_value_errors[tx_i]*sample_value_errors[tx_i]*
+            invampl_i*invampl_i;
         auto const invampl_j = 1.0 / sample_values[tx_j];
-        auto const relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
+        auto const relErr2_j = sample_value_errors[tx_j]*sample_value_errors[tx_j]*
+            invampl_j*invampl_j;
         auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
-        auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
+        auto err2 = sample_value_errors[tx_j]*
+            (sample_values[tx_i] - sample_values[tx_j])*(invampl_j*invampl_j);
         // TODO non-divergent branch for a block if each block has 1 channel
         // otherwise non-divergent for groups of 45 threads
         // at this point, pedestal_nums[ch] can be either 0, 1 or 2
-        if (pedestal_nums[ch] == 2)
-          err2 *= err2 * 0.5;
-        auto const err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
+        if (pedestal_nums[ch]==2)
+            err2 *= err2 * 0.5;
+        auto const err3 = (0.289*0.289) * (invampl_j*invampl_j);
         auto const total_error = std::sqrt(err1 + err2 + err3);
 
         auto const alpha = amplitudeFitParameters[0];
@@ -261,153 +290,158 @@ namespace ecal {
         auto const ratio_value = rtmp;
         auto const ratio_error = total_error;
 
-        auto const rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
-        internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j);
+        auto const rlim_i_j = fast_expf(
+            static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
+        internalCondForSkipping1 = !(total_error<1.0 && rtmp>0.001 && rtmp<rlim_i_j);
         if (!internalCondForSkipping1) {
-          //
-          // precompute.
-          // in cpu version this was done conditionally
-          // however easier to do it here (precompute) and then just filter out
-          // if not needed
-          //
-          auto const l_timeFitLimits_first = timeFitLimits_first;
-          auto const l_timeFitLimits_second = timeFitLimits_second;
-          if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
-            auto const time_max_i = static_cast<ScalarType>(ratio_index);
-            auto u = timeFitParameters[timeFitParameters_size - 1];
+            //
+            // precompute.
+            // in cpu version this was done conditionally
+            // however easier to do it here (precompute) and then just filter out
+            // if not needed
+            // 
+            auto const l_timeFitLimits_first = timeFitLimits_first;
+            auto const l_timeFitLimits_second = timeFitLimits_second;
+            if (ratio_step == 1
+                && ratio_value >= l_timeFitLimits_first
+                && ratio_value <= l_timeFitLimits_second) {
+
+                auto const time_max_i = static_cast<ScalarType>(ratio_index);
+                auto u = timeFitParameters[timeFitParameters_size - 1];
 #pragma unroll
-            for (int k = timeFitParameters_size - 2; k >= 0; k--)
-              u = u * ratio_value + timeFitParameters[k];
-
-            auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
-            for (int k = timeFitParameters_size - 2; k >= 1; k--)
-              du = du * ratio_value + k * timeFitParameters[k];
-
-            auto const error2 = ratio_error * ratio_error * du * du;
-            auto const time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
-            auto const time_wgt = error2 > 0 ? 1.0 / error2 : static_cast<ScalarType>(0);
-
-            // store into shared mem
-            // note, this name is essentially identical to the one used
-            // below.
-            shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
-            shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
-          } else {
-            shrTimeMax[threadIdx.x] = 0;
-            shrTimeWgt[threadIdx.x] = 0;
-          }
-
-          // continue with ratios
-          auto const stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
-          auto const offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
-          auto const rmin = std::max(ratio_value - ratio_error, 0.001);
-          auto const rmax = std::min(ratio_value + ratio_error,
-                                     fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta) - 0.001);
-          auto const time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0);
-          auto const time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0);
-
-          // set these guys
-          tmax = 0.5 * (time1 + time2);
-          tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
+                for (int k=timeFitParameters_size-2; k>=0; k--)
+                    u = u*ratio_value + timeFitParameters[k];
+
+                auto du = (timeFitParameters_size - 1) *
+                    (timeFitParameters[timeFitParameters_size - 1]);
+                for (int k=timeFitParameters_size - 2; k>=1; k--)
+                    du = du*ratio_value + k*timeFitParameters[k];
+
+                auto const error2 = ratio_error * ratio_error * du * du;
+                auto const time_max = error2 > 0
+                    ? (time_max_i - u) / error2
+                    : static_cast<ScalarType>(0);
+                auto const time_wgt = error2 > 0
+                    ? 1.0 / error2
+                    : static_cast<ScalarType>(0);
+
+                // store into shared mem
+                // note, this name is essentially identical to the one used 
+                // below. 
+                shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
+                shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
+            } else {
+                shrTimeMax[threadIdx.x] = 0;
+                shrTimeWgt[threadIdx.x] = 0;
+            }
+
+            // continue with ratios
+            auto const stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
+            auto const offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
+            auto const rmin = std::max(ratio_value - ratio_error, 0.001);
+            auto const rmax = std::min(ratio_value + ratio_error, 
+                fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta)
+                - 0.001);
+            auto const time1 = 
+                offset - 
+                ratio_step / 
+                    (fast_expf((stepOverBeta - fast_logf(rmin)) / 
+                                       alpha) - 1.0);
+            auto const time2 = 
+                offset - 
+                ratio_step /
+                    (fast_expf((stepOverBeta - fast_logf(rmax)) / 
+                                       alpha) - 1.0);
+
+            // set these guys
+            tmax = 0.5 * (time1 + time2);
+            tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
 #ifdef DEBUG_TC_MAKERATIO
-          if (ch == 1 || ch == 0)
-            printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
-                   ch,
-                   ltx,
-                   tmax,
-                   tmaxerr,
-                   time1,
-                   time2,
-                   offset,
-                   rmin,
-                   rmax);
+            if (ch == 1 || ch == 0)
+                printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
+                    ch, ltx, tmax, tmaxerr, time1, time2, offset, rmin, rmax);
 #endif
 
-          SampleVector::Scalar sumAf = 0;
-          SampleVector::Scalar sumff = 0;
-          int const itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
-          auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
-          // TODO: data dependence
-          for (int it = itmin + 1; it < nsamples; it++) {
-            loffset += invalphabeta;
-            if (useless_sample_values[ch_start + it])
-              continue;
-            auto const inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
-            auto const term1 = 1.0 + loffset;
-            auto const f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
-            sumAf += sample_values[ch_start + it] * (f * inverr2);
-            sumff += f * (f * inverr2);
-          }
-
-          auto const sumAA = sumAAsNullHypot[ch];
-          auto const sum0 = sum0sNullHypot[ch];
-          chi2 = sumAA;
-          ScalarType amp = 0;
-          // TODO: sum0 can not be 0 below, need to introduce the check upfront
-          if (sumff > 0) {
-            chi2 = sumAA - sumAf * (sumAf / sumff);
-            amp = sumAf / sumff;
-          }
-          chi2 /= sum0;
+            SampleVector::Scalar sumAf = 0;
+            SampleVector::Scalar sumff = 0;
+            int const itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
+            auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
+            // TODO: data dependence 
+            for (int it = itmin+1; it<nsamples; it++) {
+                loffset += invalphabeta;
+                if (useless_sample_values[ch_start + it])
+                    continue;
+                auto const inverr2 = 1.0 / 
+                    (sample_value_errors[ch_start + it]*sample_value_errors[ch_start + it]);
+                auto const term1 = 1.0 + loffset;
+                auto const f = (term1 > 1e-6)
+                    ? fast_expf(alpha * (fast_logf(term1) - loffset))
+                    : 0;
+                sumAf += sample_values[ch_start+it] * (f * inverr2);
+                sumff += f*(f*inverr2);
+            }
+
+            auto const sumAA = sumAAsNullHypot[ch];
+            auto const sum0 = sum0sNullHypot[ch];
+            chi2 = sumAA;
+            ScalarType amp = 0;
+            // TODO: sum0 can not be 0 below, need to introduce the check upfront
+            if (sumff > 0) {
+                chi2 = sumAA - sumAf * (sumAf / sumff);
+                amp = sumAf / sumff;
+            }
+            chi2 /= sum0;
 
 #ifdef DEBUG_TC_MAKERATIO
-          if (ch == 1 || ch == 0)
-            printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
-                   ch,
-                   ltx,
-                   sumAf,
-                   sumff,
-                   sumAA,
-                   static_cast<int>(sum0),
-                   tmax,
-                   tmaxerr,
-                   chi2);
+            if (ch == 1 || ch == 0)
+                printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
+                    ch, ltx, sumAf, sumff, sumAA, static_cast<int>(sum0), tmax, tmaxerr, chi2);
 #endif
 
-          if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
-            internalCondForSkipping2 = false;
-          else
-            chi2 = std::numeric_limits<ScalarType>::max();
+            if (chi2>0 && tmax>0 && tmaxerr>0)
+                internalCondForSkipping2 = false;
+            else
+                chi2 = std::numeric_limits<ScalarType>::max();
         }
-      }
+    }
 
-      // store into smem
-      shr_chi2s[threadIdx.x] = chi2;
-      __syncthreads();
+    // store into smem
+    shr_chi2s[threadIdx.x] = chi2;
+    __syncthreads();
 
-      // find min chi2 - quite crude for now
-      // TODO validate/check
-      char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
-      bool oddElements = nthreads_per_channel % 2;
+    // find min chi2 - quite crude for now
+    // TODO validate/check
+    char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+    bool oddElements = nthreads_per_channel % 2;
 #pragma unroll
-      while (iter >= 1) {
+    while (iter>=1) {
         if (ltx < iter)
-          // for odd ns, the last guy will just store itself
-          // exception is for ltx == 0 and iter==1
-          shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
-                                       ? shr_chi2s[threadIdx.x]
-                                       : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]);
+            // for odd ns, the last guy will just store itself
+            // exception is for ltx == 0 and iter==1
+            shr_chi2s[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
+                ? shr_chi2s[threadIdx.x] 
+                : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x+iter]);
         __syncthreads();
         oddElements = iter % 2;
-        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
-      }
+        iter = iter==1 ? iter/2 : iter/2 + iter%2;
+    }
 
-      // filter out inactive or useless samples threads
-      if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
+    // filter out inactive or useless samples threads
+    if (!condForUselessSamples && !internalCondForSkipping1 
+            && !internalCondForSkipping2) {
         // min chi2, now compute weighted average of tmax measurements
         // see cpu version for more explanation
         auto const chi2min = shr_chi2s[threadIdx.x - ltx];
         auto const chi2Limit = chi2min + 1.0;
-        auto const inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0;
+        auto const inverseSigmaSquared = 
+            chi2 < chi2Limit
+                ? 1.0 / (tmaxerr * tmaxerr)
+                : 0.0;
 
 #ifdef DEBUG_TC_MAKERATIO
         if (ch == 1 || ch == 0)
-          printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
-                 ch,
-                 ltx,
-                 chi2min,
-                 chi2Limit,
-                 inverseSigmaSquared);
+            printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
+                ch, ltx, chi2min, chi2Limit, inverseSigmaSquared);
 #endif
 
         // store into shared mem and run reduction
@@ -415,48 +449,48 @@ namespace ecal {
         // TODO: check if shuffling intrinsics are better
         shr_time_wgt[threadIdx.x] = inverseSigmaSquared;
         shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared;
-      } else {
+    } else {
         shr_time_wgt[threadIdx.x] = 0;
         shr_time_max[threadIdx.x] = 0;
-      }
-      __syncthreads();
+    }
+    __syncthreads();
 
-      // reduce to compute time_max and time_wgt
-      iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
-      oddElements = nthreads_per_channel % 2;
+    // reduce to compute time_max and time_wgt
+    iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+    oddElements = nthreads_per_channel % 2;
 #pragma unroll
-      while (iter >= 1) {
+    while (iter>=1) {
         if (ltx < iter) {
-          shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
-                                          ? shr_time_wgt[threadIdx.x]
-                                          : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter];
-          shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
-                                          ? shr_time_max[threadIdx.x]
-                                          : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter];
-          shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
-                                        ? shrTimeMax[threadIdx.x]
-                                        : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter];
-          shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
-                                        ? shrTimeWgt[threadIdx.x]
-                                        : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter];
+            shr_time_wgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
+                ? shr_time_wgt[threadIdx.x]
+                : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x+iter];
+            shr_time_max[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
+                ? shr_time_max[threadIdx.x]
+                : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x+iter];
+            shrTimeMax[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
+                ? shrTimeMax[threadIdx.x]
+                : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x+iter];
+            shrTimeWgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
+                ? shrTimeWgt[threadIdx.x]
+                : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x+iter];
         }
-
+        
         __syncthreads();
         oddElements = iter % 2;
-        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
-      }
+        iter = iter==1 ? iter/2 : iter/2 + iter%2;
+    }
 
-      // load from shared memory the 0th guy (will contain accumulated values)
-      // compute
-      // store into global mem
-      if (ltx == 0) {
+    // load from shared memory the 0th guy (will contain accumulated values)
+    // compute 
+    // store into global mem
+    if (ltx == 0) {
         auto const tmp_time_max = shr_time_max[threadIdx.x];
         auto const tmp_time_wgt = shr_time_wgt[threadIdx.x];
 
         // we are done if there number of time ratios is 0
-        if (tmp_time_wgt == 0 && tmp_time_max == 0) {
-          g_state[ch] = TimeComputationState::Finished;
-          return;
+        if (tmp_time_wgt==0 && tmp_time_max==0) {
+            g_state[ch] = TimeComputationState::Finished;
+            return ;
         }
 
         // no div by 0
@@ -470,25 +504,26 @@ namespace ecal {
         g_state[ch] = TimeComputationState::NotFinished;
 
 #ifdef DEBUG_TC_MAKERATIO
-        printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
-        printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
-               ch,
-               tMaxAlphaBeta,
-               tMaxErrorAlphaBeta,
-               shrTimeMax[threadIdx.x],
-               shrTimeWgt[threadIdx.x]);
+            printf("ch = %d time_max = %f time_wgt = %f\n",
+                ch, tmp_time_max, tmp_time_wgt);
+            printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
+                ch, tMaxAlphaBeta, tMaxErrorAlphaBeta, 
+                shrTimeMax[threadIdx.x],
+                shrTimeWgt[threadIdx.x]);
 #endif
-      }
     }
-
-    /// launch ctx parameters are
-    /// 10 threads per channel, N channels per block, Y blocks
-    /// TODO: do we need to keep the state around or can be removed?!
-    //#define DEBUG_FINDAMPLCHI2_AND_FINISH
-    __global__ void kernel_time_compute_findamplchi2_and_finish(
+}
+
+/// launch ctx parameters are 
+/// 10 threads per channel, N channels per block, Y blocks
+/// TODO: do we need to keep the state around or can be removed?!
+//#define DEBUG_FINDAMPLCHI2_AND_FINISH
+__global__
+void kernel_time_compute_findamplchi2_and_finish(
         SampleVector::Scalar const* sample_values,
         SampleVector::Scalar const* sample_value_errors,
-        uint32_t const* dids,
+        uint32_t const* dids_eb,
+        uint32_t const* dids_ee,
         bool const* useless_samples,
         SampleVector::Scalar const* g_tMaxAlphaBeta,
         SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
@@ -504,36 +539,44 @@ namespace ecal {
         SampleVector::Scalar* g_ampMaxError,
         SampleVector::Scalar* g_timeMax,
         SampleVector::Scalar* g_timeError,
-        int const nchannels) {
-      using ScalarType = SampleVector::Scalar;
-
-      // constants
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
-      int const ch = gtx / nsamples;
-      int const sample = threadIdx.x % nsamples;
-      int const ch_start = ch * nsamples;
-
-      // configure shared mem
-      // per block, we need #threads per block * 2 * sizeof(ScalarType)
-      // we run with N channels per block
-      extern __shared__ char smem[];
-      ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
-      ScalarType* shr_sumff = shr_sumAf + blockDim.x;
-
-      if (ch >= nchannels)
-        return;
-
-      auto state = g_state[ch];
-      auto const did = DetId{dids[ch]};
-      auto const* amplitudeFitParameters =
-          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
-
-      // TODO is that better than storing into global and launching another kernel
-      // for the first 10 threads
-      if (state == TimeComputationState::NotFinished) {
+        int const nchannels,
+        uint32_t const offsetForInputs) {
+    using ScalarType = SampleVector::Scalar;
+
+    // constants 
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices
+    int const gtx = threadIdx.x + blockIdx.x*blockDim.x;
+    int const ch = gtx / nsamples;
+    int const sample = threadIdx.x % nsamples;
+    int const ch_start = ch * nsamples;
+    auto const* dids = ch >= offsetForInputs
+        ? dids_ee
+        : dids_eb;
+    int const inputCh = ch >= offsetForInputs
+        ? ch - offsetForInputs
+        : ch;
+
+    // configure shared mem
+    // per block, we need #threads per block * 2 * sizeof(ScalarType)
+    // we run with N channels per block
+    extern __shared__ char smem[];
+    ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
+    ScalarType* shr_sumff = shr_sumAf + blockDim.x;
+
+    if (ch >= nchannels) return;
+
+    auto state = g_state[ch];
+    auto const did = DetId{dids[inputCh]};
+    auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel
+        ? amplitudeFitParametersEB
+        : amplitudeFitParametersEE;
+
+
+    // TODO is that better than storing into global and launching another kernel
+    // for the first 10 threads
+    if (state == TimeComputationState::NotFinished) {
         auto const alpha = amplitudeFitParameters[0];
         auto const beta = amplitudeFitParameters[1];
         auto const alphabeta = alpha * beta;
@@ -541,91 +584,96 @@ namespace ecal {
         auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
         auto const sample_value = sample_values[gtx];
         auto const sample_value_error = sample_value_errors[gtx];
-        auto const inverr2 =
-            useless_samples[gtx] ? static_cast<ScalarType>(0) : 1.0 / (sample_value_error * sample_value_error);
-        auto const offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
+        auto const inverr2 = useless_samples[gtx]
+            ? static_cast<ScalarType>(0)
+            : 1.0 / (sample_value_error * sample_value_error);
+        auto const offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) 
+            * invalphabeta;
         auto const term1 = 1.0 + offset;
-        auto const f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.0);
+        auto const f = term1 > 1e-6 
+            ? fast_expf(alpha * (fast_logf(term1) - offset))
+            : static_cast<ScalarType>(0.0);
         auto const sumAf = sample_value * (f * inverr2);
         auto const sumff = f * (f * inverr2);
 
         // store into shared mem
         shr_sumAf[threadIdx.x] = sumAf;
         shr_sumff[threadIdx.x] = sumff;
-      } else {
+    } else {
         shr_sumAf[threadIdx.x] = 0;
         shr_sumff[threadIdx.x] = 0;
-      }
-      __syncthreads();
-
-      // reduce
-      // unroll completely here (but hardcoded)
-      if (sample < 5) {
-        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5];
-        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5];
-      }
-      __syncthreads();
-
-      if (sample < 2) {
+    }
+    __syncthreads();
+
+    // reduce
+    // unroll completely here (but hardcoded)
+    if (sample<5) {
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+5];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+5];
+    }
+    __syncthreads();
+
+    if (sample<2) {
         // will need to subtract for ltx = 3, we double count here
-        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3];
-        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3];
-      }
-      __syncthreads();
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+2] 
+            + shr_sumAf[threadIdx.x+3];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+2] 
+            + shr_sumff[threadIdx.x+3];
+    }
+    __syncthreads();
 
-      if (sample == 0) {
+    if (sample==0) {
         // exit if the state is done
         // note, we do not exit before all __synchtreads are finished
         if (state == TimeComputationState::Finished) {
-          g_timeMax[ch] = 5;
-          g_timeError[ch] = -999;
-          return;
+            g_timeMax[ch] = 5;
+            g_timeError[ch] = -999;
+            return;
         }
 
         // subtract to avoid double counting
-        auto const sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3];
-        auto const sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3];
-
-        auto const ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
+        auto const sumff = shr_sumff[threadIdx.x] 
+            + shr_sumff[threadIdx.x+1] 
+            - shr_sumff[threadIdx.x+3];
+        auto const sumAf = shr_sumAf[threadIdx.x]
+            + shr_sumAf[threadIdx.x+1]
+            - shr_sumAf[threadIdx.x+3];
+
+        auto const ampMaxAlphaBeta = sumff>0 ? sumAf / sumff : 0;
         auto const sumAA = sumAAsNullHypot[ch];
         auto const sum0 = sum0sNullHypot[ch];
         auto const nullChi2 = chi2sNullHypot[ch];
         if (sumff > 0) {
-          auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
-          if (chi2AlphaBeta > nullChi2) {
-            // null hypothesis is better
-            state = TimeComputationState::Finished;
+            auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
+            if (chi2AlphaBeta > nullChi2) {
+                // null hypothesis is better
+                state = TimeComputationState::Finished;
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-            printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
-                   ch,
-                   chi2AlphaBeta,
-                   nullChi2,
-                   sumAA,
-                   sumAf,
-                   sumff,
-                   sum0);
+                printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
+                    ch, chi2AlphaBeta, nullChi2, sumAA, sumAf, sumff, sum0);
 #endif
-          }
+            }
 
-          // store to global
-          g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
+            // store to global
+            g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
         } else {
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-          printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
+            printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n",
+                ch, sum0, sumAA, sumff, sumAf);
 #endif
-          state = TimeComputationState::Finished;
+            state = TimeComputationState::Finished;
         }
 
         // store the state to global and finish calcs
         g_state[ch] = state;
         if (state == TimeComputationState::Finished) {
-          // store default values into global
-          g_timeMax[ch] = 5;
-          g_timeError[ch] = -999;
+            // store default values into global
+            g_timeMax[ch] = 5;
+            g_timeError[ch] = -999;
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-          printf("ch = %d finished state\n", ch);
+            printf("ch = %d finished state\n", ch);
 #endif
-          return;
+            return;
         }
 
         auto const ampMaxError = g_ampMaxError[ch];
@@ -636,242 +684,306 @@ namespace ecal {
         auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
         // branch to separate large vs small pulses
         // see cpu version for more info
-        if (test_ratio > 5.0 && accTimeWgt > 0) {
-          auto const tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
-          auto const tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
-
-          if (test_ratio > 10.0) {
-            g_timeMax[ch] = tMaxRatio;
-            g_timeError[ch] = tMaxErrorRatio;
-
+        if (test_ratio > 5.0 && accTimeWgt>0) {
+            auto const tMaxRatio = accTimeWgt>0 
+                ? accTimeMax / accTimeWgt 
+                : static_cast<ScalarType>(0);
+            auto const tMaxErrorRatio = accTimeWgt>0 
+                ? 1.0 / std::sqrt(accTimeWgt) 
+                : static_cast<ScalarType>(0);
+
+            if (test_ratio > 10.0) {
+                g_timeMax[ch] = tMaxRatio;
+                g_timeError[ch] = tMaxErrorRatio;
+                
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-            printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
+                    printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n",
+                        ch, tMaxRatio, tMaxErrorRatio);
 #endif
-          } else {
-            auto const timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
-                                  tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
-                                 5.0;
-            auto const timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
-                                    tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
-                                   5.0;
-            state = TimeComputationState::Finished;
-            g_state[ch] = state;
-            g_timeMax[ch] = timeMax;
-            g_timeError[ch] = timeError;
+            } else {
+                auto const timeMax = 
+                    (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + 
+                     tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0;
+                auto const timeError = 
+                    (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + 
+                     tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0;
+                state = TimeComputationState::Finished;
+                g_state[ch] = state;
+                g_timeMax[ch] = timeMax;
+                g_timeError[ch] = timeError;
 
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-            printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
+                    printf("ch = %d timeMax = %f timeError = %f\n",
+                        ch, timeMax, timeError);
 #endif
-          }
-        } else {
-          state = TimeComputationState::Finished;
-          g_state[ch] = state;
-          g_timeMax[ch] = tMaxAlphaBeta;
-          g_timeError[ch] = tMaxErrorAlphaBeta;
+            }
+        }
+        else {
+            state = TimeComputationState::Finished;
+            g_state[ch] = state;
+            g_timeMax[ch] = tMaxAlphaBeta;
+            g_timeError[ch] = tMaxErrorAlphaBeta;
 
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-          printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
+                printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n",
+                    ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
 #endif
         }
-      }
     }
-
-    __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis,
-                                                    SampleVector::Scalar* sample_values,
-                                                    SampleVector::Scalar* sample_value_errors,
-                                                    bool* useless_sample_values,
-                                                    unsigned int const sample_mask,
-                                                    int const nchannels) {
-      using ScalarType = SampleVector::Scalar;
-
-      // constants
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
-      int const ch = gtx / nsamples;
-      int const sample = threadIdx.x % nsamples;
-
-      // remove thread for sample 0, oversubscribing is easier than ....
-      if (ch >= nchannels || sample == 0)
-        return;
-
-      if (!use_sample(sample_mask, sample))
-        return;
-
-      auto const gainIdPrev = ecal::mgpa::gainId(digis[gtx - 1]);
-      auto const gainIdNext = ecal::mgpa::gainId(digis[gtx]);
-      if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
-        sample_values[gtx - 1] = 0;
-        sample_value_errors[gtx - 1] = 1e+9;
-        useless_sample_values[gtx - 1] = true;
-      }
+}
+
+// Flags the sample preceding an upward gain-id step (previous gain id lower than
+// the current one, both in 1..3): its value is zeroed, its error set to 1e+9 and
+// it is marked useless. Channels at or beyond offsetForInputs read digis_ee
+// (re-based to index 0), the others digis_eb; outputs use the combined index.
+__global__
+void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+                                     uint16_t const* digis_ee,
+                                     SampleVector::Scalar* sample_values,
+                                     SampleVector::Scalar* sample_value_errors,
+                                     bool* useless_sample_values,
+                                     unsigned int const sample_mask,
+                                     int const nchannels,
+                                     uint32_t const offsetForInputs) {
+    // constants
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices
+    int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
+    int const ch = gtx / nsamples;
+    int const sample = threadIdx.x % nsamples;
+
+    // pick the input collection and re-base the flat sample index into it
+    bool const readsEE = ch >= offsetForInputs;
+    int const inputGtx = readsEE ? gtx - offsetForInputs*nsamples : gtx;
+    auto const* digis = readsEE
+        ? digis_ee
+        : digis_eb;
+
+    // sample 0 has no predecessor to compare against (oversubscribing is easier)
+    if (ch >= nchannels || sample==0) return;
+
+    if (!use_sample(sample_mask, sample)) return;
+
+    auto const gainIdPrev = ecal::mgpa::gainId(digis[inputGtx-1]);
+    auto const gainIdNext = ecal::mgpa::gainId(digis[inputGtx]);
+    if (gainIdPrev>=1 && gainIdPrev<=3 &&
+        gainIdNext>=1 && gainIdNext<=3 && gainIdPrev < gainIdNext) {
+        sample_values[gtx-1] = 0;
+        sample_value_errors[gtx-1] = 1e+9;
+        useless_sample_values[gtx-1] = true;
+    }
+}
+
+// Computes g_amplitudeMax[ch] from inverse-variance-weighted sums over the
+// samples of channel ch. Launch with nsamples consecutive threads per channel
+// (the hard-coded reduction strides below cover exactly 10 samples) and
+// 5 * blockDim.x * sizeof(SampleVector::Scalar) bytes of dynamic shared memory.
+__global__
+void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+                              SampleVector::Scalar const* sample_value_errors,
+                              uint32_t const* dids,
+                              bool const* useless_samples,
+                              SampleVector::Scalar const* g_timeMax,
+                              SampleVector::Scalar const* amplitudeFitParametersEB,
+                              SampleVector::Scalar const* amplitudeFitParametersEE,
+                              SampleVector::Scalar *g_amplitudeMax,
+                              int const nchannels) {
+    using ScalarType = SampleVector::Scalar;
+
+    // constants
+    constexpr ScalarType corr4 = 1.;
+    constexpr ScalarType corr6 = 1.;
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices
+    int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
+    int const ch = gtx / nsamples;
+    int const sample = threadIdx.x % nsamples;
+
+    // threads past the last channel leave before reading any input
+    if (ch >= nchannels) return;
+
+    // EB or EE fit parameters, chosen by the channel's subdetector id
+    auto const did = DetId{dids[ch]};
+    auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel
+        ? amplitudeFitParametersEB
+        : amplitudeFitParametersEE;
+
+    // configure shared mem: five accumulator arrays of blockDim.x entries each
+    extern __shared__ char smem[];
+    ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
+    auto *shr_sumA = shr_sum1 + blockDim.x;
+    auto *shr_sumF = shr_sumA + blockDim.x;
+    auto *shr_sumAF = shr_sumF + blockDim.x;
+    auto *shr_sumFF = shr_sumAF + blockDim.x;
+
+    auto const alpha = amplitudeFitParameters[0];
+    auto const beta = amplitudeFitParameters[1];
+    auto const timeMax = g_timeMax[ch];
+    auto const pedestalLimit = timeMax - (alpha * beta) - 1.0;
+    auto const sample_value = sample_values[gtx];
+    auto const sample_value_error = sample_value_errors[gtx];
+    auto const inverr2 = sample_value_error > 0
+        ? 1. / (sample_value_error * sample_value_error)
+        : static_cast<ScalarType>(0);
+    auto const termOne = 1 + (sample - timeMax) / (alpha * beta);
+    auto const f = termOne > 1.e-5
+        ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
+        : static_cast<ScalarType>(0.);
+
+    // keep a sample when it is not flagged useless and is either before
+    // pedestalLimit or has f above 0.6 (up to timeMax) / 0.4 (from timeMax on)
+    bool const cond = ((sample < pedestalLimit) ||
+        (f>0.6*corr6 && sample<=timeMax) ||
+        (f>0.4*corr4 && sample>=timeMax)) && !useless_samples[gtx];
+
+    // store into shared mem
+    shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
+    shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast<ScalarType>(0);
+    shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast<ScalarType>(0);
+    shr_sumAF[threadIdx.x] = cond ? (f*inverr2)*sample_value : static_cast<ScalarType>(0);
+    shr_sumFF[threadIdx.x] = cond ? f*(f*inverr2) : static_cast<ScalarType>(0);
+
+    // reduction: all stores above must be complete before neighbours are read
+    __syncthreads();
+    if (sample <= 4) {
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+5];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+5];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+5];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+5];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+5];
+    }
+    __syncthreads();
 
-    __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
-                                             SampleVector::Scalar const* sample_value_errors,
-                                             uint32_t const* dids,
-                                             bool const* useless_samples,
-                                             SampleVector::Scalar const* g_timeMax,
-                                             SampleVector::Scalar const* amplitudeFitParametersEB,
-                                             SampleVector::Scalar const* amplitudeFitParametersEE,
-                                             SampleVector::Scalar* g_amplitudeMax,
-                                             int const nchannels) {
-      using ScalarType = SampleVector::Scalar;
-
-      // constants
-      constexpr ScalarType corr4 = 1.;
-      constexpr ScalarType corr6 = 1.;
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
-      int const ch = gtx / nsamples;
-      int const sample = threadIdx.x % nsamples;
-
-      if (ch >= nchannels)
-        return;
-
-      auto const did = DetId{dids[ch]};
-      auto const* amplitudeFitParameters =
-          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
-
-      // configure shared mem
-      extern __shared__ char smem[];
-      ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
-      auto* shr_sumA = shr_sum1 + blockDim.x;
-      auto* shr_sumF = shr_sumA + blockDim.x;
-      auto* shr_sumAF = shr_sumF + blockDim.x;
-      auto* shr_sumFF = shr_sumAF + blockDim.x;
-
-      auto const alpha = amplitudeFitParameters[0];
-      auto const beta = amplitudeFitParameters[1];
-      auto const timeMax = g_timeMax[ch];
-      auto const pedestalLimit = timeMax - (alpha * beta) - 1.0;
-      auto const sample_value = sample_values[gtx];
-      auto const sample_value_error = sample_value_errors[gtx];
-      auto const inverr2 =
-          sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast<ScalarType>(0);
-      auto const termOne = 1 + (sample - timeMax) / (alpha * beta);
-      auto const f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
-                                     : static_cast<ScalarType>(0.);
-
-      bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) ||
-                         (f > 0.4 * corr4 && sample >= timeMax)) &&
-                        !useless_samples[gtx];
-
-      // store into shared mem
-      shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
-      shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast<ScalarType>(0);
-      shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast<ScalarType>(0);
-      shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast<ScalarType>(0);
-      shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast<ScalarType>(0);
-
-      // reduction
-      if (sample <= 4) {
-        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5];
-        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5];
-        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5];
-        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5];
-        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5];
-      }
-      __syncthreads();
-
-      if (sample < 2) {
+    if (sample < 2) {
         // note: we double count sample 3
-        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3];
-        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3];
-        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3];
-        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3];
-        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3];
-      }
-      __syncthreads();
-
-      if (sample == 0) {
-        auto const sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3];
-        auto const sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3];
-        auto const sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3];
-        auto const sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3];
-        auto const sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3];
-
-        auto const denom = sumFF * sum1 - sumF * sumF;
-        auto const condForDenom = sum1 > 0 && ecal::abs(denom) > 1.e-20;
-        auto const amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast<ScalarType>(0.);
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+2] + shr_sum1[threadIdx.x+3];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+2] + shr_sumA[threadIdx.x+3];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+2] + shr_sumF[threadIdx.x+3];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+2]
+            + shr_sumAF[threadIdx.x+3];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+2]
+            + shr_sumFF[threadIdx.x+3];
+    }
+    __syncthreads();
+
+    if (sample == 0) {
+        auto const sum1 = shr_sum1[threadIdx.x]
+            + shr_sum1[threadIdx.x+1] - shr_sum1[threadIdx.x+3];
+        auto const sumA = shr_sumA[threadIdx.x]
+            + shr_sumA[threadIdx.x+1] - shr_sumA[threadIdx.x+3];
+        auto const sumF = shr_sumF[threadIdx.x]
+            + shr_sumF[threadIdx.x+1] - shr_sumF[threadIdx.x+3];
+        auto const sumAF = shr_sumAF[threadIdx.x]
+            + shr_sumAF[threadIdx.x+1] - shr_sumAF[threadIdx.x+3];
+        auto const sumFF = shr_sumFF[threadIdx.x]
+            + shr_sumFF[threadIdx.x+1] - shr_sumFF[threadIdx.x+3];
+
+        auto const denom = sumFF * sum1 - sumF*sumF;
+        auto const condForDenom = sum1 > 0 && ecal::abs(denom)>1.e-20;
+        auto const amplitudeMax = condForDenom
+            ? (sumAF * sum1 - sumA * sumF) / denom
+            : static_cast<ScalarType>(0.);
 
         // store into global mem
         g_amplitudeMax[ch] = amplitudeMax;
-      }
     }
-
-    //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
-    __global__ void kernel_time_computation_init(uint16_t const* digis,
-                                                 uint32_t const* dids,
-                                                 float const* rms_x12,
-                                                 float const* rms_x6,
-                                                 float const* rms_x1,
-                                                 float const* mean_x12,
-                                                 float const* mean_x6,
-                                                 float const* mean_x1,
-                                                 float const* gain12Over6,
-                                                 float const* gain6Over1,
-                                                 SampleVector::Scalar* sample_values,
-                                                 SampleVector::Scalar* sample_value_errors,
-                                                 SampleVector::Scalar* ampMaxError,
-                                                 bool* useless_sample_values,
-                                                 char* pedestal_nums,
-                                                 uint32_t const offsetForHashes,
-                                                 unsigned int const sample_maskEB,
-                                                 unsigned int const sample_maskEE,
-                                                 int nchannels) {
-      using ScalarType = SampleVector::Scalar;
-
-      // constants
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int tx = threadIdx.x + blockDim.x * blockIdx.x;
-      int ch = tx / nsamples;
-
-      if (ch < nchannels) {
+}
+
+//#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+__global__
+void kernel_time_computation_init(uint16_t const* digis_eb,
+                                  uint32_t const* dids_eb,
+                                  uint16_t const* digis_ee,
+                                  uint32_t const* dids_ee,
+                                  float const* rms_x12,
+                                  float const* rms_x6,
+                                  float const* rms_x1,
+                                  float const* mean_x12,
+                                  float const* mean_x6,
+                                  float const* mean_x1,
+                                  float const* gain12Over6,
+                                  float const* gain6Over1,
+                                  SampleVector::Scalar* sample_values,
+                                  SampleVector::Scalar* sample_value_errors,
+                                  SampleVector::Scalar* ampMaxError,
+                                  bool* useless_sample_values,
+                                  char* pedestal_nums,
+                                  uint32_t const offsetForHashes,
+                                  uint32_t const offsetForInputs,
+                                  unsigned int const sample_maskEB,
+                                  unsigned int const sample_maskEE,
+                                  int nchannels) {
+    using ScalarType = SampleVector::Scalar;
+
+    // constants
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices
+    int const tx = threadIdx.x + blockDim.x*blockIdx.x;
+    int const ch = tx/nsamples;
+    int const inputTx = ch >= offsetForInputs
+        ? tx - offsetForInputs*nsamples
+        : tx;
+    int const inputCh = ch >= offsetForInputs
+        ? ch - offsetForInputs
+        : ch;
+    auto const* digis = ch >= offsetForInputs
+        ? digis_ee
+        : digis_eb;
+    auto const* dids = ch >= offsetForInputs
+        ? dids_ee
+        : dids_eb;
+
+    if (ch < nchannels) {
         // indices/inits
-        int sample = tx % nsamples;
-        int ch_start = ch * nsamples;
+        int const sample = tx % nsamples;
+        int const ch_start = ch*nsamples;
+        int const input_ch_start = inputCh*nsamples;
         SampleVector::Scalar pedestal = 0.;
         int num = 0;
 
         // configure shared mem
         extern __shared__ char smem[];
-        ScalarType* shrSampleValues = reinterpret_cast<SampleVector::Scalar*>(smem);
+        ScalarType* shrSampleValues = 
+            reinterpret_cast<SampleVector::Scalar*>(smem);
         ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x;
 
         // 0 and 1 sample values
-        auto const adc0 = ecal::mgpa::adc(digis[ch_start]);
-        auto const gainId0 = ecal::mgpa::gainId(digis[ch_start]);
-        auto const adc1 = ecal::mgpa::adc(digis[ch_start + 1]);
-        auto const gainId1 = ecal::mgpa::gainId(digis[ch_start + 1]);
-        auto const did = DetId{dids[ch]};
+        auto const adc0 = ecal::mgpa::adc(digis[input_ch_start]);
+        auto const gainId0 = ecal::mgpa::gainId(digis[input_ch_start]);
+        auto const adc1 = ecal::mgpa::adc(digis[input_ch_start+1]);
+        auto const gainId1 = ecal::mgpa::gainId(digis[input_ch_start+1]);
+        auto const did = DetId{dids[inputCh]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
-        auto const sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
-        auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+        auto const sample_mask = did.subdetId() == EcalBarrel
+            ? sample_maskEB
+            : sample_maskEE;
+        auto const hashedId = isBarrel
+            ? hashedIndexEB(did.rawId())
+            : offsetForHashes + hashedIndexEE(did.rawId());
 
         // set pedestal
         // TODO this branch is non-divergent for a group of 10 threads
         if (gainId0 == 1 && use_sample(sample_mask, 0)) {
-          pedestal = static_cast<SampleVector::Scalar>(adc0);
-          num = 1;
-
-          auto const diff = adc1 - adc0;
-          if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) {
-            pedestal = (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
-            num = 2;
-          }
+            pedestal = static_cast<SampleVector::Scalar>(adc0);
+            num=1;
+
+            auto const diff = adc1 - adc0;
+            if (gainId1 == 1 && use_sample(sample_mask, 1)
+                && std::abs(diff) < 3*rms_x12[hashedId]) {
+                pedestal = 
+                    (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
+                num=2;
+            }
         } else {
-          pedestal = mean_x12[ch];
+            pedestal = mean_x12[hashedId]; // conditions are keyed by hashedId, like rms_x12/mean_x6/mean_x1
         }
 
         // ped subtracted and gain-renormalized samples.
-        auto const gainId = ecal::mgpa::gainId(digis[tx]);
-        auto const adc = ecal::mgpa::adc(digis[tx]);
+        auto const gainId = ecal::mgpa::gainId(digis[inputTx]);
+        auto const adc = ecal::mgpa::adc(digis[inputTx]);
 
         bool bad = false;
         SampleVector::Scalar sample_value, sample_value_error;
@@ -879,23 +991,25 @@ namespace ecal {
         // TODO: piece below is general both for amplitudes and timing
         // potentially there is a way to reduce the amount of code...
         if (!use_sample(sample_mask, sample)) {
-          bad = true;
-          sample_value = 0;
-          sample_value_error = 0;
+            bad = true;
+            sample_value = 0;
+            sample_value_error = 0;
         } else if (gainId == 1) {
-          sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
-          sample_value_error = rms_x12[hashedId];
+            sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
+            sample_value_error = rms_x12[hashedId];
         } else if (gainId == 2) {
-          sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
-          sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
+            sample_value =  (static_cast<SampleVector::Scalar>(adc) 
+                - mean_x6[hashedId]) * gain12Over6[hashedId]; 
+            sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
         } else if (gainId == 3) {
-          sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] *
-                         gain12Over6[hashedId];
-          sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
+            sample_value = (static_cast<SampleVector::Scalar>(adc) 
+                - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId];
+            sample_value_error = rms_x1[hashedId] 
+                * gain6Over1[hashedId] * gain12Over6[hashedId];
         } else {
-          sample_value = 0;
-          sample_value_error = 0;
-          bad = true;
+            sample_value = 0;
+            sample_value_error = 0;
+            bad = true;
         }
 
         // TODO: make sure we save things correctly when sample is useless
@@ -907,76 +1021,90 @@ namespace ecal {
         // DEBUG
 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
         if (ch == 0) {
-          printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
-                 sample,
-                 sample_value,
-                 sample_value_error,
-                 useless_sample ? '1' : '0');
+            printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
+                sample, sample_value, sample_value_error, 
+                useless_sample ? '1' : '0');           
         }
 #endif
 
         // store into the shared mem
-        shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
+        shrSampleValues[threadIdx.x] = sample_value_error > 0
+            ? sample_value
+            : std::numeric_limits<ScalarType>::lowest(); // sentinel must lose every max(); min() is > 0 for floating point
         shrSampleValueErrors[threadIdx.x] = sample_value_error;
         __syncthreads();
 
         // perform the reduction with min
         if (sample < 5) {
-          // note, if equal -> we keep the value with lower sample as for cpu
-          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5]
-                                                  ? shrSampleValueErrors[threadIdx.x + 5]
-                                                  : shrSampleValueErrors[threadIdx.x];
-          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]);
+            // note, if equal -> we keep the value with lower sample as for cpu
+            shrSampleValueErrors[threadIdx.x] = 
+                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+5] 
+                ? shrSampleValueErrors[threadIdx.x+5]
+                : shrSampleValueErrors[threadIdx.x];
+            shrSampleValues[threadIdx.x] = 
+                std::max(shrSampleValues[threadIdx.x], 
+                         shrSampleValues[threadIdx.x+5]);
         }
         __syncthreads();
 
         // a bit of an overkill, but easier than to compare across 3 values
-        if (sample < 3) {
-          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3]
-                                                  ? shrSampleValueErrors[threadIdx.x + 3]
-                                                  : shrSampleValueErrors[threadIdx.x];
-          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]);
+        if (sample<3) {
+            shrSampleValueErrors[threadIdx.x] = 
+                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+3]
+                ? shrSampleValueErrors[threadIdx.x+3]
+                : shrSampleValueErrors[threadIdx.x];
+            shrSampleValues[threadIdx.x] = 
+                std::max(shrSampleValues[threadIdx.x], 
+                         shrSampleValues[threadIdx.x+3]);
         }
         __syncthreads();
 
         if (sample < 2) {
-          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2]
-                                                  ? shrSampleValueErrors[threadIdx.x + 2]
-                                                  : shrSampleValueErrors[threadIdx.x];
-          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]);
+            shrSampleValueErrors[threadIdx.x] = 
+                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+2]
+                ? shrSampleValueErrors[threadIdx.x+2]
+                : shrSampleValueErrors[threadIdx.x];
+            shrSampleValues[threadIdx.x] = 
+                std::max(shrSampleValues[threadIdx.x], 
+                         shrSampleValues[threadIdx.x+2]);
         }
         __syncthreads();
-
+ 
         if (sample == 0) {
-          // we only needd the max error
-          auto const maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1]
-                                               ? shrSampleValueErrors[threadIdx.x + 1]
-                                               : shrSampleValueErrors[threadIdx.x];
-
-          // # pedestal samples used
-          pedestal_nums[ch] = num;
-          // this is used downstream
-          ampMaxError[ch] = maxSampleValueError;
-
-          // DEBUG
+            // we only need the max error
+            auto const maxSampleValueError = 
+                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+1]
+                ? shrSampleValueErrors[threadIdx.x+1]
+                : shrSampleValueErrors[threadIdx.x];
+
+            // # pedestal samples used
+            pedestal_nums[ch] = num;
+            // this is used downstream
+            ampMaxError[ch] = maxSampleValueError;
+
+            // DEBUG
 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
-          if (ch == 0) {
-            printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
-          }
+            if (ch == 0) {
+                printf("pedestal_nums = %d ampMaxError = %f\n",
+                    num, maxSampleValueError);
+            }
 #endif
         }
-      }
     }
-
-    ///
-    /// launch context parameters: 1 thread per channel
-    ///
-    //#define DEBUG_TIME_CORRECTION
-    __global__ void kernel_time_correction_and_finalize(
-        //        SampleVector::Scalar const* g_amplitude,
+}
+
+///
+/// launch context parameters: 1 thread per channel
+///
+//#define DEBUG_TIME_CORRECTION
+__global__
+void kernel_time_correction_and_finalize(
+//        SampleVector::Scalar const* g_amplitude,
         ::ecal::reco::StorageScalarType const* g_amplitude,
-        uint16_t const* digis,
-        uint32_t const* dids,
+        uint16_t const* digis_eb,
+        uint32_t const* dids_eb,
+        uint16_t const* digis_ee,
+        uint32_t const* dids_ee,
         float const* amplitudeBinsEB,
         float const* amplitudeBinsEE,
         float const* shiftBinsEB,
@@ -985,9 +1113,9 @@ namespace ecal {
         SampleVector::Scalar const* g_timeError,
         float const* g_rms_x12,
         float const* timeCalibConstant,
-        float* g_jitter,
-        float* g_jitterError,
-        uint32_t* flags,
+        float *g_jitter,
+        float *g_jitterError,
+        uint32_t *flags,
         int const amplitudeBinsSizeEB,
         int const amplitudeBinsSizeEE,
         ConfigurationParameters::type const timeConstantTermEB,
@@ -1007,105 +1135,137 @@ namespace ecal {
         ConfigurationParameters::type const outOfTimeThreshG61mEB,
         ConfigurationParameters::type const outOfTimeThreshG61mEE,
         uint32_t const offsetForHashes,
+        uint32_t const offsetForInputs,
         int const nchannels) {
-      using ScalarType = SampleVector::Scalar;
-
-      // constants
-      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-      // indices
-      int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
-
-      // filter out outside of range threads
-      if (gtx >= nchannels)
-        return;
-
-      auto const did = DetId{dids[gtx]};
-      auto const isBarrel = did.subdetId() == EcalBarrel;
-      auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
-      auto const* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
-      auto const* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
-      auto const amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
-      auto const timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
-      auto const timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
-      auto const offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE;
-      auto const amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
-      auto const outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
-      auto const outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
-      auto const outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
-      auto const outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
-
-      // load some
-      auto const amplitude = g_amplitude[gtx];
-      auto const rms_x12 = g_rms_x12[hashedId];
-      auto const timeCalibConst = timeCalibConstant[hashedId];
-
-      int myBin = -1;
-      for (int bin = 0; bin < amplitudeBinsSize; bin++) {
-        if (amplitude > amplitudeBins[bin])
-          myBin = bin;
-        else
-          break;
-      }
-
-      ScalarType correction = 0;
-      if (myBin == -1) {
+    using ScalarType = SampleVector::Scalar;
+
+    // constants
+    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+    // indices
+    int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
+    int const inputGtx = gtx >= offsetForInputs
+        ? gtx - offsetForInputs
+        : gtx;
+    auto const* dids = gtx >= offsetForInputs
+        ? dids_ee
+        : dids_eb;
+    auto const* digis = gtx >= offsetForInputs // pointer copy, same form as dids above
+        ? digis_ee
+        : digis_eb;
+
+    // filter out outside of range threads
+    if (gtx >= nchannels) return;
+
+    auto const did = DetId{dids[inputGtx]};
+    auto const isBarrel = did.subdetId() == EcalBarrel;
+    auto const hashedId = isBarrel
+        ? hashedIndexEB(did.rawId())
+        : offsetForHashes + hashedIndexEE(did.rawId());
+    auto const* amplitudeBins = isBarrel
+        ? amplitudeBinsEB
+        : amplitudeBinsEE;
+    auto const* shiftBins = isBarrel
+        ? shiftBinsEB
+        : shiftBinsEE;
+    auto const amplitudeBinsSize = isBarrel
+        ? amplitudeBinsSizeEB
+        : amplitudeBinsSizeEE;
+    auto const timeConstantTerm = isBarrel 
+        ? timeConstantTermEB
+        : timeConstantTermEE;
+    auto const timeNconst = isBarrel 
+        ? timeNconstEB
+        : timeNconstEE;
+    auto const offsetTimeValue = isBarrel
+        ? offsetTimeValueEB
+        : offsetTimeValueEE;
+    auto const amplitudeThreshold = isBarrel
+        ? amplitudeThresholdEB
+        : amplitudeThresholdEE;
+    auto const outOfTimeThreshG12p = isBarrel
+        ? outOfTimeThreshG12pEB
+        : outOfTimeThreshG12pEE;
+    auto const outOfTimeThreshG12m = isBarrel
+        ? outOfTimeThreshG12mEB
+        : outOfTimeThreshG12mEE;
+    auto const outOfTimeThreshG61p = isBarrel
+        ? outOfTimeThreshG61pEB
+        : outOfTimeThreshG61pEE;
+    auto const outOfTimeThreshG61m = isBarrel
+        ? outOfTimeThreshG61mEB
+        : outOfTimeThreshG61mEE;
+    
+    // load some
+    auto const amplitude = g_amplitude[gtx];
+    auto const rms_x12 = g_rms_x12[hashedId];
+    auto const timeCalibConst = timeCalibConstant[hashedId];
+
+    int myBin = -1;
+    for (int bin=0; bin<amplitudeBinsSize; bin++) {
+        if (amplitude > amplitudeBins[bin]) 
+            myBin = bin;
+        else 
+            break;
+    }
+
+    ScalarType correction = 0;
+    if (myBin == -1) {
         correction = shiftBins[0];
-      } else if (myBin == amplitudeBinsSize - 1) {
+    } else if (myBin == amplitudeBinsSize-1) {
         correction = shiftBins[myBin];
-      } else {
-        correction = shiftBins[myBin + 1] - shiftBins[myBin];
-        correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
+    } else {
+        correction = shiftBins[myBin+1] - shiftBins[myBin];
+        correction *= (amplitude - amplitudeBins[myBin]) / 
+            (amplitudeBins[myBin+1] - amplitudeBins[myBin]);
         correction += shiftBins[myBin];
-      }
+    }
 
-      // correction * 1./25.
-      correction = correction * 0.04;
-      auto const timeMax = g_timeMax[gtx];
-      auto const timeError = g_timeError[gtx];
-      auto const jitter = timeMax - 5 + correction;
-      auto const jitterError =
-          std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.
+    // correction * 1./25.
+    correction = correction * 0.04;
+    auto const timeMax = g_timeMax[gtx];
+    auto const timeError = g_timeError[gtx];
+    auto const jitter = timeMax - 5 + correction;
+    auto const jitterError = std::sqrt(timeError*timeError + 
+        timeConstantTerm*timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25.
 
 #ifdef DEBUG_TIME_CORRECTION
-      //    if (gtx == 0) {
-      printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
-             gtx,
-             timeMax,
-             timeError,
-             jitter,
-             correction);
+//    if (gtx == 0) {
+        printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
+            gtx, timeMax, timeError, jitter, correction);
 //    }
 #endif
 
-      // store back to  global
-      g_jitter[gtx] = jitter;
-      g_jitterError[gtx] = jitterError;
+    // store back to  global
+    g_jitter[gtx] = jitter;
+    g_jitterError[gtx] = jitterError;
 
-      // set the flag
-      // TODO: replace with something more efficient (if required),
-      // for now just to make it work
-      if (amplitude > amplitudeThreshold * rms_x12) {
+    // set the flag
+    // TODO: replace with something more efficient (if required), 
+    // for now just to make it work
+    if (amplitude > amplitudeThreshold * rms_x12) {
         auto threshP = outOfTimeThreshG12p;
         auto threshM = outOfTimeThreshG12m;
         if (amplitude > 3000.) {
-          for (int isample = 0; isample < nsamples; isample++) {
-            int gainid = ecal::mgpa::gainId(digis[nsamples * gtx + isample]);
-            if (gainid != 1) {
-              threshP = outOfTimeThreshG61p;
-              threshM = outOfTimeThreshG61m;
-              break;
+            for (int isample=0; isample<nsamples; isample++) {
+                int gainid = ecal::mgpa::gainId(digis[nsamples*inputGtx + isample]);
+                if (gainid != 1) {
+                    threshP = outOfTimeThreshG61p;
+                    threshM = outOfTimeThreshG61m;
+                    break;
+                }
             }
-          }
         }
 
-        auto const correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
+        auto const correctedTime = (timeMax - 5) * 25 + 
+            timeCalibConst + offsetTimeValue;
         auto const nterm = timeNconst * rms_x12 / amplitude;
-        auto const sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
-        if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
-          flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
-      }
+        auto const sigmat = std::sqrt(nterm * nterm + 
+            timeConstantTerm*timeConstantTerm);
+        if (correctedTime > sigmat*threshP || 
+            correctedTime < -sigmat*threshM)
+            flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
     }
+}
 
-  }  // namespace multifit
-}  // namespace ecal
+}}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h
index 30f2a6f6b774d..1a5d1a96e65cd 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h
@@ -17,55 +17,60 @@
 
 //#define ECAL_RECO_CUDA_DEBUG
 
-namespace ecal {
-  namespace multifit {
+namespace ecal { namespace multifit {
 
-    __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
-                                                  SampleVector::Scalar const* sample_value_errors,
-                                                  bool const* useless_sample_values,
-                                                  SampleVector::Scalar* chi2s,
-                                                  SampleVector::Scalar* sum0s,
-                                                  SampleVector::Scalar* sumAAs,
-                                                  int const nchannels);
-    //
-    // launch ctx parameters are
-    // 45 threads per channel, X channels per block, Y blocks
-    // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
-    // TODO: it might be much beter to use 32 threads per channel instead of 45
-    // to simplify the synchronization
-    //
-    __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
-                                                  SampleVector::Scalar const* sample_value_errors,
-                                                  uint32_t const* dids,
-                                                  bool const* useless_sample_values,
-                                                  char const* pedestal_nums,
-                                                  ConfigurationParameters::type const* amplitudeFitParametersEB,
-                                                  ConfigurationParameters::type const* amplitudeFitParametersEE,
-                                                  ConfigurationParameters::type const* timeFitParametersEB,
-                                                  ConfigurationParameters::type const* timeFitParametersEE,
-                                                  SampleVector::Scalar const* sumAAsNullHypot,
-                                                  SampleVector::Scalar const* sum0sNullHypot,
-                                                  SampleVector::Scalar* tMaxAlphaBetas,
-                                                  SampleVector::Scalar* tMaxErrorAlphaBetas,
-                                                  SampleVector::Scalar* g_accTimeMax,
-                                                  SampleVector::Scalar* g_accTimeWgt,
-                                                  TimeComputationState* g_state,
-                                                  unsigned int const timeFitParameters_sizeEB,
-                                                  unsigned int const timeFitParameters_sizeEE,
-                                                  ConfigurationParameters::type const timeFitLimits_firstEB,
-                                                  ConfigurationParameters::type const timeFitLimits_firstEE,
-                                                  ConfigurationParameters::type const timeFitLimits_secondEB,
-                                                  ConfigurationParameters::type const timeFitLimits_secondEE,
-                                                  int const nchannels);
+__global__
+void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
+                                   SampleVector::Scalar const* sample_value_errors,
+                                   bool const* useless_sample_values,
+                                   SampleVector::Scalar* chi2s,
+                                   SampleVector::Scalar* sum0s,
+                                   SampleVector::Scalar* sumAAs,
+                                   int const nchannels);
+//
+// launch ctx parameters are 
+// 45 threads per channel, X channels per block, Y blocks
+// 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+// TODO: it might be much better to use 32 threads per channel instead of 45
+// to simplify the synchronization
+//
+__global__
+void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+                                   SampleVector::Scalar const* sample_value_errors,
+                                   uint32_t const* dids_eb,
+                                   uint32_t const* dids_ee,
+                                   bool const* useless_sample_values,
+                                   char const* pedestal_nums,
+                                   ConfigurationParameters::type const* amplitudeFitParametersEB,
+                                   ConfigurationParameters::type const* amplitudeFitParametersEE,
+                                   ConfigurationParameters::type const* timeFitParametersEB,
+                                   ConfigurationParameters::type const* timeFitParametersEE,
+                                   SampleVector::Scalar const* sumAAsNullHypot,
+                                   SampleVector::Scalar const* sum0sNullHypot,
+                                   SampleVector::Scalar* tMaxAlphaBetas,
+                                   SampleVector::Scalar* tMaxErrorAlphaBetas,
+                                   SampleVector::Scalar* g_accTimeMax,
+                                   SampleVector::Scalar* g_accTimeWgt,
+                                   TimeComputationState* g_state,
+                                   unsigned int const timeFitParameters_sizeEB,
+                                   unsigned int const timeFitParameters_sizeEE,
+                                   ConfigurationParameters::type const timeFitLimits_firstEB,
+                                   ConfigurationParameters::type const timeFitLimits_firstEE,
+                                   ConfigurationParameters::type const timeFitLimits_secondEB,
+                                   ConfigurationParameters::type const timeFitLimits_secondEE,
+                                   int const nchannels,
+                                   uint32_t const offsetForInputs);
 
-    /// launch ctx parameters are
-    /// 10 threads per channel, N channels per block, Y blocks
-    /// TODO: do we need to keep the state around or can be removed?!
-    //#define DEBUG_FINDAMPLCHI2_AND_FINISH
-    __global__ void kernel_time_compute_findamplchi2_and_finish(
+/// launch ctx parameters are 
+/// 10 threads per channel, N channels per block, Y blocks
+/// TODO: do we need to keep the state around or can be removed?!
+//#define DEBUG_FINDAMPLCHI2_AND_FINISH
+__global__
+void kernel_time_compute_findamplchi2_and_finish(
         SampleVector::Scalar const* sample_values,
         SampleVector::Scalar const* sample_value_errors,
-        uint32_t const* dids,
+        uint32_t const* dids_eb,
+        uint32_t const* dids_ee,
         bool const* useless_samples,
         SampleVector::Scalar const* g_tMaxAlphaBeta,
         SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
@@ -81,55 +86,69 @@ namespace ecal {
         SampleVector::Scalar* g_ampMaxError,
         SampleVector::Scalar* g_timeMax,
         SampleVector::Scalar* g_timeError,
-        int const nchannels);
+        int const nchannels,
+        uint32_t const offsetForInputs);
 
-    __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis,
-                                                    SampleVector::Scalar* sample_values,
-                                                    SampleVector::Scalar* sample_value_errors,
-                                                    bool* useless_sample_values,
-                                                    unsigned int const sample_mask,
-                                                    int const nchannels);
+__global__
+void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+                                     uint16_t const* digis_ee,
+                                     SampleVector::Scalar* sample_values,
+                                     SampleVector::Scalar* sample_value_errors,
+                                     bool* useless_sample_values,
+                                     unsigned int const sample_mask,
+                                     int const nchannels,
+                                     uint32_t const offsetForInputs);
 
-    __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
-                                             SampleVector::Scalar const* sample_value_errors,
-                                             uint32_t const* dids,
-                                             bool const* useless_samples,
-                                             SampleVector::Scalar const* g_timeMax,
-                                             SampleVector::Scalar const* amplitudeFitParametersEB,
-                                             SampleVector::Scalar const* amplitudeFitParametersEE,
-                                             SampleVector::Scalar* g_amplitudeMax,
-                                             int const nchannels);
+__global__
+void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+                              SampleVector::Scalar const* sample_value_errors,
+                              uint32_t const* dids_eb,  // EB detector ids, passed separately from EE as in the sibling kernels
+                              uint32_t const* dids_ee,  // EE detector ids
+                              bool const* useless_samples,
+                              SampleVector::Scalar const* g_timeMax,
+                              SampleVector::Scalar const* amplitudeFitParametersEB,
+                              SampleVector::Scalar const* amplitudeFitParametersEE,
+                              SampleVector::Scalar* g_amplitudeMax,
+                              int const nchannels,
+                              uint32_t const offsetForInputs);
 
-    //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
-    __global__ void kernel_time_computation_init(uint16_t const* digis,
-                                                 uint32_t const* dids,
-                                                 float const* rms_x12,
-                                                 float const* rms_x6,
-                                                 float const* rms_x1,
-                                                 float const* mean_x12,
-                                                 float const* mean_x6,
-                                                 float const* mean_x1,
-                                                 float const* gain12Over6,
-                                                 float const* gain6Over1,
-                                                 SampleVector::Scalar* sample_values,
-                                                 SampleVector::Scalar* sample_value_errors,
-                                                 SampleVector::Scalar* ampMaxError,
-                                                 bool* useless_sample_values,
-                                                 char* pedestal_nums,
-                                                 uint32_t const offsetForHashes,
-                                                 unsigned int const sample_maskEB,
-                                                 unsigned int const sample_maskEE,
-                                                 int nchannels);
+//#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+__global__
+void kernel_time_computation_init(uint16_t const* digis_eb,
+                                  uint32_t const* dids_eb,
+                                  uint16_t const* digis_ee,
+                                  uint32_t const* dids_ee,
+                                  float const* rms_x12,
+                                  float const* rms_x6,
+                                  float const* rms_x1,
+                                  float const* mean_x12,
+                                  float const* mean_x6,
+                                  float const* mean_x1,
+                                  float const* gain12Over6,
+                                  float const* gain6Over1,
+                                  SampleVector::Scalar* sample_values,
+                                  SampleVector::Scalar* sample_value_errors,
+                                  SampleVector::Scalar* ampMaxError,
+                                  bool* useless_sample_values,
+                                  char* pedestal_nums,
+                                  uint32_t const offsetForHashes,
+                                  uint32_t const offsetForInputs,
+                                  unsigned int const sample_maskEB,
+                                  unsigned int const sample_maskEE,
+                                  int nchannels);
 
-    ///
-    /// launch context parameters: 1 thread per channel
-    ///
-    //#define DEBUG_TIME_CORRECTION
-    __global__ void kernel_time_correction_and_finalize(
-        //        SampleVector::Scalar const* g_amplitude,
+///
+/// launch context parameters: 1 thread per channel
+///
+//#define DEBUG_TIME_CORRECTION
+__global__
+void kernel_time_correction_and_finalize(
+//        SampleVector::Scalar const* g_amplitude,
         ::ecal::reco::StorageScalarType const* g_amplitude,
-        uint16_t const* digis,
-        uint32_t const* dids,
+        uint16_t const* digis_eb,
+        uint32_t const* dids_eb,
+        uint16_t const* digis_ee,
+        uint32_t const* dids_ee,
         float const* amplitudeBinsEB,
         float const* amplitudeBinsEE,
         float const* shiftBinsEB,
@@ -138,9 +157,9 @@ namespace ecal {
         SampleVector::Scalar const* g_timeError,
         float const* g_rms_x12,
         float const* timeCalibConstant,
-        ::ecal::reco::StorageScalarType* g_jitter,
-        ::ecal::reco::StorageScalarType* g_jitterError,
-        uint32_t* flags,
+        ::ecal::reco::StorageScalarType *g_jitter,
+        ::ecal::reco::StorageScalarType *g_jitterError,
+        uint32_t *flags,
         int const amplitudeBinsSizeEB,
         int const amplitudeBinsSizeEE,
         ConfigurationParameters::type const timeConstantTermEB,
@@ -160,9 +179,9 @@ namespace ecal {
         ConfigurationParameters::type const outOfTimeThreshG61mEB,
         ConfigurationParameters::type const outOfTimeThreshG61mEE,
         uint32_t const offsetForHashes,
+        uint32_t const offsetForInputs,
         int const nchannels);
 
-  }  // namespace multifit
-}  // namespace ecal
+}}
 
-#endif  // RecoLocalCalo_EcalRecAlgos_src_TimeComputationKernels
+#endif // RecoLocalCalo_EcalRecAlgos_src_TimeComputationKernels
diff --git a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu
index 98f2899876d43..f657981b95fa0 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu
@@ -1,122 +1,120 @@
 #include "inplace_fnnls.h"
 
-namespace ecal {
-  namespace multifit {
-
-    using matrix_t = SampleMatrix;
-    using vector_t = SampleVector;
-
-    __device__ bool inplace_fnnls(matrix_t const& A,
-                                  vector_t const& b,
-                                  vector_t& x,
-                                  int& npassive,
-                                  BXVectorType& activeBXs,
-                                  PulseMatrixType& pulse_matrix,
-                                  const double eps,
-                                  const unsigned int max_iterations) {
-      matrix_t AtA = A.transpose() * A;
-      vector_t Atb = A.transpose() * b;
-      vector_t s;
-      vector_t w;
-
-      // main loop
-      Eigen::Index w_max_idx_prev = 0;
-      matrix_t::Scalar w_max_prev = 0;
-      double eps_to_use = eps;
-
-      int iter = 0;
-      while (true) {
-        if (iter > 0 || npassive == 0) {
-          const auto nActive = vector_t::RowsAtCompileTime - npassive;
-          if (!nActive)
+namespace ecal { namespace multifit {
+
+using matrix_t = SampleMatrix;
+using vector_t = SampleVector;
+
+__device__
+bool inplace_fnnls(matrix_t& AtA,                   // in/out: rows/columns are swapped as entries change sets
+                   vector_t& Atb,                   // in/out: permuted together with AtA
+                   vector_t& x,                     // in/out: solution, permuted together with AtA
+                   int& npassive,                   // in/out: number of leading entries in the passive set
+                   BXVectorType& activeBXs,         // in/out: permuted together with x
+                   PulseMatrixType& pulse_matrix,   // in/out: columns permuted together with x
+                   const double eps,                // initial convergence threshold on the largest entry of w
+                   const unsigned int max_iterations) {  // cap on outer-loop passes before giving up
+  // NNLS on the precomputed normal equations (AtA, Atb); always returns true, results are left in x/npassive.
+  vector_t s, w;
+
+  // main loop: entries [0, npassive) form the passive set; each pass may move one more entry into it
+  Eigen::Index w_max_idx_prev = 0;
+  matrix_t::Scalar w_max_prev = 0;
+  double eps_to_use = eps;
+
+  int iter = 0;
+  while (true) {
+    if (iter>0 || npassive==0) {
+        const auto nActive = vector_t::RowsAtCompileTime - npassive;
+        if(!nActive)
+          break;
+
+        w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive);
+
+        // get the index of w that gives the maximum gain
+        Eigen::Index w_max_idx;
+        const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx);
+
+        // check for convergence
+        if (max_w < eps_to_use || (w_max_idx==w_max_idx_prev && max_w==w_max_prev))
+          break;
+
+        // worst case: stop once the caller-supplied iteration cap is reached (was hard-coded to 500)
+        if (static_cast<unsigned int>(iter) >= max_iterations)
             break;
 
-          w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive);
+        w_max_prev = max_w;
+        w_max_idx_prev = w_max_idx;
 
-          // get the index of w that gives the maximum gain
-          Eigen::Index w_max_idx;
-          const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx);
+        // need to translate the index into the right part of the vector
+        w_max_idx += npassive;
 
-          // check for convergence
-          if (max_w < eps_to_use || (w_max_idx == w_max_idx_prev && max_w == w_max_prev))
-            break;
-
-          // worst case
-          if (iter >= 500)
-            break;
-
-          w_max_prev = max_w;
-          w_max_idx_prev = w_max_idx;
-
-          // need to translate the index into the right part of the vector
-          w_max_idx += npassive;
-
-          // swap AtA to avoid copy
-          AtA.col(npassive).swap(AtA.col(w_max_idx));
-          AtA.row(npassive).swap(AtA.row(w_max_idx));
-          // swap Atb to match with AtA
-          Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx));
-          Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx));
-          Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx));
-          pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx));
+        // swap AtA to avoid copy
+        AtA.col(npassive).swap(AtA.col(w_max_idx));
+        AtA.row(npassive).swap(AtA.row(w_max_idx));
+        // swap Atb to match with AtA
+        Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx));
+        Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx));
+        Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx));
+        pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx));
 
-          ++npassive;
-        }
+        ++npassive;
+    }
 
-        // inner loop
-        while (true) {
-          if (npassive == 0)
-            break;
+    // inner loop: re-solve on the passive set until every coefficient is strictly positive
+    while (true) {
+      if (npassive == 0) break;
 
-          s.head(npassive) = AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive));
+      s.head(npassive) =
+          AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive));
 
-          // if all coefficients are positive, done for this iteration
-          if (s.head(npassive).minCoeff() > 0.) {
-            x.head(npassive) = s.head(npassive);
-            break;
-          }
+      // if all coefficients are positive, done for this iteration
+      if (s.head(npassive).minCoeff() > 0.) {
+        x.head(npassive) = s.head(npassive);
+        break;
+      }
 
-          auto alpha = std::numeric_limits<double>::max();
-          Eigen::Index alpha_idx = 0;
+      auto alpha = std::numeric_limits<double>::max();
+      Eigen::Index alpha_idx = 0;
 
 #pragma unroll
-          for (auto i = 0; i < npassive; ++i) {
-            if (s[i] <= 0.) {
-              auto const ratio = x[i] / (x[i] - s[i]);
-              if (ratio < alpha) {
-                alpha = ratio;
-                alpha_idx = i;
-              }
-            }
+      for (auto i = 0; i < npassive; ++i) {
+        if (s[i] <= 0.) {
+          auto const ratio = x[i] / (x[i] - s[i]);
+          if (ratio < alpha) {
+            alpha = ratio;
+            alpha_idx = i;
           }
+        }
+      }
 
-          /*
+      /*
       if (std::numeric_limits<double>::max() == alpha) {
         x.head(npassive) = s.head(npassive);
         break;
       }*/
 
-          x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive));
-          x[alpha_idx] = 0;
-          --npassive;
-
-          AtA.col(npassive).swap(AtA.col(alpha_idx));
-          AtA.row(npassive).swap(AtA.row(alpha_idx));
-          // swap Atb to match with AtA
-          Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx));
-          Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx));
-          Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(alpha_idx));
-          pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx));
-        }
-
-        // TODO as in cpu NNLS version
-        iter++;
-        if (iter % 16 == 0)
-          eps_to_use *= 2;
-      }
-
-      return true;
+      x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive));
+      x[alpha_idx] = 0;
+      --npassive;
+
+      AtA.col(npassive).swap(AtA.col(alpha_idx));
+      AtA.row(npassive).swap(AtA.row(alpha_idx));
+      // swap Atb to match with AtA
+      Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx));
+      Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx));
+      Eigen::numext::swap(activeBXs.coeffRef(npassive),
+                          activeBXs.coeffRef(alpha_idx));
+      pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx));
     }
 
-  }  // namespace multifit
-}  // namespace ecal
+    // TODO as in cpu NNLS version: the convergence threshold is doubled every 16 outer passes
+    iter++;
+    if (iter % 16 == 0)
+        eps_to_use *= 2;
+  }
+
+  return true;
+}
+
+}}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h
index 54805a3ab941c..9cda75008cc32 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h
@@ -3,22 +3,22 @@
 
 #include "RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h"
 
-namespace ecal {
-  namespace multifit {
+namespace ecal { namespace multifit {
 
-    using matrix_t = SampleMatrix;
-    using vector_t = SampleVector;
+using matrix_t = SampleMatrix;
+using vector_t = SampleVector;
 
-    __device__ bool inplace_fnnls(matrix_t const& A,
-                                  vector_t const& b,
-                                  vector_t& x,
-                                  int& npassive,
-                                  BXVectorType& activeBXs,
-                                  PulseMatrixType& pulse_matrix,
-                                  const double eps = 1e-11,
-                                  const unsigned int max_iterations = 500);
+__device__
+bool
+inplace_fnnls(matrix_t& AtA,
+              vector_t& Atb,
+              vector_t& x,
+              int& npassive,
+              BXVectorType& activeBXs,
+              PulseMatrixType& pulse_matrix,
+              const double eps = 1e-11,
+              const unsigned int max_iterations = 500);
 
-  }  // namespace multifit
-}  // namespace ecal
+}}
 
 #endif

From 1fea2b70f473163f84c8d9b11243d0bd8856980e Mon Sep 17 00:00:00 2001
From: Viktor Khristenko <vdkhristenko1991@gmail.com>
Date: Mon, 30 Mar 2020 16:31:58 +0200
Subject: [PATCH 05/30] ecal reco producers adapted for 111x

---
 .../plugins/EcalCPUUncalibRecHitProducer.cc   | 200 +++++
 .../plugins/EcalESProducerGPU.h               |  47 +-
 .../plugins/EcalESProducersGPUDefs.cc         |  35 +-
 .../EcalUncalibRecHitConvertGPU2CPUFormat.cc  | 144 ++--
 .../plugins/EcalUncalibRecHitProducerGPU.cc   | 731 ++++++++----------
 5 files changed, 658 insertions(+), 499 deletions(-)
 create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
new file mode 100644
index 0000000000000..9c531d7060525
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
@@ -0,0 +1,200 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h" 
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+
+class EcalCPUUncalibRecHitProducer  // copies the EB/EE uncalibrated rec hits from device memory into host SoA products
+    : public edm::stream::EDProducer<edm::ExternalWork>
+{
+public:
+    explicit EcalCPUUncalibRecHitProducer(edm::ParameterSet const& ps);
+    ~EcalCPUUncalibRecHitProducer() override;
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
+    // acquire() enqueues the device-to-host copies; produce() moves the filled host buffers into the event
+private:
+    void acquire(edm::Event const&, 
+                 edm::EventSetup const&,
+                 edm::WaitingTaskWithArenaHolder) override;
+    void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+    edm::EDGetTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>> 
+        recHitsInEBToken_, recHitsInEEToken_;
+    edm::EDPutTokenT<ecal::UncalibratedRecHit<ecal::Tag::soa>>
+        recHitsOutEBToken_, recHitsOutEEToken_;
+    // host-side buffers: resized and used as copy destinations in acquire(), moved out in produce()
+    ecal::UncalibratedRecHit<ecal::Tag::soa>
+        recHitsEB_, recHitsEE_;
+    bool containsTimingInformation_;  // when true, jitter and jitterError are copied as well
+};
+
+void EcalCPUUncalibRecHitProducer::fillDescriptions(
+        edm::ConfigurationDescriptions& confDesc) {
+    edm::ParameterSetDescription desc;
+    // defaults: inputs come from ecalUncalibRecHitProducerGPU, outputs keep the same instance names, timing is off
+    desc.add<edm::InputTag>("recHitsInLabelEB", 
+        edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"});
+    desc.add<edm::InputTag>("recHitsInLabelEE", 
+        edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"});
+    desc.add<std::string>("recHitsOutLabelEB", "EcalUncalibRecHitsEB");
+    desc.add<std::string>("recHitsOutLabelEE", "EcalUncalibRecHitsEE");
+    desc.add<bool>("containsTimingInformation", false);
+    // register the description under the label below
+    std::string label = "ecalCPUUncalibRecHitProducer";
+    confDesc.add(label, desc);
+}
+
+EcalCPUUncalibRecHitProducer::EcalCPUUncalibRecHitProducer(
+        const edm::ParameterSet& ps) 
+    : recHitsInEBToken_{consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+        ps.getParameter<edm::InputTag>("recHitsInLabelEB"))}
+    , recHitsInEEToken_{consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+        ps.getParameter<edm::InputTag>("recHitsInLabelEE"))}
+    , recHitsOutEBToken_{produces<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
+        ps.getParameter<std::string>("recHitsOutLabelEB"))}
+    , recHitsOutEEToken_{produces<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
+        ps.getParameter<std::string>("recHitsOutLabelEE"))}
+    , containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")}
+{}  // all setup happens in the initialiser list: two consumed device products, two produced host products, one flag
+
+EcalCPUUncalibRecHitProducer::~EcalCPUUncalibRecHitProducer() = default;  // members clean up after themselves
+
+void EcalCPUUncalibRecHitProducer::acquire(
+        edm::Event const& event,
+        edm::EventSetup const& setup,
+        edm::WaitingTaskWithArenaHolder taskHolder)
+{
+    // retrieve the device products; the acquire context (and the stream used below) is built from the EB product
+    auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
+    auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
+    cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
+    auto const& ebRecHits = ctx.get(ebRecHitsProduct);
+    auto const& eeRecHits = ctx.get(eeRecHitsProduct);
+
+    // resize the host buffers to the number of rec hits reported by each device product
+    recHitsEB_.resize(ebRecHits.size);
+    recHitsEE_.resize(eeRecHits.size);
+
+    // enqueue async device-to-host copies; each byte count uses the destination's own element size, not a fixed uint32_t
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(),
+                               ebRecHits.did,
+                               recHitsEB_.did.size() * sizeof(*recHitsEB_.did.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(),
+                               eeRecHits.did,
+                               recHitsEE_.did.size() * sizeof(*recHitsEE_.did.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitudesAll.data(),
+                               ebRecHits.amplitudesAll,
+                               recHitsEB_.amplitudesAll.size() * sizeof(*recHitsEB_.amplitudesAll.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitudesAll.data(),
+                               eeRecHits.amplitudesAll,
+                               recHitsEE_.amplitudesAll.size() * sizeof(*recHitsEE_.amplitudesAll.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitude.data(),
+                               ebRecHits.amplitude,
+                               recHitsEB_.amplitude.size() * sizeof(*recHitsEB_.amplitude.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitude.data(),
+                               eeRecHits.amplitude,
+                               recHitsEE_.amplitude.size() * sizeof(*recHitsEE_.amplitude.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(),
+                               ebRecHits.chi2,
+                               recHitsEB_.chi2.size() * sizeof(*recHitsEB_.chi2.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.chi2.data(),
+                               eeRecHits.chi2,
+                               recHitsEE_.chi2.size() * sizeof(*recHitsEE_.chi2.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.pedestal.data(),
+                               ebRecHits.pedestal,
+                               recHitsEB_.pedestal.size() * sizeof(*recHitsEB_.pedestal.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.pedestal.data(),
+                               eeRecHits.pedestal,
+                               recHitsEE_.pedestal.size() * sizeof(*recHitsEE_.pedestal.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.flags.data(),
+                               ebRecHits.flags,
+                               recHitsEB_.flags.size() * sizeof(*recHitsEB_.flags.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.flags.data(),
+                               eeRecHits.flags,
+                               recHitsEE_.flags.size() * sizeof(*recHitsEE_.flags.data()),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    if (containsTimingInformation_) {  // jitter and jitterError are transferred only when configured
+        cudaCheck( cudaMemcpyAsync(recHitsEB_.jitter.data(),
+                                   ebRecHits.jitter,
+                                   recHitsEB_.jitter.size() * sizeof(*recHitsEB_.jitter.data()),
+                                   cudaMemcpyDeviceToHost,
+                                   ctx.stream()) );
+        cudaCheck( cudaMemcpyAsync(recHitsEE_.jitter.data(),
+                                   eeRecHits.jitter,
+                                   recHitsEE_.jitter.size() * sizeof(*recHitsEE_.jitter.data()),
+                                   cudaMemcpyDeviceToHost,
+                                   ctx.stream()) );
+
+        cudaCheck( cudaMemcpyAsync(recHitsEB_.jitterError.data(),
+                                   ebRecHits.jitterError,
+                                   recHitsEB_.jitterError.size() * sizeof(*recHitsEB_.jitterError.data()),
+                                   cudaMemcpyDeviceToHost,
+                                   ctx.stream()) );
+        cudaCheck( cudaMemcpyAsync(recHitsEE_.jitterError.data(),
+                                   eeRecHits.jitterError,
+                                   recHitsEE_.jitterError.size() * sizeof(*recHitsEE_.jitterError.data()),
+                                   cudaMemcpyDeviceToHost,
+                                   ctx.stream()) );
+    }
+}
+
+void EcalCPUUncalibRecHitProducer::produce(
+        edm::Event& event, 
+        edm::EventSetup const& setup) 
+{
+    // move the host buffers targeted by acquire() into owning products; the members are left moved-from
+    auto recHitsOutEB = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
+        std::move(recHitsEB_));
+    auto recHitsOutEE = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
+        std::move(recHitsEE_));
+
+    // hand both products to the event through the EB/EE output tokens
+    event.put(recHitsOutEBToken_, std::move(recHitsOutEB));
+    event.put(recHitsOutEEToken_, std::move(recHitsOutEE));
+}
+
+DEFINE_FWK_MODULE(EcalCPUUncalibRecHitProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h
index 7c8bfb86dba24..b1509d593f67f 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h
@@ -11,33 +11,34 @@
 
 #include <iostream>
 
-template <typename Target, typename Source, typename Record>
+template<typename Target, typename Source, typename Record>
 class EcalESProducerGPU : public edm::ESProducer {
 public:
-  explicit EcalESProducerGPU(edm::ParameterSet const& ps) : label_{ps.getParameter<std::string>("label")} {
-    std::string name = ps.getParameter<std::string>("ComponentName");
-    setWhatProduced(this, name);
-  }
-
-  std::unique_ptr<Target> produce(Record const& record) {
-    // retrieve conditions in old format
-    edm::ESTransientHandle<Source> product;
-    record.get(label_, product);
-
-    return std::make_unique<Target>(*product);
-  }
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
-
-    std::string label = Target::name() + "ESProducer";
-    desc.add<std::string>("ComponentName", "");
-    desc.add<std::string>("label", "")->setComment("Product Label");
-    confDesc.add(label, desc);
-  }
+    explicit EcalESProducerGPU(edm::ParameterSet const& ps) {
+        auto const label = ps.getParameter<std::string>("label");
+        std::string name = ps.getParameter<std::string>("ComponentName");
+        auto cc = setWhatProduced(this, name);
+        cc.setConsumes(token_, edm::ESInputTag{"", label});
+    }
+   
+    std::unique_ptr<Target> produce(Record const& record) {
+        // retrieve conditions in old format 
+        auto sourceProduct = record.getTransientHandle(token_);
+
+        return std::make_unique<Target>(*sourceProduct);
+    }
+
+    static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+        edm::ParameterSetDescription desc;
+
+        std::string label = Target::name() + "ESProducer";
+        desc.add<std::string>("ComponentName", "");
+        desc.add<std::string>("label", "")->setComment("Product Label");
+        confDesc.add(label, desc);
+    }
 
 private:
-  std::string label_;
+    edm::ESGetToken<Source, Record> token_;
 };
 
 #endif
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
index c851bf24c0e40..24b782b7b434d 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
@@ -18,19 +18,32 @@
 
 #include <iostream>
 
-using EcalPedestalsGPUESProducer = EcalESProducerGPU<EcalPedestalsGPU, EcalPedestals, EcalPedestalsRcd>;
-using EcalGainRatiosGPUESProducer = EcalESProducerGPU<EcalGainRatiosGPU, EcalGainRatios, EcalGainRatiosRcd>;
-using EcalPulseShapesGPUESProducer = EcalESProducerGPU<EcalPulseShapesGPU, EcalPulseShapes, EcalPulseShapesRcd>;
-using EcalPulseCovariancesGPUESProducer =
-    EcalESProducerGPU<EcalPulseCovariancesGPU, EcalPulseCovariances, EcalPulseCovariancesRcd>;
-using EcalSamplesCorrelationGPUESProducer =
-    EcalESProducerGPU<EcalSamplesCorrelationGPU, EcalSamplesCorrelation, EcalSamplesCorrelationRcd>;
+using EcalPedestalsGPUESProducer = EcalESProducerGPU<EcalPedestalsGPU,
+                                                     EcalPedestals,
+                                                     EcalPedestalsRcd>;
+using EcalGainRatiosGPUESProducer = EcalESProducerGPU<EcalGainRatiosGPU,
+                                                      EcalGainRatios,
+                                                      EcalGainRatiosRcd>;
+using EcalPulseShapesGPUESProducer = EcalESProducerGPU<EcalPulseShapesGPU,
+                                                       EcalPulseShapes,
+                                                       EcalPulseShapesRcd>;
+using EcalPulseCovariancesGPUESProducer = EcalESProducerGPU<EcalPulseCovariancesGPU,
+                                                            EcalPulseCovariances,
+                                                            EcalPulseCovariancesRcd>;
+using EcalSamplesCorrelationGPUESProducer = EcalESProducerGPU<
+    EcalSamplesCorrelationGPU,
+    EcalSamplesCorrelation,
+    EcalSamplesCorrelationRcd>;
 
-using EcalTimeBiasCorrectionsGPUESProducer =
-    EcalESProducerGPU<EcalTimeBiasCorrectionsGPU, EcalTimeBiasCorrections, EcalTimeBiasCorrectionsRcd>;
+using EcalTimeBiasCorrectionsGPUESProducer = EcalESProducerGPU<
+    EcalTimeBiasCorrectionsGPU,
+    EcalTimeBiasCorrections,
+    EcalTimeBiasCorrectionsRcd>;
 
-using EcalTimeCalibConstantsGPUESProducer =
-    EcalESProducerGPU<EcalTimeCalibConstantsGPU, EcalTimeCalibConstants, EcalTimeCalibConstantsRcd>;
+using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU<
+    EcalTimeCalibConstantsGPU,
+    EcalTimeCalibConstants,
+    EcalTimeCalibConstantsRcd>;
 
 DEFINE_FWK_EVENTSETUP_MODULE(EcalPedestalsGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalGainRatiosGPUESProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
index 20f51ea5245df..916230516f070 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
@@ -3,7 +3,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/MakerMacros.h" 
 
 // algorithm specific
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
@@ -14,87 +14,103 @@
 
 #include <iostream>
 
-class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
+class EcalUncalibRecHitConvertGPU2CPUFormat
+    : public edm::stream::EDProducer<>
+{
 public:
-  explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
-  ~EcalUncalibRecHitConvertGPU2CPUFormat() override;
-  static void fillDescriptions(edm::ConfigurationDescriptions&);
+    explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
+    ~EcalUncalibRecHitConvertGPU2CPUFormat() override;
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
-  using GPURecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
-  void produce(edm::Event&, edm::EventSetup const&) override;
+    using GPURecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
+    void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-  const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEB_;
-  const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEE_;
+    const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEB_;
+    const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEE_;
 
-  const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_;
+    const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_;
 };
 
-void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
-  edm::ParameterSetDescription desc;
+void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions(
+        edm::ConfigurationDescriptions& confDesc) {
+    edm::ParameterSetDescription desc;
 
-  desc.add<edm::InputTag>("recHitsLabelGPUEB", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"));
-  desc.add<edm::InputTag>("recHitsLabelGPUEE", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"));
+    desc.add<edm::InputTag>("recHitsLabelGPUEB", 
+        edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"));
+    desc.add<edm::InputTag>("recHitsLabelGPUEE", 
+        edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"));
 
-  desc.add<std::string>("recHitsLabelCPUEB", "EcalUncalibRecHitsEB");
-  desc.add<std::string>("recHitsLabelCPUEE", "EcalUncalibRecHitsEE");
+    desc.add<std::string>("recHitsLabelCPUEB", "EcalUncalibRecHitsEB");
+    desc.add<std::string>("recHitsLabelCPUEE", "EcalUncalibRecHitsEE");
 
-  std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat";
-  confDesc.add(label, desc);
+    std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat";
+    confDesc.add(label, desc);
 }
 
-EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps)
+EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat(
+        const edm::ParameterSet& ps) 
     : recHitsGPUEB_{consumes<ecal::SoAUncalibratedRecHitCollection>(
-          ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))},
-      recHitsGPUEE_{
-          consumes<ecal::SoAUncalibratedRecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))},
-      recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")},
-      recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")} {
-  produces<EBUncalibratedRecHitCollection>(recHitsLabelCPUEB_);
-  produces<EEUncalibratedRecHitCollection>(recHitsLabelCPUEE_);
+        ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))}
+    , recHitsGPUEE_{consumes<ecal::SoAUncalibratedRecHitCollection>(
+        ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))}
+    , recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")}
+    , recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")}
+{
+    produces<EBUncalibratedRecHitCollection>(recHitsLabelCPUEB_);
+    produces<EEUncalibratedRecHitCollection>(recHitsLabelCPUEE_);
 }
 
 EcalUncalibRecHitConvertGPU2CPUFormat::~EcalUncalibRecHitConvertGPU2CPUFormat() {}
 
-void EcalUncalibRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
-  edm::Handle<ecal::SoAUncalibratedRecHitCollection> hRecHitsGPUEB, hRecHitsGPUEE;
-  event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
-  event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
-
-  auto recHitsCPUEB = std::make_unique<EBUncalibratedRecHitCollection>();
-  auto recHitsCPUEE = std::make_unique<EEUncalibratedRecHitCollection>();
-  recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size());
-  recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size());
-
-  for (uint32_t i = 0; i < hRecHitsGPUEB->amplitude.size(); ++i) {
-    recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]},
-                               hRecHitsGPUEB->amplitude[i],
-                               hRecHitsGPUEB->pedestal[i],
-                               hRecHitsGPUEB->jitter[i],
-                               hRecHitsGPUEB->chi2[i],
-                               hRecHitsGPUEB->flags[i]);
-    (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]);
-    auto const offset = i * EcalDataFrame::MAXSAMPLES;
-    for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample)
-      (*recHitsCPUEB)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEB->amplitudesAll[offset + sample]);
-  }
-
-  for (uint32_t i = 0; i < hRecHitsGPUEE->amplitude.size(); ++i) {
-    recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]},
-                               hRecHitsGPUEE->amplitude[i],
-                               hRecHitsGPUEE->pedestal[i],
-                               hRecHitsGPUEE->jitter[i],
-                               hRecHitsGPUEE->chi2[i],
-                               hRecHitsGPUEE->flags[i]);
-    (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]);
-    auto const offset = i * EcalDataFrame::MAXSAMPLES;
-    for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample)
-      (*recHitsCPUEE)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEE->amplitudesAll[offset + sample]);
-  }
-
-  event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
-  event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
+void EcalUncalibRecHitConvertGPU2CPUFormat::produce(
+        edm::Event& event, 
+        edm::EventSetup const& setup) 
+{
+    edm::Handle<ecal::SoAUncalibratedRecHitCollection> hRecHitsGPUEB, hRecHitsGPUEE;
+    event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
+    event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
+
+    auto recHitsCPUEB = std::make_unique<EBUncalibratedRecHitCollection>();
+    auto recHitsCPUEE = std::make_unique<EEUncalibratedRecHitCollection>();
+    recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size());
+    recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size());
+
+    for (uint32_t i=0; i<hRecHitsGPUEB->amplitude.size(); ++i) {
+        recHitsCPUEB->emplace_back(
+            DetId{hRecHitsGPUEB->did[i]},
+            hRecHitsGPUEB->amplitude[i],
+            hRecHitsGPUEB->pedestal[i],
+            hRecHitsGPUEB->jitter[i],
+            hRecHitsGPUEB->chi2[i],
+            hRecHitsGPUEB->flags[i]
+        );
+        (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]);
+        auto const offset = i * EcalDataFrame::MAXSAMPLES;
+        for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample) 
+            (*recHitsCPUEB)[i].setOutOfTimeAmplitude(
+                sample, hRecHitsGPUEB->amplitudesAll[offset + sample]);
+    }
+
+    for (uint32_t i=0; i<hRecHitsGPUEE->amplitude.size(); ++i) {
+        recHitsCPUEE->emplace_back(
+            DetId{hRecHitsGPUEE->did[i]},
+            hRecHitsGPUEE->amplitude[i],
+            hRecHitsGPUEE->pedestal[i],
+            hRecHitsGPUEE->jitter[i],
+            hRecHitsGPUEE->chi2[i],
+            hRecHitsGPUEE->flags[i]
+        );
+        (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]);
+        auto const offset = i * EcalDataFrame::MAXSAMPLES;
+        for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample) 
+            (*recHitsCPUEE)[i].setOutOfTimeAmplitude(
+                sample, hRecHitsGPUEE->amplitudesAll[offset + sample]);
+    }
+
+    event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
+    event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
 }
 
 DEFINE_FWK_MODULE(EcalUncalibRecHitConvertGPU2CPUFormat);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
index a90cc1536c482..a96b729223d01 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
@@ -8,7 +8,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/MakerMacros.h" 
 
 // algorithm specific
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
@@ -40,426 +40,355 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
 
-class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
+class EcalUncalibRecHitProducerGPU
+    : public edm::stream::EDProducer<edm::ExternalWork>
+{
 public:
-  explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps);
-  ~EcalUncalibRecHitProducerGPU() override;
-  static void fillDescriptions(edm::ConfigurationDescriptions&);
+    explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps);
+    ~EcalUncalibRecHitProducerGPU() override;
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
-  using RecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
-  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
-  void produce(edm::Event&, edm::EventSetup const&) override;
-
-  void transferToHost(RecHitType& ebRecHits, RecHitType& eeRecHits, cudaStream_t cudaStream);
+    using RecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
+    void acquire(edm::Event const&, 
+                 edm::EventSetup const&,
+                 edm::WaitingTaskWithArenaHolder) override;
+    void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-  edm::EDGetTokenT<EBDigiCollection> digisTokenEB_;
-  edm::EDGetTokenT<EEDigiCollection> digisTokenEE_;
-
-  std::string recHitsLabelEB_, recHitsLabelEE_;
-
-  // conditions handles
-  edm::ESHandle<EcalPedestalsGPU> pedestalsHandle_;
-  edm::ESHandle<EcalGainRatiosGPU> gainRatiosHandle_;
-  edm::ESHandle<EcalPulseShapesGPU> pulseShapesHandle_;
-  edm::ESHandle<EcalPulseCovariancesGPU> pulseCovariancesHandle_;
-  edm::ESHandle<EcalSamplesCorrelationGPU> samplesCorrelationHandle_;
-  edm::ESHandle<EcalTimeBiasCorrectionsGPU> timeBiasCorrectionsHandle_;
-  edm::ESHandle<EcalTimeCalibConstantsGPU> timeCalibConstantsHandle_;
-  edm::ESHandle<EcalSampleMask> sampleMaskHandle_;
-  edm::ESHandle<EcalTimeOffsetConstant> timeOffsetConstantHandle_;
-
-  // configuration parameters
-  ecal::multifit::ConfigurationParameters configParameters_;
+    edm::EDGetTokenT<cms::cuda::Product<ecal::DigisCollection>> digisTokenEB_, digisTokenEE_;
+    edm::EDPutTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>
+        recHitsTokenEB_, recHitsTokenEE_;
+    
+    // conditions handles
+    edm::ESHandle<EcalPedestalsGPU> pedestalsHandle_;
+    edm::ESHandle<EcalGainRatiosGPU> gainRatiosHandle_;
+    edm::ESHandle<EcalPulseShapesGPU> pulseShapesHandle_;
+    edm::ESHandle<EcalPulseCovariancesGPU> pulseCovariancesHandle_;
+    edm::ESHandle<EcalSamplesCorrelationGPU> samplesCorrelationHandle_;
+    edm::ESHandle<EcalTimeBiasCorrectionsGPU> timeBiasCorrectionsHandle_;
+    edm::ESHandle<EcalTimeCalibConstantsGPU> timeCalibConstantsHandle_;
+    edm::ESHandle<EcalSampleMask> sampleMaskHandle_;
+    edm::ESHandle<EcalTimeOffsetConstant> timeOffsetConstantHandle_;
 
-  // event data
-  ecal::multifit::EventInputDataGPU eventInputDataGPU_;
-  ecal::multifit::EventOutputDataGPU eventOutputDataGPU_;
-  ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_;
-  bool shouldTransferToHost_{true};
+    // configuration parameters
+    ecal::multifit::ConfigurationParameters configParameters_;
 
-  cms::cuda::ContextState cudaState_;
+    // event data
+    ecal::multifit::EventOutputDataGPU eventOutputDataGPU_;
+    ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_;
+    bool shouldTransferToHost_{true};
 
-  std::unique_ptr<ecal::UncalibratedRecHit<ecal::Tag::soa>> ebRecHits_{nullptr}, eeRecHits_{nullptr};
+    cms::cuda::ContextState cudaState_;
 
-  uint32_t maxNumberHits_;
+    uint32_t maxNumberHits_;
+    uint32_t neb_, nee_;
 };
 
-void EcalUncalibRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
-  edm::ParameterSetDescription desc;
-
-  desc.add<edm::InputTag>("digisLabelEB", edm::InputTag("ecalDigis", "ebDigis"));
-  desc.add<edm::InputTag>("digisLabelEE", edm::InputTag("ecalDigis", "eeDigis"));
-
-  desc.add<std::string>("recHitsLabelEB", "EcalUncalibRecHitsEB");
-  desc.add<std::string>("recHitsLabelEE", "EcalUncalibRecHitsEE");
-
-  desc.add<std::vector<double>>("EBtimeFitParameters",
-                                {-2.015452e+00,
-                                 3.130702e+00,
-                                 -1.234730e+01,
-                                 4.188921e+01,
-                                 -8.283944e+01,
-                                 9.101147e+01,
-                                 -5.035761e+01,
-                                 1.105621e+01});
-  desc.add<std::vector<double>>("EEtimeFitParameters",
-                                {-2.390548e+00,
-                                 3.553628e+00,
-                                 -1.762341e+01,
-                                 6.767538e+01,
-                                 -1.332130e+02,
-                                 1.407432e+02,
-                                 -7.541106e+01,
-                                 1.620277e+01});
-  desc.add<std::vector<double>>("EBamplitudeFitParameters", {1.138, 1.652});
-  desc.add<std::vector<double>>("EEamplitudeFitParameters", {1.890, 1.400});
-  desc.add<double>("EBtimeFitLimits_Lower", 0.2);
-  desc.add<double>("EBtimeFitLimits_Upper", 1.4);
-  desc.add<double>("EEtimeFitLimits_Lower", 0.2);
-  desc.add<double>("EEtimeFitLimits_Upper", 1.4);
-  desc.add<double>("EBtimeConstantTerm", .6);
-  desc.add<double>("EEtimeConstantTerm", 1.0);
-  desc.add<double>("EBtimeNconst", 28.5);
-  desc.add<double>("EEtimeNconst", 31.8);
-  desc.add<double>("outOfTimeThresholdGain12pEB", 5);
-  desc.add<double>("outOfTimeThresholdGain12mEB", 5);
-  desc.add<double>("outOfTimeThresholdGain61pEB", 5);
-  desc.add<double>("outOfTimeThresholdGain61mEB", 5);
-  desc.add<double>("outOfTimeThresholdGain12pEE", 1000);
-  desc.add<double>("outOfTimeThresholdGain12mEE", 1000);
-  desc.add<double>("outOfTimeThresholdGain61pEE", 1000);
-  desc.add<double>("outOfTimeThresholdGain61mEE", 1000);
-  desc.add<double>("amplitudeThresholdEB", 10);
-  desc.add<double>("amplitudeThresholdEE", 10);
-  desc.add<uint32_t>("maxNumberHits", 20000);  //---- AM TEST
-  desc.add<bool>("shouldTransferToHost", true);
-  desc.add<std::vector<uint32_t>>("kernelMinimizeThreads", {32, 1, 1});
-  // ---- default false or true? It was set to true, but at HLT it is false
-  desc.add<bool>("shouldRunTimingComputation", false);
-  std::string label = "ecalUncalibRecHitProducerGPU";
-  confDesc.add(label, desc);
+void EcalUncalibRecHitProducerGPU::fillDescriptions(
+        edm::ConfigurationDescriptions& confDesc) {
+    edm::ParameterSetDescription desc;
+
+    desc.add<edm::InputTag>("digisLabelEB", 
+        edm::InputTag("ecalRawToDigiGPU", "ebDigisGPU"));
+    desc.add<edm::InputTag>("digisLabelEE", 
+        edm::InputTag("ecalRawToDigiGPU", "eeDigisGPU"));
+
+    desc.add<std::string>("recHitsLabelEB", "EcalUncalibRecHitsEB");
+    desc.add<std::string>("recHitsLabelEE", "EcalUncalibRecHitsEE");
+
+    desc.add<std::vector<double>>("EBtimeFitParameters", 
+        {-2.015452e+00, 3.130702e+00, -1.234730e+01, 4.188921e+01, -8.283944e+01, 
+         9.101147e+01, -5.035761e+01, 1.105621e+01});
+    desc.add<std::vector<double>>("EEtimeFitParameters", 
+        {-2.390548e+00, 3.553628e+00, -1.762341e+01, 6.767538e+01, -1.332130e+02, 
+         1.407432e+02, -7.541106e+01, 1.620277e+01});
+    desc.add<std::vector<double>>("EBamplitudeFitParameters", {1.138,1.652});
+    desc.add<std::vector<double>>("EEamplitudeFitParameters", {1.890,1.400});
+    desc.add<double>("EBtimeFitLimits_Lower", 0.2);
+    desc.add<double>("EBtimeFitLimits_Upper", 1.4);
+    desc.add<double>("EEtimeFitLimits_Lower", 0.2);
+    desc.add<double>("EEtimeFitLimits_Upper", 1.4);
+    desc.add<double>("EBtimeConstantTerm", .6);
+    desc.add<double>("EEtimeConstantTerm", 1.0);
+    desc.add<double>("EBtimeNconst", 28.5);
+    desc.add<double>("EEtimeNconst", 31.8);
+    desc.add<double>("outOfTimeThresholdGain12pEB", 5);
+    desc.add<double>("outOfTimeThresholdGain12mEB", 5);
+    desc.add<double>("outOfTimeThresholdGain61pEB", 5);
+    desc.add<double>("outOfTimeThresholdGain61mEB", 5);
+    desc.add<double>("outOfTimeThresholdGain12pEE", 1000);
+    desc.add<double>("outOfTimeThresholdGain12mEE", 1000);
+    desc.add<double>("outOfTimeThresholdGain61pEE", 1000);
+    desc.add<double>("outOfTimeThresholdGain61mEE", 1000);
+    desc.add<double>("amplitudeThresholdEB", 10);
+    desc.add<double>("amplitudeThresholdEE", 10);
+    desc.add<uint32_t>("maxNumberHits", 20000);   //---- AM TEST
+    desc.add<bool>("shouldTransferToHost", true);
+    desc.add<std::vector<uint32_t>>("kernelMinimizeThreads", {32, 1, 1});
+    // ---- default false or true? It was set to true, but at HLT it is false
+    desc.add<bool>("shouldRunTimingComputation", false);
+    std::string label = "ecalUncalibRecHitProducerGPU";
+    confDesc.add(label, desc);
 }
 
-EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(const edm::ParameterSet& ps) {
-  digisTokenEB_ = consumes<EBDigiCollection>(ps.getParameter<edm::InputTag>("digisLabelEB"));
-  digisTokenEE_ = consumes<EEDigiCollection>(ps.getParameter<edm::InputTag>("digisLabelEE"));
-
-  recHitsLabelEB_ = ps.getParameter<std::string>("recHitsLabelEB");
-  recHitsLabelEE_ = ps.getParameter<std::string>("recHitsLabelEE");
-
-  auto EBamplitudeFitParameters = ps.getParameter<std::vector<double>>("EBamplitudeFitParameters");
-  auto EEamplitudeFitParameters = ps.getParameter<std::vector<double>>("EEamplitudeFitParameters");
-  auto EBtimeFitParameters = ps.getParameter<std::vector<double>>("EBtimeFitParameters");
-  auto EEtimeFitParameters = ps.getParameter<std::vector<double>>("EEtimeFitParameters");
-  std::pair<double, double> EBtimeFitLimits, EEtimeFitLimits;
-  EBtimeFitLimits.first = ps.getParameter<double>("EBtimeFitLimits_Lower");
-  EBtimeFitLimits.second = ps.getParameter<double>("EBtimeFitLimits_Upper");
-  EEtimeFitLimits.first = ps.getParameter<double>("EEtimeFitLimits_Lower");
-  EEtimeFitLimits.second = ps.getParameter<double>("EEtimeFitLimits_Upper");
-
-  auto EBtimeConstantTerm = ps.getParameter<double>("EBtimeConstantTerm");
-  auto EEtimeConstantTerm = ps.getParameter<double>("EEtimeConstantTerm");
-  auto EBtimeNconst = ps.getParameter<double>("EBtimeNconst");
-  auto EEtimeNconst = ps.getParameter<double>("EEtimeNconst");
-
-  auto outOfTimeThreshG12pEB = ps.getParameter<double>("outOfTimeThresholdGain12pEB");
-  auto outOfTimeThreshG12mEB = ps.getParameter<double>("outOfTimeThresholdGain12mEB");
-  auto outOfTimeThreshG61pEB = ps.getParameter<double>("outOfTimeThresholdGain61pEB");
-  auto outOfTimeThreshG61mEB = ps.getParameter<double>("outOfTimeThresholdGain61mEB");
-  auto outOfTimeThreshG12pEE = ps.getParameter<double>("outOfTimeThresholdGain12pEE");
-  auto outOfTimeThreshG12mEE = ps.getParameter<double>("outOfTimeThresholdGain12mEE");
-  auto outOfTimeThreshG61pEE = ps.getParameter<double>("outOfTimeThresholdGain61pEE");
-  auto outOfTimeThreshG61mEE = ps.getParameter<double>("outOfTimeThresholdGain61mEE");
-  auto amplitudeThreshEB = ps.getParameter<double>("amplitudeThresholdEB");
-  auto amplitudeThreshEE = ps.getParameter<double>("amplitudeThresholdEE");
-
-  // max number of digis to allocate for
-  maxNumberHits_ = ps.getParameter<uint32_t>("maxNumberHits");
-
-  // transfer to host switch
-  shouldTransferToHost_ = ps.getParameter<bool>("shouldTransferToHost");
-
-  // switch to run timing computation kernels
-  configParameters_.shouldRunTimingComputation = ps.getParameter<bool>("shouldRunTimingComputation");
-
-  // minimize kernel launch conf
-  auto threadsMinimize = ps.getParameter<std::vector<uint32_t>>("kernelMinimizeThreads");
-  configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0];
-  configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
-  configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
-
-  produces<ecal::SoAUncalibratedRecHitCollection>(recHitsLabelEB_);
-  produces<ecal::SoAUncalibratedRecHitCollection>(recHitsLabelEE_);
-
-  //
-  // configuration and physics parameters: done once
-  // assume there is a single device
-  // use sync copying
-  //
-
-  // amplitude fit parameters copying
-  cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB,
-                       sizeof(ecal::multifit::ConfigurationParameters::type) * EBamplitudeFitParameters.size()));
-  cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEB,
-                       EBamplitudeFitParameters.data(),
-                       EBamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
-                       cudaMemcpyHostToDevice));
-  cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE,
-                       sizeof(ecal::multifit::ConfigurationParameters::type) * EEamplitudeFitParameters.size()));
-  cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEE,
-                       EEamplitudeFitParameters.data(),
-                       EEamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
-                       cudaMemcpyHostToDevice));
-
-  // time fit parameters and limits
-  configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size();
-  configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size();
-  configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first;
-  configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second;
-  configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first;
-  configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second;
-  cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEB,
-                       sizeof(ecal::multifit::ConfigurationParameters::type) * EBtimeFitParameters.size()));
-  cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEB,
-                       EBtimeFitParameters.data(),
-                       EBtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
-                       cudaMemcpyHostToDevice));
-  cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEE,
-                       sizeof(ecal::multifit::ConfigurationParameters::type) * EEtimeFitParameters.size()));
-  cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEE,
-                       EEtimeFitParameters.data(),
-                       EEtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
-                       cudaMemcpyHostToDevice));
-
-  // time constant terms
-  configParameters_.timeConstantTermEB = EBtimeConstantTerm;
-  configParameters_.timeConstantTermEE = EEtimeConstantTerm;
-
-  // time N const
-  configParameters_.timeNconstEB = EBtimeNconst;
-  configParameters_.timeNconstEE = EEtimeNconst;
-
-  // amplitude threshold for time flags
-  configParameters_.amplitudeThreshEB = amplitudeThreshEB;
-  configParameters_.amplitudeThreshEE = amplitudeThreshEE;
-
-  // out of time thresholds gain-dependent
-  configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB;
-  configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE;
-  configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB;
-  configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE;
-  configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB;
-  configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE;
-  configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB;
-  configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE;
-
-  // allocate event input data
-  eventInputDataGPU_.allocate(maxNumberHits_);
-
-  // allocate event output data
-  eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_);
-
-  // allocate scratch data for gpu
-  eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_);
+EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(
+        const edm::ParameterSet& ps) 
+    : digisTokenEB_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
+        ps.getParameter<edm::InputTag>("digisLabelEB"))}
+    , digisTokenEE_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
+        ps.getParameter<edm::InputTag>("digisLabelEE"))}
+    , recHitsTokenEB_{produces<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+        ps.getParameter<std::string>("recHitsLabelEB"))}
+    , recHitsTokenEE_{produces<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+        ps.getParameter<std::string>("recHitsLabelEE"))}
+{
+    auto EBamplitudeFitParameters = ps.getParameter<std::vector<double>>(
+        "EBamplitudeFitParameters");
+    auto EEamplitudeFitParameters = ps.getParameter<std::vector<double>>(
+        "EEamplitudeFitParameters");
+    auto EBtimeFitParameters = ps.getParameter<std::vector<double>>(
+        "EBtimeFitParameters");
+    auto EEtimeFitParameters = ps.getParameter<std::vector<double>>(
+        "EEtimeFitParameters");
+    std::pair<double, double> EBtimeFitLimits, EEtimeFitLimits;
+    EBtimeFitLimits.first  = ps.getParameter<double>("EBtimeFitLimits_Lower");
+    EBtimeFitLimits.second = ps.getParameter<double>("EBtimeFitLimits_Upper");
+    EEtimeFitLimits.first  = ps.getParameter<double>("EEtimeFitLimits_Lower");
+    EEtimeFitLimits.second = ps.getParameter<double>("EEtimeFitLimits_Upper");
+
+    auto EBtimeConstantTerm = ps.getParameter<double>("EBtimeConstantTerm");
+    auto EEtimeConstantTerm = ps.getParameter<double>("EEtimeConstantTerm");
+    auto EBtimeNconst = ps.getParameter<double>("EBtimeNconst");
+    auto EEtimeNconst = ps.getParameter<double>("EEtimeNconst");
+
+    auto outOfTimeThreshG12pEB = ps.getParameter<double>(
+        "outOfTimeThresholdGain12pEB");
+    auto outOfTimeThreshG12mEB = ps.getParameter<double>(
+        "outOfTimeThresholdGain12mEB");
+    auto outOfTimeThreshG61pEB = ps.getParameter<double>(
+        "outOfTimeThresholdGain61pEB");
+    auto outOfTimeThreshG61mEB = ps.getParameter<double>(
+        "outOfTimeThresholdGain61mEB");
+    auto outOfTimeThreshG12pEE = ps.getParameter<double>(
+        "outOfTimeThresholdGain12pEE");
+    auto outOfTimeThreshG12mEE = ps.getParameter<double>(
+        "outOfTimeThresholdGain12mEE");
+    auto outOfTimeThreshG61pEE = ps.getParameter<double>(
+        "outOfTimeThresholdGain61pEE");
+    auto outOfTimeThreshG61mEE = ps.getParameter<double>(
+        "outOfTimeThresholdGain61mEE");
+    auto amplitudeThreshEB = ps.getParameter<double>("amplitudeThresholdEB");
+    auto amplitudeThreshEE = ps.getParameter<double>("amplitudeThresholdEE");
+
+    // max number of digis to allocate for
+    maxNumberHits_ = ps.getParameter<uint32_t>("maxNumberHits");
+
+    // transfer to host switch
+    shouldTransferToHost_ = ps.getParameter<bool>("shouldTransferToHost");
+
+    // switch to run timing computation kernels
+    configParameters_.shouldRunTimingComputation = 
+        ps.getParameter<bool>("shouldRunTimingComputation");
+
+    // minimize kernel launch conf
+    auto threadsMinimize = ps.getParameter<std::vector<uint32_t>>("kernelMinimizeThreads");
+    configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0];
+    configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
+    configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
+
+    //
+    // configuration and physics parameters: done once
+    // assume there is a single device
+    // use sync copying
+    //
+
+    // amplitude fit parameters copying
+    cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB,
+        sizeof(ecal::multifit::ConfigurationParameters::type) 
+        * EBamplitudeFitParameters.size()) );
+    cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEB,
+        EBamplitudeFitParameters.data(),
+        EBamplitudeFitParameters.size() * 
+        sizeof(ecal::multifit::ConfigurationParameters::type),
+        cudaMemcpyHostToDevice) );
+    cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE,
+        sizeof(ecal::multifit::ConfigurationParameters::type) * 
+        EEamplitudeFitParameters.size()) );
+    cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEE,
+        EEamplitudeFitParameters.data(),
+        EEamplitudeFitParameters.size() * 
+        sizeof(ecal::multifit::ConfigurationParameters::type),
+        cudaMemcpyHostToDevice) );
+
+    // time fit parameters and limits
+    configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size();
+    configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size();
+    configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first;
+    configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second;
+    configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first;
+    configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second;
+    cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEB,
+        sizeof(ecal::multifit::ConfigurationParameters::type) 
+        * EBtimeFitParameters.size()) );
+    cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEB,
+        EBtimeFitParameters.data(),
+        EBtimeFitParameters.size() * 
+        sizeof(ecal::multifit::ConfigurationParameters::type),
+        cudaMemcpyHostToDevice) );
+    cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEE,
+        sizeof(ecal::multifit::ConfigurationParameters::type) 
+        * EEtimeFitParameters.size()) );
+    cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEE,
+        EEtimeFitParameters.data(),
+        EEtimeFitParameters.size() 
+        * sizeof(ecal::multifit::ConfigurationParameters::type),
+        cudaMemcpyHostToDevice) );
+
+    // time constant terms
+    configParameters_.timeConstantTermEB = EBtimeConstantTerm;
+    configParameters_.timeConstantTermEE = EEtimeConstantTerm;
+
+    // time N const 
+    configParameters_.timeNconstEB = EBtimeNconst;
+    configParameters_.timeNconstEE = EEtimeNconst;
+
+    // amplitude threshold for time flags
+    configParameters_.amplitudeThreshEB = amplitudeThreshEB;
+    configParameters_.amplitudeThreshEE = amplitudeThreshEE;
+
+    // out of time thresholds gain-dependent
+    configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB;
+    configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE;
+    configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB;
+    configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE;
+    configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB;
+    configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE;
+    configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB;
+    configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE;
+
+    // allocate event output data
+    eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_);
+
+    // allocate scratch data for gpu
+    eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_);
 }
 
 EcalUncalibRecHitProducerGPU::~EcalUncalibRecHitProducerGPU() {
-  //
-  // assume single device for now
-  //
-
-  if (configParameters_.amplitudeFitParametersEB) {
-    // configuration parameters
-    cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEB));
-    cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEE));
-    cudaCheck(cudaFree(configParameters_.timeFitParametersEB));
-    cudaCheck(cudaFree(configParameters_.timeFitParametersEE));
-
-    // free event input data
-    eventInputDataGPU_.deallocate();
-
-    // free event ouput data
-    eventOutputDataGPU_.deallocate(configParameters_);
-
-    // free event scratch data
-    eventDataForScratchGPU_.deallocate(configParameters_);
-  }
-}
-
-void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event,
-                                           edm::EventSetup const& setup,
-                                           edm::WaitingTaskWithArenaHolder holder) {
-  //DurationMeasurer<std::chrono::milliseconds> timer{std::string{"acquire duration"}};
-
-  // raii
-  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
-
-  // conditions
-  setup.get<EcalPedestalsRcd>().get(pedestalsHandle_);
-  setup.get<EcalGainRatiosRcd>().get(gainRatiosHandle_);
-  setup.get<EcalPulseShapesRcd>().get(pulseShapesHandle_);
-  setup.get<EcalPulseCovariancesRcd>().get(pulseCovariancesHandle_);
-  setup.get<EcalSamplesCorrelationRcd>().get(samplesCorrelationHandle_);
-  setup.get<EcalTimeBiasCorrectionsRcd>().get(timeBiasCorrectionsHandle_);
-  setup.get<EcalTimeCalibConstantsRcd>().get(timeCalibConstantsHandle_);
-  setup.get<EcalSampleMaskRcd>().get(sampleMaskHandle_);
-  setup.get<EcalTimeOffsetConstantRcd>().get(timeOffsetConstantHandle_);
-
-  auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream());
-  auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream());
-  auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream());
-  auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream());
-  auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream());
-  auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream());
-  auto const& timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream());
-
-  // bundle up conditions
-  ecal::multifit::ConditionsProducts conditions{pedProduct,
-                                                gainsProduct,
-                                                pulseShapesProduct,
-                                                pulseCovariancesProduct,
-                                                samplesCorrelationProduct,
-                                                timeBiasCorrectionsProduct,
-                                                timeCalibConstantsProduct,
-                                                *sampleMaskHandle_,
-                                                *timeOffsetConstantHandle_,
-                                                timeCalibConstantsHandle_->getOffset()};
-
-  //
-  // retrieve collections
-  //
-  edm::Handle<EBDigiCollection> ebDigis;
-  edm::Handle<EEDigiCollection> eeDigis;
-  event.getByToken(digisTokenEB_, ebDigis);
-  event.getByToken(digisTokenEE_, eeDigis);
-
-  ecal::multifit::EventInputDataCPU eventInputDataCPU{*ebDigis, *eeDigis};
-
-  //
-  // schedule algorithms
-  //
-  ecal::multifit::entryPoint(eventInputDataCPU,
-                             eventInputDataGPU_,
-                             eventOutputDataGPU_,
-                             eventDataForScratchGPU_,
-                             conditions,
-                             configParameters_,
-                             ctx.stream());
-
-  ebRecHits_ = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>();
-  eeRecHits_ = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>();
-
-  if (shouldTransferToHost_) {
-    // allocate for the result while kernels are running
-    ebRecHits_->resize(ebDigis->size());
-    eeRecHits_->resize(eeDigis->size());
-
-    // det ids are host copy only - no need to run device -> host
-    std::memcpy(ebRecHits_->did.data(), ebDigis->ids().data(), ebDigis->ids().size() * sizeof(uint32_t));
-    std::memcpy(eeRecHits_->did.data(), eeDigis->ids().data(), eeDigis->ids().size() * sizeof(uint32_t));
-  }
+    //
+    // assume single device for now
+    //
+
+    if (configParameters_.amplitudeFitParametersEB) {
+        // configuration parameters
+        cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEB) );
+        cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEE) );
+        cudaCheck( cudaFree(configParameters_.timeFitParametersEB) );
+        cudaCheck( cudaFree(configParameters_.timeFitParametersEE) );
+
+        // free event ouput data 
+        eventOutputDataGPU_.deallocate(configParameters_);
+
+        // free event scratch data
+        eventDataForScratchGPU_.deallocate(configParameters_);
+    }
 }
 
-void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
-  //DurationMeasurer<std::chrono::milliseconds> timer{std::string{"produce duration"}};
-  cms::cuda::ScopedContextProduce ctx{cudaState_};
-
-  if (shouldTransferToHost_) {
-    // rec hits objects were not originally member variables
-    transferToHost(*ebRecHits_, *eeRecHits_, ctx.stream());
-
-    // TODO
-    // for now just sync on the host when transferring back products
-    cudaStreamSynchronize(ctx.stream());
-  }
-
-  event.put(std::move(ebRecHits_), recHitsLabelEB_);
-  event.put(std::move(eeRecHits_), recHitsLabelEE_);
+void EcalUncalibRecHitProducerGPU::acquire(
+        edm::Event const& event,
+        edm::EventSetup const& setup,
+        edm::WaitingTaskWithArenaHolder holder) 
+{
+    // cuda products
+    auto const& ebDigisProduct = event.get(digisTokenEB_);
+    auto const& eeDigisProduct = event.get(digisTokenEE_);
+    
+    // raii
+    cms::cuda::ScopedContextAcquire ctx{ebDigisProduct, std::move(holder), cudaState_};
+
+    // get actual obj
+    auto const& ebDigis = ctx.get(ebDigisProduct);
+    auto const& eeDigis = ctx.get(eeDigisProduct);
+    ecal::multifit::EventInputDataGPU inputDataGPU{ebDigis, eeDigis};
+    neb_ = ebDigis.ndigis;
+    nee_ = eeDigis.ndigis;
+
+    // conditions
+    setup.get<EcalPedestalsRcd>().get(pedestalsHandle_);
+    setup.get<EcalGainRatiosRcd>().get(gainRatiosHandle_);
+    setup.get<EcalPulseShapesRcd>().get(pulseShapesHandle_);
+    setup.get<EcalPulseCovariancesRcd>().get(pulseCovariancesHandle_);
+    setup.get<EcalSamplesCorrelationRcd>().get(samplesCorrelationHandle_);
+    setup.get<EcalTimeBiasCorrectionsRcd>().get(timeBiasCorrectionsHandle_);
+    setup.get<EcalTimeCalibConstantsRcd>().get(timeCalibConstantsHandle_);
+    setup.get<EcalSampleMaskRcd>().get(sampleMaskHandle_);
+    setup.get<EcalTimeOffsetConstantRcd>().get(timeOffsetConstantHandle_);
+
+    auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream());
+    auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream());
+    auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream());
+    auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream());
+    auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream());
+    auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream());
+    auto const& timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream());
+
+    // bundle up conditions
+    ecal::multifit::ConditionsProducts conditions {
+        pedProduct, gainsProduct, pulseShapesProduct,
+        pulseCovariancesProduct, 
+        samplesCorrelationProduct,
+        timeBiasCorrectionsProduct,
+        timeCalibConstantsProduct,
+        *sampleMaskHandle_,
+        *timeOffsetConstantHandle_,
+        timeCalibConstantsHandle_->getOffset()
+    };
+    
+    //
+    // schedule algorithms
+    //
+    ecal::multifit::entryPoint(
+        inputDataGPU,
+        eventOutputDataGPU_,
+        eventDataForScratchGPU_,
+        conditions,
+        configParameters_,
+        ctx.stream()
+    );
 }
 
-void EcalUncalibRecHitProducerGPU::transferToHost(RecHitType& ebRecHits,
-                                                  RecHitType& eeRecHits,
-                                                  cudaStream_t cudaStream) {
-  cudaCheck(cudaMemcpyAsync(ebRecHits.amplitude.data(),
-                            eventOutputDataGPU_.amplitude,
-                            ebRecHits.amplitude.size() * sizeof(ecal::reco::StorageScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-  cudaCheck(cudaMemcpyAsync(eeRecHits.amplitude.data(),
-                            eventOutputDataGPU_.amplitude + ebRecHits.amplitude.size(),
-                            eeRecHits.amplitude.size() * sizeof(ecal::reco::StorageScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-
-  cudaCheck(cudaMemcpyAsync(ebRecHits.pedestal.data(),
-                            eventOutputDataGPU_.pedestal,
-                            ebRecHits.pedestal.size() * sizeof(ecal::reco::StorageScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-  cudaCheck(cudaMemcpyAsync(eeRecHits.pedestal.data(),
-                            eventOutputDataGPU_.pedestal + ebRecHits.pedestal.size(),
-                            eeRecHits.pedestal.size() * sizeof(ecal::reco::StorageScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-
-  cudaCheck(cudaMemcpyAsync(ebRecHits.chi2.data(),
-                            eventOutputDataGPU_.chi2,
-                            ebRecHits.chi2.size() * sizeof(ecal::reco::StorageScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-  cudaCheck(cudaMemcpyAsync(eeRecHits.chi2.data(),
-                            eventOutputDataGPU_.chi2 + ebRecHits.chi2.size(),
-                            eeRecHits.chi2.size() * sizeof(ecal::reco::StorageScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-
-  if (configParameters_.shouldRunTimingComputation) {
-    cudaCheck(cudaMemcpyAsync(ebRecHits.jitter.data(),
-                              eventOutputDataGPU_.jitter,
-                              ebRecHits.jitter.size() * sizeof(ecal::reco::StorageScalarType),
-                              cudaMemcpyDeviceToHost,
-                              cudaStream));
-    cudaCheck(cudaMemcpyAsync(eeRecHits.jitter.data(),
-                              eventOutputDataGPU_.jitter + ebRecHits.jitter.size(),
-                              eeRecHits.jitter.size() * sizeof(ecal::reco::StorageScalarType),
-                              cudaMemcpyDeviceToHost,
-                              cudaStream));
-
-    cudaCheck(cudaMemcpyAsync(ebRecHits.jitterError.data(),
-                              eventOutputDataGPU_.jitterError,
-                              ebRecHits.jitterError.size() * sizeof(ecal::reco::StorageScalarType),
-                              cudaMemcpyDeviceToHost,
-                              cudaStream));
-    cudaCheck(cudaMemcpyAsync(eeRecHits.jitterError.data(),
-                              eventOutputDataGPU_.jitterError + ebRecHits.jitterError.size(),
-                              eeRecHits.jitterError.size() * sizeof(ecal::reco::StorageScalarType),
-                              cudaMemcpyDeviceToHost,
-                              cudaStream));
-  }
-
-  cudaCheck(cudaMemcpyAsync(ebRecHits.flags.data(),
-                            eventOutputDataGPU_.flags,
-                            ebRecHits.flags.size() * sizeof(uint32_t),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-  cudaCheck(cudaMemcpyAsync(eeRecHits.flags.data(),
-                            eventOutputDataGPU_.flags + ebRecHits.flags.size(),
-                            eeRecHits.flags.size() * sizeof(uint32_t),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-
-  cudaCheck(cudaMemcpyAsync(ebRecHits.amplitudesAll.data(),
-                            eventOutputDataGPU_.amplitudesAll,
-                            ebRecHits.amplitudesAll.size() * sizeof(ecal::reco::ComputationScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
-  cudaCheck(cudaMemcpyAsync(eeRecHits.amplitudesAll.data(),
-                            eventOutputDataGPU_.amplitudesAll + ebRecHits.amplitudesAll.size(),
-                            eeRecHits.amplitudesAll.size() * sizeof(ecal::reco::ComputationScalarType),
-                            cudaMemcpyDeviceToHost,
-                            cudaStream));
+void EcalUncalibRecHitProducerGPU::produce(
+        edm::Event& event, 
+        edm::EventSetup const& setup) 
+{
+    //DurationMeasurer<std::chrono::milliseconds> timer{std::string{"produce duration"}};
+    cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+    // copy construct output collections
+    // note, output collections do not own device memory!
+    ecal::UncalibratedRecHit<ecal::Tag::ptr> 
+        ebRecHits{eventOutputDataGPU_},
+        eeRecHits{eventOutputDataGPU_};
+
+    // set the size of eb and ee
+    ebRecHits.size = neb_;
+    eeRecHits.size = nee_;
+
+    // shift ptrs for ee
+    eeRecHits.amplitudesAll += neb_ * EcalDataFrame::MAXSAMPLES;
+    eeRecHits.amplitude += neb_;
+    eeRecHits.chi2 += neb_;
+    eeRecHits.pedestal += neb_;
+    eeRecHits.did += neb_;
+    eeRecHits.flags += neb_;
+    if (configParameters_.shouldRunTimingComputation) {
+        eeRecHits.jitter += neb_;
+        eeRecHits.jitterError += neb_;
+    }
+
+    // put into the event
+    ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits));
+    ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits));
 }
 
 DEFINE_FWK_MODULE(EcalUncalibRecHitProducerGPU);

From 6e0f6304b3d85943b3b4f218e45b090f5bce7f5b Mon Sep 17 00:00:00 2001
From: Viktor Khristenko <vdkhristenko1991@gmail.com>
Date: Wed, 1 Apr 2020 12:01:28 +0200
Subject: [PATCH 06/30] make sure proper types are deduced for cuda copies

---
 .../plugins/EcalCPUUncalibRecHitProducer.cc   | 114 +++++-------------
 1 file changed, 30 insertions(+), 84 deletions(-)

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
index 9c531d7060525..9661f98139f7b 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
@@ -90,95 +90,41 @@ void EcalCPUUncalibRecHitProducer::acquire(
     recHitsEB_.resize(ebRecHits.size);
     recHitsEE_.resize(eeRecHits.size);
 
+    auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
+        using vector_type = typename std::remove_reference<decltype(dest)>::type;
+        using type = typename vector_type::value_type;
+        cudaCheck(cudaMemcpyAsync(dest.data(),
+                                  src,
+                                  dest.size() * sizeof(type),
+                                  cudaMemcpyDeviceToHost,
+                                  ctx.stream()));
+    };
+
     // enqeue transfers
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(),
-                               ebRecHits.did,
-                               recHitsEB_.did.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(),
-                               eeRecHits.did,
-                               recHitsEE_.did.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitudesAll.data(),
-                               ebRecHits.amplitudesAll,
-                               recHitsEB_.amplitudesAll.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitudesAll.data(),
-                               eeRecHits.amplitudesAll,
-                               recHitsEE_.amplitudesAll.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitude.data(),
-                               ebRecHits.amplitude,
-                               recHitsEB_.amplitude.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitude.data(),
-                               eeRecHits.amplitude,
-                               recHitsEE_.amplitude.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
+    lambdaToTransfer(recHitsEB_.did, ebRecHits.did);
+    lambdaToTransfer(recHitsEE_.did, eeRecHits.did);
     
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(),
-                               ebRecHits.chi2,
-                               recHitsEB_.chi2.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.chi2.data(),
-                               eeRecHits.chi2,
-                               recHitsEE_.chi2.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
+    lambdaToTransfer(recHitsEB_.amplitudesAll, ebRecHits.amplitudesAll);
+    lambdaToTransfer(recHitsEE_.amplitudesAll, eeRecHits.amplitudesAll);
     
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.pedestal.data(),
-                               ebRecHits.pedestal,
-                               recHitsEB_.pedestal.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.pedestal.data(),
-                               eeRecHits.pedestal,
-                               recHitsEE_.pedestal.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.flags.data(),
-                               ebRecHits.flags,
-                               recHitsEB_.flags.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.flags.data(),
-                               eeRecHits.flags,
-                               recHitsEE_.flags.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
+    lambdaToTransfer(recHitsEB_.amplitude, ebRecHits.amplitude);
+    lambdaToTransfer(recHitsEE_.amplitude, eeRecHits.amplitude);
+
+    lambdaToTransfer(recHitsEB_.chi2, ebRecHits.chi2);
+    lambdaToTransfer(recHitsEE_.chi2, eeRecHits.chi2);
     
+    lambdaToTransfer(recHitsEB_.pedestal, ebRecHits.pedestal);
+    lambdaToTransfer(recHitsEE_.pedestal, eeRecHits.pedestal);
+
+    lambdaToTransfer(recHitsEB_.flags, ebRecHits.flags);
+    lambdaToTransfer(recHitsEE_.flags, eeRecHits.flags);
+
     if (containsTimingInformation_) {
-        cudaCheck( cudaMemcpyAsync(recHitsEB_.jitter.data(),
-                                   ebRecHits.jitter,
-                                   recHitsEB_.jitter.size() * sizeof(uint32_t),
-                                   cudaMemcpyDeviceToHost,
-                                   ctx.stream()) );
-        cudaCheck( cudaMemcpyAsync(recHitsEE_.jitter.data(),
-                                   eeRecHits.jitter,
-                                   recHitsEE_.jitter.size() * sizeof(uint32_t),
-                                   cudaMemcpyDeviceToHost,
-                                   ctx.stream()) );
-        
-        cudaCheck( cudaMemcpyAsync(recHitsEB_.jitterError.data(),
-                                   ebRecHits.jitterError,
-                                   recHitsEB_.jitterError.size() * sizeof(uint32_t),
-                                   cudaMemcpyDeviceToHost,
-                                   ctx.stream()) );
-        cudaCheck( cudaMemcpyAsync(recHitsEE_.jitterError.data(),
-                                   eeRecHits.jitterError,
-                                   recHitsEE_.jitterError.size() * sizeof(uint32_t),
-                                   cudaMemcpyDeviceToHost,
-                                   ctx.stream()) );
+        lambdaToTransfer(recHitsEB_.jitter, ebRecHits.jitter);
+        lambdaToTransfer(recHitsEE_.jitter, eeRecHits.jitter);
+    
+        lambdaToTransfer(recHitsEB_.jitterError, ebRecHits.jitterError);
+        lambdaToTransfer(recHitsEE_.jitterError, eeRecHits.jitterError);
     }
 }
 

From 0e946cc670dcfa2f513a1b415c6fe175145f73c4 Mon Sep 17 00:00:00 2001
From: Viktor Khristenko <vdkhristenko1991@gmail.com>
Date: Wed, 1 Apr 2020 12:17:10 +0200
Subject: [PATCH 07/30] add ratio plot

---
 ...eEcalMultifitResultsGpuValidationPlots.cpp | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
index a336de13b9e7d..e0cca70f93795 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
@@ -45,6 +45,8 @@ int main(int argc, char *argv[]) {
     auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
     auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
     auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
+    auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+    auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
 
     auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
     auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
@@ -113,6 +115,7 @@ int main(int argc, char *argv[]) {
             hSOIAmplitudesEBCPU->Fill(soi_amp_cpu);
             hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
             hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
+            hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
             hChi2EBGPU->Fill(chi2_gpu);
             hChi2EBCPU->Fill(chi2_cpu);
             hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
@@ -146,6 +149,7 @@ int main(int argc, char *argv[]) {
             hSOIAmplitudesEECPU->Fill(soi_amp_cpu);
             hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
             hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
+            hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
             hChi2EEGPU->Fill(chi2_gpu);
             hChi2EECPU->Fill(chi2_cpu);
             hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
@@ -164,7 +168,7 @@ int main(int argc, char *argv[]) {
 
     {
       TCanvas c("plots", "plots", 4200, 6200);
-      c.Divide(2, 3);
+      c.Divide(2, 4);
 
       c.cd(1);
       {
@@ -206,8 +210,26 @@ int main(int argc, char *argv[]) {
       hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
       c.cd(6);
       hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
+      c.cd(7);
+      {
+          gPad->SetLogy();
+          hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack);
+          hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.);
+          hSOIAmplitudesEBGPUCPUratio->Draw("");
+      }
+      c.cd(8);
+      {
+          gPad->SetLogy();
+          hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack);
+          hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.);
+          hSOIAmplitudesEEGPUCPUratio->Draw("");
+      }
 
       c.SaveAs("ecal-amplitudes.pdf");
+    }
+    {
+      TCanvas c("plots", "plots", 4200, 6200);
+      c.Divide(2, 3);
 
       c.cd(1);
       {

From b21427c6d505b2c6780fde61cffbec7c173a5585 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 3 Apr 2020 18:43:16 +0200
Subject: [PATCH 08/30] Clean up ECAL unpacker code

Fix compilation warnings, remove commented-out code, and apply code formatting rules.
---
 CUDADataFormats/EcalDigi/BuildFile.xml        |    3 +-
 CUDADataFormats/EcalRecHitSoA/BuildFile.xml   |    4 +-
 .../EcalRecHitSoA/src/classes_def.xml         |    8 -
 .../EcalObjects/interface/EcalXtalGroupId.h   |    2 +-
 .../EcalDigi/interface/EcalMatacqDigi.h       |    4 +-
 EventFilter/EcalRawToDigi/BuildFile.xml       |   49 +-
 .../makeEcalRaw2DigiGpuValidationPlots.cpp    |  392 ++--
 .../EcalRawToDigi/plugins/BuildFile.xml       |   45 +-
 .../plugins/EcalCPUDigisProducer.cc           |  183 +-
 .../plugins/EcalRawESProducersGPUDefs.cc      |    6 +-
 .../EcalRawToDigi/plugins/EcalRawToDigiGPU.cc |  234 +--
 .../src/ElectronicsMappingGPU.cc              |   85 +-
 EventFilter/EcalRawToDigi/src/UnpackGPU.cu    |  703 +++----
 ...eEcalMultifitResultsGpuValidationPlots.cpp |  486 ++---
 .../src/AmplitudeComputationCommonKernels.cu  |  819 ++++----
 .../src/AmplitudeComputationKernels.cu        |  685 +++----
 .../EcalRecAlgos/src/EcalGainRatiosGPU.cc     |   83 +-
 .../EcalRecAlgos/src/EcalPedestalsGPU.cc      |  167 +-
 .../src/EcalPulseCovariancesGPU.cc            |   68 +-
 .../EcalRecAlgos/src/EcalPulseShapesGPU.cc    |   68 +-
 .../src/EcalSamplesCorrelationGPU.cc          |  143 +-
 .../src/EcalTimeBiasCorrectionsGPU.cc         |  111 +-
 .../src/EcalTimeCalibConstantsGPU.cc          |   65 +-
 .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu  |  307 ++-
 .../EcalRecAlgos/src/KernelHelpers.cu         |  156 +-
 .../src/TimeComputationKernels.cu             | 1783 ++++++++---------
 .../EcalRecAlgos/src/inplace_fnnls.cu         |  196 +-
 .../plugins/EcalCPUUncalibRecHitProducer.cc   |  191 +-
 .../plugins/EcalESProducersGPUDefs.cc         |   35 +-
 .../EcalUncalibRecHitConvertGPU2CPUFormat.cc  |  144 +-
 .../plugins/EcalUncalibRecHitProducerGPU.cc   |  634 +++---
 31 files changed, 3610 insertions(+), 4249 deletions(-)

diff --git a/CUDADataFormats/EcalDigi/BuildFile.xml b/CUDADataFormats/EcalDigi/BuildFile.xml
index a1838ba91dc91..4a5c646e3a1b3 100644
--- a/CUDADataFormats/EcalDigi/BuildFile.xml
+++ b/CUDADataFormats/EcalDigi/BuildFile.xml
@@ -1,7 +1,6 @@
-<use name="DataFormats/Common"/>
 <use name="CUDADataFormats/Common"/>
+<use name="DataFormats/Common"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
-<use name="cuda"/>
 
 <export>
   <lib   name="1"/>
diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
index 927a7a57a86a7..de31c3f42a961 100644
--- a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
+++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
@@ -1,8 +1,8 @@
-<use name="DataFormats/Common"/>
+<use name="cuda"/>
 <use name="CUDADataFormats/Common"/>
+<use name="DataFormats/Common"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
-<use name="cuda"/>
 
 <export>
   <lib   name="1"/>
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
index 68056d21ad4c1..b75a258a5151e 100644
--- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
@@ -8,14 +8,6 @@
     <class name="cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>" persistent="false" />
     <class name="edm::Wrapper<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>" persistent="false" />
 
-    <!--
-    <class name="std::vector<double, CUDAHostAllocator<double> >"/>
-    <class name="std::vector<float, CUDAHostAllocator<float> >"/>
-    <class name="std::vector<unsigned int, CUDAHostAllocator<unsigned int> >" />
-    -->
-
-    <!--  <class name="std::array<double, 10>" />
-    <class name="std::array<float, 10>" /> -->
     <class name="ecal::UncalibratedRecHit<ecal::Tag::soa>"/>
     <class name="edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa> >"/>
 </lcgdict>
diff --git a/CondFormats/EcalObjects/interface/EcalXtalGroupId.h b/CondFormats/EcalObjects/interface/EcalXtalGroupId.h
index 3331db375dfd1..b21c2b9889d9f 100644
--- a/CondFormats/EcalObjects/interface/EcalXtalGroupId.h
+++ b/CondFormats/EcalObjects/interface/EcalXtalGroupId.h
@@ -19,7 +19,7 @@ class EcalXtalGroupId {
   bool operator<(const EcalXtalGroupId& rhs) const { return (id_ < rhs.id()); }
   bool operator<=(const EcalXtalGroupId& rhs) const { return (id_ <= rhs.id()); }
 
-  const unsigned int id() const { return id_; }
+  unsigned int id() const { return id_; }
 
 private:
   unsigned int id_;
diff --git a/DataFormats/EcalDigi/interface/EcalMatacqDigi.h b/DataFormats/EcalDigi/interface/EcalMatacqDigi.h
index 952bd894ec891..b7cbc3949cf01 100644
--- a/DataFormats/EcalDigi/interface/EcalMatacqDigi.h
+++ b/DataFormats/EcalDigi/interface/EcalMatacqDigi.h
@@ -50,12 +50,12 @@ class EcalMatacqDigi {
   /** Gets amplitude in ADC count of time sample i. i between 0 and size()-1.
    * Note: Amplitude is pedestal subtracted at acquisition time.
    */
-  const float adcCount(const int& i) const { return data_[i]; }
+  float adcCount(const int& i) const { return data_[i]; }
 
   /** Gets amplitude in Volt of time sample i. i between 0 and size()-1.
    * Note: Amplitude is pedestal subtracted at acquisition time.
    */
-  const float amplitudeV(const int& i) const { return data_[i] * lsb_; }
+  float amplitudeV(const int& i) const { return data_[i] * lsb_; }
 
   /** Gets Matacq electronics channel id
    */
diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml
index 61a07973df153..e31aea9a0b58a 100644
--- a/EventFilter/EcalRawToDigi/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/BuildFile.xml
@@ -1,28 +1,27 @@
-<use   name="FWCore/Framework"/>
-<use   name="FWCore/PluginManager"/>
-<use   name="FWCore/ParameterSet"/>
-<use   name="boost"/>
-<use   name="CalibCalorimetry/EcalLaserCorrection"/>
-<use   name="CondFormats/DataRecord"/>
-<use   name="CondFormats/EcalObjects"/>
-<use   name="DataFormats/EcalDetId"/>
-<use   name="DataFormats/EcalDigi"/>
-<use   name="DataFormats/EcalRawData"/>
-<use   name="DataFormats/EcalRecHit"/>
-<use   name="DataFormats/FEDRawData"/>
-<use   name="DataFormats/Common"/>
-<use   name="FWCore/MessageLogger"/>
-<use   name="Geometry/EcalMapping"/>
-<use   name="Geometry/Records"/>
-<use   name="RecoLocalCalo/EcalRecAlgos"/>
-<use   name="RecoLocalCalo/EcalRecProducers"/>
-<use   name="Utilities/StorageFactory"/>
-
-<use   name="cuda"/>
-<use   name="HeterogeneousCore/CUDAUtilities"/>
-<use   name="HeterogeneousCore/CUDACore"/>
-<use   name="CUDADataFormats/EcalDigi" />
+<use name="boost"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/EcalDigi" />
+<use name="CalibCalorimetry/EcalLaserCorrection"/>
+<use name="CondFormats/DataRecord"/>
+<use name="CondFormats/EcalObjects"/>
+<use name="DataFormats/Common"/>
+<use name="DataFormats/EcalDetId"/>
+<use name="DataFormats/EcalDigi"/>
+<use name="DataFormats/EcalRawData"/>
+<use name="DataFormats/EcalRecHit"/>
+<use name="DataFormats/FEDRawData"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/MessageLogger"/>
+<use name="FWCore/ParameterSet"/>
+<use name="FWCore/PluginManager"/>
+<use name="Geometry/EcalMapping"/>
+<use name="Geometry/Records"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="RecoLocalCalo/EcalRecAlgos"/>
+<use name="RecoLocalCalo/EcalRecProducers"/>
+<use name="Utilities/StorageFactory"/>
 
 <export>
-  <lib   name="1"/>
+  <lib name="1"/>
 </export>
diff --git a/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp
index 9fc9ec26e3714..609c277e19288 100644
--- a/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp
+++ b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp
@@ -13,212 +13,198 @@
 #include "DataFormats/Common/interface/Wrapper.h"
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 
-int main(int argc, char *argv[]) {
-    if (argc<3) {
-        std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
-        exit(0);
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    std::cout << "run with: ./<exe> <path to input file> <path to output file>\n";
+    exit(0);
+  }
+
+  // branches to use
+  edm::Wrapper<EBDigiCollection>*wgpuEB = nullptr, *wcpuEB = nullptr;
+  edm::Wrapper<EEDigiCollection>*wgpuEE = nullptr, *wcpuEE = nullptr;
+
+  std::string inFileName{argv[1]};
+  std::string outFileName{argv[2]};
+
+  // prep output
+  TFile rfout{outFileName.c_str(), "recreate"};
+
+  int const nbins = 400;
+  float const last = 4096.;
+  auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last);
+  auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last);
+  auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last);
+  auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last);
+
+  auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4);
+  auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4);
+  auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4);
+  auto hGainEECPU = new TH1D("hGainEECPU", "hGainEECPU", 4, 0, 4);
+
+  auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU", 4, 0, 4, 4, 0, 4);
+  auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU", 4, 0, 4, 4, 0, 4);
+
+  // prep input
+  TFile rfin{inFileName.c_str()};
+  TTree* rt = (TTree*)rfin.Get("Events");
+  rt->SetBranchAddress("EBDigiCollection_ecalCPUDigisProducer_ebDigis_RECO.", &wgpuEB);
+  rt->SetBranchAddress("EEDigiCollection_ecalCPUDigisProducer_eeDigis_RECO.", &wgpuEE);
+  rt->SetBranchAddress("EBDigiCollection_ecalDigis_ebDigis_RECO.", &wcpuEB);
+  rt->SetBranchAddress("EEDigiCollection_ecalDigis_eeDigis_RECO.", &wcpuEE);
+
+  // accumulate
+  auto const nentries = rt->GetEntries();
+  std::cout << ">>> nentries = " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
+
+    auto const ngpuebs = wgpuEB->bareProduct().size();
+    auto const ncpuebs = wcpuEB->bareProduct().size();
+    auto const ngpuees = wgpuEE->bareProduct().size();
+    auto const ncpuees = wcpuEE->bareProduct().size();
+
+    if (ngpuebs != ncpuebs or ngpuees != ncpuees) {
+      std::cerr << "*** mismatch in ndigis: "
+                << "ie = " << ie << "  ngpuebs = " << ngpuebs << "  ncpuebs = " << ncpuebs << "  ngpuees = " << ngpuees
+                << "  ncpuees = " << ncpuees << std::endl;
+
+      // this is a must for now
+      //assert(ngpuebs==ncpuebs);
+      //assert(ngpuees==ncpuees);
     }
-    
-    // branches to use
-    edm::Wrapper<EBDigiCollection> *wgpuEB=nullptr, *wcpuEB=nullptr;
-    edm::Wrapper<EEDigiCollection> *wgpuEE=nullptr, *wcpuEE=nullptr;
-
-    std::string inFileName{argv[1]};
-    std::string outFileName{argv[2]};
-
-    // prep output 
-    TFile rfout{outFileName.c_str(), "recreate"};
-
-    int const nbins = 400;
-    float const last = 4096.;
-    auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last);
-    auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last);
-    auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last);
-    auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last);
-
-    auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4);
-    auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4);
-    auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4);
-    auto hGainEECPU = new TH1D("hGainEECPU", "hGainEECPU", 4, 0, 4);
-
-    auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU",
-        nbins, 0, last, nbins, 0, last);
-    auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU",
-        nbins, 0, last, nbins, 0, last);
-    auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU",
-        4, 0, 4, 4, 0, 4);
-    auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU",
-        4, 0, 4, 4, 0, 4);
-
-    // prep input
-    TFile rfin{inFileName.c_str()};
-    TTree *rt = (TTree*)rfin.Get("Events");
-    rt->SetBranchAddress("EBDigiCollection_ecalCPUDigisProducer_ebDigis_RECO.",
-        &wgpuEB);
-    rt->SetBranchAddress("EEDigiCollection_ecalCPUDigisProducer_eeDigis_RECO.",
-        &wgpuEE);
-    rt->SetBranchAddress("EBDigiCollection_ecalDigis_ebDigis_RECO.",
-        &wcpuEB);
-    rt->SetBranchAddress("EEDigiCollection_ecalDigis_eeDigis_RECO.",
-        &wcpuEE);
-
-    // accumulate
-    auto const nentries = rt->GetEntries();
-    std::cout << ">>> nentries = " << nentries << std::endl;
-    for (int ie=0; ie<nentries; ++ie) {
-        rt->GetEntry(ie);
-
-        auto const ngpuebs = wgpuEB->bareProduct().size();
-        auto const ncpuebs = wcpuEB->bareProduct().size();
-        auto const ngpuees = wgpuEE->bareProduct().size();
-        auto const ncpuees = wcpuEE->bareProduct().size();
-
-        if (ngpuebs!=ncpuebs or ngpuees!=ncpuees) {
-            std::cerr << "*** mismatch in ndigis: "
-                      << "ie = " << ie
-                      << "  ngpuebs = " << ngpuebs
-                      << "  ncpuebs = " << ncpuebs
-                      << "  ngpuees = " << ngpuees
-                      << "  ncpuees = " << ncpuees
-                      << std::endl;
-
-            // this is a must for now
-            //assert(ngpuebs==ncpuebs);
-            //assert(ngpuees==ncpuees);
-        }
-
-        // assume identical sizes
-        auto const& idsgpuEB = wgpuEB->bareProduct().ids();
-        auto const& datagpuEB = wgpuEB->bareProduct().data();
-        auto const& idscpuEB = wcpuEB->bareProduct().ids();
-        auto const& datacpuEB = wcpuEB->bareProduct().data();
-        for (uint32_t ieb=0; ieb<ngpuebs; ++ieb) {
-            auto const& idgpu = idsgpuEB[ieb];
-            auto iter2idcpu = std::find(idscpuEB.begin(), idscpuEB.end(), 
-                idgpu);
-            // FIXME
-            assert(idgpu == *iter2idcpu);
-
-            auto const ptrdiff = iter2idcpu - idscpuEB.begin();
-            for (uint32_t s=0u; s<10u; s++) {
-                EcalMGPASample sampleGPU{datagpuEB[ieb*10 + s]};
-                EcalMGPASample sampleCPU{datacpuEB[ptrdiff * 10 + s]};
-
-                hADCEBGPU->Fill(sampleGPU.adc());
-                hGainEBGPU->Fill(sampleGPU.gainId());
-                hADCEBCPU->Fill(sampleCPU.adc());
-                hGainEBCPU->Fill(sampleCPU.gainId());
-                hADCEBGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc());
-                hGainEBGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId());
-            }
-        }
-
-        auto const& idsgpuEE = wgpuEE->bareProduct().ids();
-        auto const& datagpuEE = wgpuEE->bareProduct().data();
-        auto const& idscpuEE = wcpuEE->bareProduct().ids();
-        auto const& datacpuEE = wcpuEE->bareProduct().data();
-        for (uint32_t iee=0; iee<ngpuees; ++iee) {
-            auto const& idgpu = idsgpuEE[iee];
-            auto iter2idcpu = std::find(idscpuEE.begin(), idscpuEE.end(), 
-                idgpu);
-            // FIXME
-            assert(idgpu == *iter2idcpu);
-
-            // get the digis
-            auto const ptrdiff = iter2idcpu - idscpuEE.begin();
-            for (uint32_t s=0u; s<10u; s++) {
-                EcalMGPASample sampleGPU{datagpuEE[iee * 10 + s]};
-                EcalMGPASample sampleCPU{datacpuEE[ptrdiff * 10 + s]};
-
-                hADCEEGPU->Fill(sampleGPU.adc());
-                hGainEEGPU->Fill(sampleGPU.gainId());
-                hADCEECPU->Fill(sampleCPU.adc());
-                hGainEECPU->Fill(sampleCPU.gainId());
-                hADCEEGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc());
-                hGainEEGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId());
-            }
-        }
+
+    // assume identical sizes
+    auto const& idsgpuEB = wgpuEB->bareProduct().ids();
+    auto const& datagpuEB = wgpuEB->bareProduct().data();
+    auto const& idscpuEB = wcpuEB->bareProduct().ids();
+    auto const& datacpuEB = wcpuEB->bareProduct().data();
+    for (uint32_t ieb = 0; ieb < ngpuebs; ++ieb) {
+      auto const& idgpu = idsgpuEB[ieb];
+      auto iter2idcpu = std::find(idscpuEB.begin(), idscpuEB.end(), idgpu);
+      // FIXME
+      assert(idgpu == *iter2idcpu);
+
+      auto const ptrdiff = iter2idcpu - idscpuEB.begin();
+      for (uint32_t s = 0u; s < 10u; s++) {
+        EcalMGPASample sampleGPU{datagpuEB[ieb * 10 + s]};
+        EcalMGPASample sampleCPU{datacpuEB[ptrdiff * 10 + s]};
+
+        hADCEBGPU->Fill(sampleGPU.adc());
+        hGainEBGPU->Fill(sampleGPU.gainId());
+        hADCEBCPU->Fill(sampleCPU.adc());
+        hGainEBCPU->Fill(sampleCPU.gainId());
+        hADCEBGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc());
+        hGainEBGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId());
+      }
     }
 
-    {
-        TCanvas c{"plots", "plots", 4200, 6200};
-        c.Divide(2, 4);
-        c.cd(1);
-        {
-            gPad->SetLogy();
-            hADCEBCPU->SetLineColor(kBlack);
-            hADCEBCPU->SetLineWidth(1.);
-            hADCEBCPU->Draw("");
-            hADCEBGPU->SetLineColor(kBlue);
-            hADCEBGPU->SetLineWidth(1.);
-            hADCEBGPU->Draw("sames");
-            gPad->Update();
-            auto stats = (TPaveStats*)hADCEBGPU->FindObject("stats");
-            auto y2 = stats->GetY2NDC();
-            auto y1 = stats->GetY1NDC();
-            stats->SetY2NDC(y1);
-            stats->SetY1NDC(y1 - (y2-y1));
-        }
-        c.cd(2);
-        {
-            gPad->SetLogy();
-            hADCEECPU->SetLineColor(kBlack);
-            hADCEECPU->SetLineWidth(1.);
-            hADCEECPU->Draw("");
-            hADCEEGPU->SetLineColor(kBlue);
-            hADCEEGPU->SetLineWidth(1.);
-            hADCEEGPU->Draw("sames");
-            gPad->Update();
-            auto stats = (TPaveStats*)hADCEEGPU->FindObject("stats");
-            auto y2 = stats->GetY2NDC();
-            auto y1 = stats->GetY1NDC();
-            stats->SetY2NDC(y1);
-            stats->SetY1NDC(y1 - (y2-y1));
-        }
-        c.cd(3);
-        {
-            gPad->SetLogy();
-            hGainEBCPU->SetLineColor(kBlack);
-            hGainEBCPU->SetLineWidth(1.);
-            hGainEBCPU->Draw("");
-            hGainEBGPU->SetLineColor(kBlue);
-            hGainEBGPU->SetLineWidth(1.);
-            hGainEBGPU->Draw("sames");
-            gPad->Update();
-            auto stats = (TPaveStats*)hGainEBGPU->FindObject("stats");
-            auto y2 = stats->GetY2NDC();
-            auto y1 = stats->GetY1NDC();
-            stats->SetY2NDC(y1);
-            stats->SetY1NDC(y1 - (y2-y1));
-        }
-        c.cd(4);
-        {
-            gPad->SetLogy();
-            hGainEECPU->SetLineColor(kBlack);
-            hGainEECPU->SetLineWidth(1.);
-            hGainEECPU->Draw("");
-            hGainEEGPU->SetLineColor(kBlue);
-            hGainEEGPU->SetLineWidth(1.);
-            hGainEEGPU->Draw("sames");
-            gPad->Update();
-            auto stats = (TPaveStats*)hGainEEGPU->FindObject("stats");
-            auto y2 = stats->GetY2NDC();
-            auto y1 = stats->GetY1NDC();
-            stats->SetY2NDC(y1);
-            stats->SetY1NDC(y1 - (y2-y1));
-        }
-        c.cd(5);
-        hADCEBGPUvsCPU->Draw("colz");
-        c.cd(6);
-        hADCEEGPUvsCPU->Draw("colz");
-        c.cd(7);
-        hGainEBGPUvsCPU->Draw("colz");
-        c.cd(8);
-        hGainEEGPUvsCPU->Draw("colz");
-        c.SaveAs("plots.pdf");
+    auto const& idsgpuEE = wgpuEE->bareProduct().ids();
+    auto const& datagpuEE = wgpuEE->bareProduct().data();
+    auto const& idscpuEE = wcpuEE->bareProduct().ids();
+    auto const& datacpuEE = wcpuEE->bareProduct().data();
+    for (uint32_t iee = 0; iee < ngpuees; ++iee) {
+      auto const& idgpu = idsgpuEE[iee];
+      auto iter2idcpu = std::find(idscpuEE.begin(), idscpuEE.end(), idgpu);
+      // FIXME
+      assert(idgpu == *iter2idcpu);
+
+      // get the digis
+      auto const ptrdiff = iter2idcpu - idscpuEE.begin();
+      for (uint32_t s = 0u; s < 10u; s++) {
+        EcalMGPASample sampleGPU{datagpuEE[iee * 10 + s]};
+        EcalMGPASample sampleCPU{datacpuEE[ptrdiff * 10 + s]};
+
+        hADCEEGPU->Fill(sampleGPU.adc());
+        hGainEEGPU->Fill(sampleGPU.gainId());
+        hADCEECPU->Fill(sampleCPU.adc());
+        hGainEECPU->Fill(sampleCPU.gainId());
+        hADCEEGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc());
+        hGainEEGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId());
+      }
     }
+  }
 
-    rfin.Close();
-    rfout.Write();
-    rfout.Close();
+  {
+    TCanvas c{"plots", "plots", 4200, 6200};
+    c.Divide(2, 4);
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hADCEBCPU->SetLineColor(kBlack);
+      hADCEBCPU->SetLineWidth(1.);
+      hADCEBCPU->Draw("");
+      hADCEBGPU->SetLineColor(kBlue);
+      hADCEBGPU->SetLineWidth(1.);
+      hADCEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(2);
+    {
+      gPad->SetLogy();
+      hADCEECPU->SetLineColor(kBlack);
+      hADCEECPU->SetLineWidth(1.);
+      hADCEECPU->Draw("");
+      hADCEEGPU->SetLineColor(kBlue);
+      hADCEEGPU->SetLineWidth(1.);
+      hADCEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hADCEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(3);
+    {
+      gPad->SetLogy();
+      hGainEBCPU->SetLineColor(kBlack);
+      hGainEBCPU->SetLineWidth(1.);
+      hGainEBCPU->Draw("");
+      hGainEBGPU->SetLineColor(kBlue);
+      hGainEBGPU->SetLineWidth(1.);
+      hGainEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hGainEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(4);
+    {
+      gPad->SetLogy();
+      hGainEECPU->SetLineColor(kBlack);
+      hGainEECPU->SetLineWidth(1.);
+      hGainEECPU->Draw("");
+      hGainEEGPU->SetLineColor(kBlue);
+      hGainEEGPU->SetLineWidth(1.);
+      hGainEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hGainEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(5);
+    hADCEBGPUvsCPU->Draw("colz");
+    c.cd(6);
+    hADCEEGPUvsCPU->Draw("colz");
+    c.cd(7);
+    hGainEBGPUvsCPU->Draw("colz");
+    c.cd(8);
+    hGainEEGPUvsCPU->Draw("colz");
+    c.SaveAs("plots.pdf");
+  }
+
+  rfin.Close();
+  rfout.Write();
+  rfout.Close();
 }
diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
index 296a6b2461f8c..6c2f2bb94db7c 100644
--- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
@@ -1,25 +1,24 @@
-<use   name="EventFilter/EcalRawToDigi"/>
-<use   name="root"/>
-<use   name="DataFormats/Candidate"/>
-<use   name="DataFormats/EcalRecHit"/>
-<use   name="DataFormats/L1GlobalMuonTrigger"/>
-<use   name="DataFormats/L1Trigger"/>
-<use   name="CondFormats/L1TObjects"/>
-<use   name="CondFormats/DataRecord"/>
-<use   name="FWCore/Utilities"/>
-<use   name="DataFormats/Common"/>
-<use   name="DataFormats/Scalers"/>
-<use   name="FWCore/Framework"/>
-<use   name="RecoEcal/EgammaCoreTools"/>
-<use   name="TrackingTools/Records"/>
-<use   name="TrackingTools/GeomPropagators"/>
-<use   name="TrackingTools/TrajectoryState"/>
+<use name="cuda"/>
+<use name="root"/>
+<use name="CUDADataFormats/EcalDigi" />
+<use name="CondFormats/DataRecord"/>
+<use name="CondFormats/L1TObjects"/>
+<use name="DataFormats/Candidate"/>
+<use name="DataFormats/Common"/>
+<use name="DataFormats/EcalRecHit"/>
+<use name="DataFormats/L1GlobalMuonTrigger"/>
+<use name="DataFormats/L1Trigger"/>
+<use name="DataFormats/Scalers"/>
+<use name="EventFilter/EcalRawToDigi"/>
+<use name="FWCore/Framework"/>
+<use name="FWCore/Utilities"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="RecoEcal/EgammaCoreTools"/>
+<use name="TrackingTools/GeomPropagators"/>
+<use name="TrackingTools/Records"/>
+<use name="TrackingTools/TrajectoryState"/>
 
-<use   name="cuda"/>
-<use   name="HeterogeneousCore/CUDAUtilities"/>
-<use   name="HeterogeneousCore/CUDACore"/>
-<use   name="CUDADataFormats/EcalDigi" />
-
-<library   file="*.cc" name="EventFilterEcalRawToDigiPlugins">
-  <flags   EDM_PLUGIN="1"/>
+<library file="*.cc" name="EventFilterEcalRawToDigiPlugins">
+  <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
index 6f488053b204b..00491efe634cd 100644
--- a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
+++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
@@ -10,7 +10,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
+#include "FWCore/Framework/interface/MakerMacros.h"
 
 // algorithm specific
 
@@ -25,128 +25,99 @@
 #include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
 #include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
 
-class EcalCPUDigisProducer
-    : public edm::stream::EDProducer<edm::ExternalWork>
-{
+class EcalCPUDigisProducer : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
-    explicit EcalCPUDigisProducer(edm::ParameterSet const& ps);
-    ~EcalCPUDigisProducer() override;
-    static void fillDescriptions(edm::ConfigurationDescriptions&);
+  explicit EcalCPUDigisProducer(edm::ParameterSet const& ps);
+  ~EcalCPUDigisProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
-    void acquire(edm::Event const&, 
-                 edm::EventSetup const&,
-                 edm::WaitingTaskWithArenaHolder) override;
-    void produce(edm::Event&, edm::EventSetup const&) override;
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-    edm::EDGetTokenT<cms::cuda::Product<ecal::DigisCollection>> digisInEBToken_, 
-        digisInEEToken_;
-    edm::EDPutTokenT<EBDigiCollection> digisOutEBToken_;
-    edm::EDPutTokenT<EEDigiCollection> digisOutEEToken_;
-
-    // FIXME better way to pass pointers from acquire to produce?
-    std::vector<uint32_t, CUDAHostAllocator<uint32_t>> idsebtmp, idseetmp;
-    std::vector<uint16_t, CUDAHostAllocator<uint16_t>> dataebtmp, dataeetmp;
+  edm::EDGetTokenT<cms::cuda::Product<ecal::DigisCollection>> digisInEBToken_, digisInEEToken_;
+  edm::EDPutTokenT<EBDigiCollection> digisOutEBToken_;
+  edm::EDPutTokenT<EEDigiCollection> digisOutEEToken_;
+
+  // FIXME better way to pass pointers from acquire to produce?
+  std::vector<uint32_t, CUDAHostAllocator<uint32_t>> idsebtmp, idseetmp;
+  std::vector<uint16_t, CUDAHostAllocator<uint16_t>> dataebtmp, dataeetmp;
 };
 
-void EcalCPUDigisProducer::fillDescriptions(
-        edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
+void EcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
 
-    desc.add<edm::InputTag>("digisInLabelEB", 
-        edm::InputTag{"ecalRawToDigiGPU", "ebDigisGPU"});
-    desc.add<edm::InputTag>("digisInLabelEE", 
-        edm::InputTag{"ecalRawToDigiGPU", "eeDigisGPU"});
-    desc.add<std::string>("digisOutLabelEB", "ebDigis");
-    desc.add<std::string>("digisOutLabelEE", "eeDigis");
+  desc.add<edm::InputTag>("digisInLabelEB", edm::InputTag{"ecalRawToDigiGPU", "ebDigisGPU"});
+  desc.add<edm::InputTag>("digisInLabelEE", edm::InputTag{"ecalRawToDigiGPU", "eeDigisGPU"});
+  desc.add<std::string>("digisOutLabelEB", "ebDigis");
+  desc.add<std::string>("digisOutLabelEE", "eeDigis");
 
-    std::string label = "ecalCPUDigisProducer";
-    confDesc.add(label, desc);
+  std::string label = "ecalCPUDigisProducer";
+  confDesc.add(label, desc);
 }
 
-EcalCPUDigisProducer::EcalCPUDigisProducer(
-        const edm::ParameterSet& ps) 
+EcalCPUDigisProducer::EcalCPUDigisProducer(const edm::ParameterSet& ps)
     : digisInEBToken_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
-        ps.getParameter<edm::InputTag>("digisInLabelEB"))}
-    , digisInEEToken_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
-        ps.getParameter<edm::InputTag>("digisInLabelEE"))}
-    , digisOutEBToken_{produces<EBDigiCollection>(
-        ps.getParameter<std::string>("digisOutLabelEB"))}
-    , digisOutEEToken_{produces<EEDigiCollection>(
-        ps.getParameter<std::string>("digisOutLabelEE"))}
-{}
+          ps.getParameter<edm::InputTag>("digisInLabelEB"))},
+      digisInEEToken_{
+          consumes<cms::cuda::Product<ecal::DigisCollection>>(ps.getParameter<edm::InputTag>("digisInLabelEE"))},
+      digisOutEBToken_{produces<EBDigiCollection>(ps.getParameter<std::string>("digisOutLabelEB"))},
+      digisOutEEToken_{produces<EEDigiCollection>(ps.getParameter<std::string>("digisOutLabelEE"))} {}
 
 EcalCPUDigisProducer::~EcalCPUDigisProducer() {}
 
-void EcalCPUDigisProducer::acquire(
-        edm::Event const& event,
-        edm::EventSetup const& setup,
-        edm::WaitingTaskWithArenaHolder taskHolder) 
-{
-    // retrieve data/ctx
-    auto const& ebdigisProduct = event.get(digisInEBToken_);
-    auto const& eedigisProduct = event.get(digisInEEToken_);
-    cms::cuda::ScopedContextAcquire ctx{ebdigisProduct, std::move(taskHolder)};
-    auto const& ebdigis = ctx.get(ebdigisProduct);
-    auto const& eedigis = ctx.get(eedigisProduct);
-
-    // resize out tmp buffers
-    // FIXME remove hardcoded values
-    idsebtmp.resize(ebdigis.ndigis);
-    dataebtmp.resize(ebdigis.ndigis * 10);
-    idseetmp.resize(eedigis.ndigis);
-    dataeetmp.resize(eedigis.ndigis * 10);
-
-    // enqeue transfers
-    cudaCheck( cudaMemcpyAsync(dataebtmp.data(),
-                               ebdigis.data,
-                               dataebtmp.size() * sizeof(uint16_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(dataeetmp.data(),
-                               eedigis.data,
-                               dataeetmp.size() * sizeof(uint16_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(idsebtmp.data(),
-                               ebdigis.ids,
-                               idsebtmp.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(idseetmp.data(),
-                               eedigis.ids,
-                               idseetmp.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
+void EcalCPUDigisProducer::acquire(edm::Event const& event,
+                                   edm::EventSetup const& setup,
+                                   edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve the EB/EE device products; the scoped CUDA context is created from the EB product
+  auto const& ebdigisProduct = event.get(digisInEBToken_);
+  auto const& eedigisProduct = event.get(digisInEEToken_);
+  cms::cuda::ScopedContextAcquire ctx{ebdigisProduct, std::move(taskHolder)};
+  auto const& ebdigis = ctx.get(ebdigisProduct);
+  auto const& eedigis = ctx.get(eedigisProduct);
+
+  // resize the host-side tmp buffers (members, read later in produce) to the digi counts
+  // FIXME remove hardcoded values (the factor 10 applied per digi to the data buffers)
+  idsebtmp.resize(ebdigis.ndigis);
+  dataebtmp.resize(ebdigis.ndigis * 10);
+  idseetmp.resize(eedigis.ndigis);
+  dataeetmp.resize(eedigis.ndigis * 10);
+
+  // enqueue asynchronous device-to-host copies of data and ids on the context's stream
+  cudaCheck(cudaMemcpyAsync(
+      dataebtmp.data(), ebdigis.data, dataebtmp.size() * sizeof(uint16_t), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(
+      dataeetmp.data(), eedigis.data, dataeetmp.size() * sizeof(uint16_t), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(
+      idsebtmp.data(), ebdigis.ids, idsebtmp.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(
+      idseetmp.data(), eedigis.ids, idseetmp.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, ctx.stream()));
 }
 
-void EcalCPUDigisProducer::produce(
-        edm::Event& event, 
-        edm::EventSetup const& setup) 
-{
-    // output collections
-    auto digisEB = std::make_unique<EBDigiCollection>();
-    auto digisEE = std::make_unique<EEDigiCollection>();
-    digisEB->resize(idsebtmp.size());
-    digisEE->resize(idseetmp.size());
-    
-    // cast constness away
-    // use pointers to buffers instead of move operator= semantics
-    // cause we have different allocators in there...
-    auto *dataEB = const_cast<uint16_t*>(digisEB->data().data());
-    auto *dataEE = const_cast<uint16_t*>(digisEE->data().data());
-    auto *idsEB = const_cast<uint32_t*>(digisEB->ids().data());
-    auto *idsEE = const_cast<uint32_t*>(digisEE->ids().data());
-
-    // copy data
-    std::memcpy(dataEB, dataebtmp.data(), dataebtmp.size() * sizeof(uint16_t));
-    std::memcpy(dataEE, dataeetmp.data(), dataeetmp.size() * sizeof(uint16_t));
-    std::memcpy(idsEB, idsebtmp.data(), idsebtmp.size() * sizeof(uint32_t));
-    std::memcpy(idsEE, idseetmp.data(), idseetmp.size() * sizeof(uint32_t));
-
-    event.put(digisOutEBToken_, std::move(digisEB));
-    event.put(digisOutEEToken_, std::move(digisEE));
+void EcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // output collections, sized to the number of ids staged in the tmp buffers
+  auto digisEB = std::make_unique<EBDigiCollection>();
+  auto digisEE = std::make_unique<EEDigiCollection>();
+  digisEB->resize(idsebtmp.size());
+  digisEE->resize(idseetmp.size());
+
+  // cast constness away to obtain writable pointers into the collections' storage;
+  // use pointers to buffers instead of move operator= semantics
+  // because the tmp buffers (CUDAHostAllocator) and the collections have different allocators...
+  auto* dataEB = const_cast<uint16_t*>(digisEB->data().data());
+  auto* dataEE = const_cast<uint16_t*>(digisEE->data().data());
+  auto* idsEB = const_cast<uint32_t*>(digisEB->ids().data());
+  auto* idsEE = const_cast<uint32_t*>(digisEE->ids().data());
+
+  // copy the staged data and ids from the tmp buffers into the output collections
+  std::memcpy(dataEB, dataebtmp.data(), dataebtmp.size() * sizeof(uint16_t));
+  std::memcpy(dataEE, dataeetmp.data(), dataeetmp.size() * sizeof(uint16_t));
+  std::memcpy(idsEB, idsebtmp.data(), idsebtmp.size() * sizeof(uint32_t));
+  std::memcpy(idsEE, idseetmp.data(), idseetmp.size() * sizeof(uint32_t));
+
+  event.put(digisOutEBToken_, std::move(digisEB));
+  event.put(digisOutEEToken_, std::move(digisEE));
 }
 
 DEFINE_FWK_MODULE(EcalCPUDigisProducer);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
index 6538cb0f32816..0133eb27d5c71 100644
--- a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc
@@ -6,9 +6,7 @@
 
 #include <iostream>
 
-using EcalElectronicsMappingGPUESProducer = EcalRawESProducerGPU<
-    ecal::raw::ElectronicsMappingGPU, 
-    EcalMappingElectronics, 
-    EcalMappingElectronicsRcd>;
+using EcalElectronicsMappingGPUESProducer =
+    EcalRawESProducerGPU<ecal::raw::ElectronicsMappingGPU, EcalMappingElectronics, EcalMappingElectronicsRcd>;
 
 DEFINE_FWK_EVENTSETUP_MODULE(EcalElectronicsMappingGPUESProducer);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
index 3198017117cb6..18dc2307e9bfc 100644
--- a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
@@ -10,7 +10,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
+#include "FWCore/Framework/interface/MakerMacros.h"
 
 // algorithm specific
 
@@ -25,151 +25,131 @@
 #include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
 #include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
 
-class EcalRawToDigiGPU
-    : public edm::stream::EDProducer<edm::ExternalWork>
-{
+class EcalRawToDigiGPU : public edm::stream::EDProducer<edm::ExternalWork> {  // stream producer overriding acquire() and produce()
 public:
-    explicit EcalRawToDigiGPU(edm::ParameterSet const& ps);
-    ~EcalRawToDigiGPU() override;
-    static void fillDescriptions(edm::ConfigurationDescriptions&);
+  explicit EcalRawToDigiGPU(edm::ParameterSet const& ps);
+  ~EcalRawToDigiGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
-    void acquire(edm::Event const&, 
-                 edm::EventSetup const&,
-                 edm::WaitingTaskWithArenaHolder) override;
-    void produce(edm::Event&, edm::EventSetup const&) override;
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;  // packs FED data, calls ecal::raw::entryPoint
+  void produce(edm::Event&, edm::EventSetup const&) override;  // emplaces the EB and EE digi collections into the event
 
 private:
-    edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;
-    edm::EDPutTokenT<cms::cuda::Product<ecal::DigisCollection>> digisEBToken_, 
-        digisEEToken_;
+  edm::EDGetTokenT<FEDRawDataCollection> rawDataToken_;  // input, configured by "InputLabel"
+  edm::EDPutTokenT<cms::cuda::Product<ecal::DigisCollection>> digisEBToken_, digisEEToken_;  // outputs, "digisLabelEB" / "digisLabelEE"
 
-    cms::cuda::ContextState cudaState_;
+  cms::cuda::ContextState cudaState_;  // passed to the scoped contexts of both acquire() and produce()
 
-    std::vector<int> fedsToUnpack_;
+  std::vector<int> fedsToUnpack_;  // FED ids to read, from the "FEDs" parameter
 
-    ecal::raw::ConfigurationParameters config_;
-    // FIXME move this to use raii
-    ecal::raw::InputDataCPU inputCPU_;
-    ecal::raw::InputDataGPU inputGPU_;
-    ecal::raw::OutputDataGPU outputGPU_;
-    ecal::raw::ScratchDataGPU scratchGPU_;
-    ecal::raw::OutputDataCPU outputCPU_;
+  ecal::raw::ConfigurationParameters config_;  // carries maxChannels
+  // FIXME move this to use raii (today: allocate() in the constructor, deallocate() of the GPU buffers in the destructor)
+  ecal::raw::InputDataCPU inputCPU_;
+  ecal::raw::InputDataGPU inputGPU_;
+  ecal::raw::OutputDataGPU outputGPU_;
+  ecal::raw::ScratchDataGPU scratchGPU_;
+  ecal::raw::OutputDataCPU outputCPU_;
 };
 
-void EcalRawToDigiGPU::fillDescriptions(
-        edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
-
-    desc.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
-    std::vector<int> feds(54);
-    for (uint32_t i=0; i<54; ++i)
-        feds[i] = i+601;
-    desc.add<std::vector<int>>("FEDs", feds);
-    desc.add<uint32_t>("maxChannels", 20000);
-    desc.add<std::string>("digisLabelEB", "ebDigisGPU");
-    desc.add<std::string>("digisLabelEE", "eeDigisGPU");
-
-    std::string label = "ecalRawToDigiGPU";
-    confDesc.add(label, desc);
+void EcalRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+  // Declares the module parameters with their defaults and registers them under "ecalRawToDigiGPU".
+  desc.add<edm::InputTag>("InputLabel", edm::InputTag("rawDataCollector"));
+  std::vector<int> feds(54);  // default FED list: the 54 consecutive ids 601..654
+  for (uint32_t i = 0; i < 54; ++i)
+    feds[i] = i + 601;
+  desc.add<std::vector<int>>("FEDs", feds);
+  desc.add<uint32_t>("maxChannels", 20000);  // stored by the constructor in config_.maxChannels
+  desc.add<std::string>("digisLabelEB", "ebDigisGPU");  // instance label of the EB output product
+  desc.add<std::string>("digisLabelEE", "eeDigisGPU");  // instance label of the EE output product
+
+  std::string label = "ecalRawToDigiGPU";
+  confDesc.add(label, desc);
+}
 
-EcalRawToDigiGPU::EcalRawToDigiGPU(
-        const edm::ParameterSet& ps) 
-    : rawDataToken_{consumes<FEDRawDataCollection>(ps.getParameter<edm::InputTag>(
-        "InputLabel"))}
-    , digisEBToken_{produces<cms::cuda::Product<ecal::DigisCollection>>(
-        ps.getParameter<std::string>("digisLabelEB"))}
-    , digisEEToken_{produces<cms::cuda::Product<ecal::DigisCollection>>(
-        ps.getParameter<std::string>("digisLabelEE"))}
-    , fedsToUnpack_{ps.getParameter<std::vector<int>>("FEDs")}
-{
-    config_.maxChannels = ps.getParameter<uint32_t>("maxChannels");
-
-    inputCPU_.allocate();
-    inputGPU_.allocate();
-    outputGPU_.allocate(config_);
-    scratchGPU_.allocate(config_);
-    outputCPU_.allocate();
+EcalRawToDigiGPU::EcalRawToDigiGPU(const edm::ParameterSet& ps)
+    : rawDataToken_{consumes<FEDRawDataCollection>(ps.getParameter<edm::InputTag>("InputLabel"))},
+      digisEBToken_{produces<cms::cuda::Product<ecal::DigisCollection>>(ps.getParameter<std::string>("digisLabelEB"))},
+      digisEEToken_{produces<cms::cuda::Product<ecal::DigisCollection>>(ps.getParameter<std::string>("digisLabelEE"))},
+      fedsToUnpack_{ps.getParameter<std::vector<int>>("FEDs")} {
+  config_.maxChannels = ps.getParameter<uint32_t>("maxChannels");  // handed to the output/scratch allocations below
+  // Buffers are set up once per module instance here; acquire() and produce() use them for every event.
+  inputCPU_.allocate();
+  inputGPU_.allocate();
+  outputGPU_.allocate(config_);
+  scratchGPU_.allocate(config_);
+  outputCPU_.allocate();
+}
 
 EcalRawToDigiGPU::~EcalRawToDigiGPU() {
-    inputGPU_.deallocate();
-    outputGPU_.deallocate(config_);
-    scratchGPU_.deallocate(config_);
+  inputGPU_.deallocate();  // counterpart of the allocate() calls made in the constructor for the GPU-side buffers
+  outputGPU_.deallocate(config_);
+  scratchGPU_.deallocate(config_);  // NOTE(review): inputCPU_/outputCPU_ get no deallocate() here; presumably self-releasing - confirm
 }
 
-void EcalRawToDigiGPU::acquire(
-        edm::Event const& event,
-        edm::EventSetup const& setup,
-        edm::WaitingTaskWithArenaHolder holder) 
-{
-    // raii
-    cms::cuda::ScopedContextAcquire ctx{
-        event.streamID(), std::move(holder), cudaState_};
-
-    // conditions
-    edm::ESHandle<ecal::raw::ElectronicsMappingGPU> eMappingHandle;
-    setup.get<EcalMappingElectronicsRcd>().get(eMappingHandle);
-    auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream());
-
-    // bundle up conditions
-    ecal::raw::ConditionsProducts conditions{eMappingProduct};
-
-    // event data
-    edm::Handle<FEDRawDataCollection> rawDataHandle;
-    event.getByToken(rawDataToken_, rawDataHandle);
-
-    // iterate over feds
-    // TODO: another idea
-    //   - loop over all feds to unpack and enqueue cuda memcpy 
-    //   - accumulate the sizes
-    //   - after the loop launch cuda memcpy for sizes
-    //   - enqueue the kernel
-    uint32_t currentCummOffset = 0;
-    uint32_t counter = 0;
-    for (auto const& fed : fedsToUnpack_) {
-        //std::cout << "fed: " << fed << std::endl;
-        auto const& data = rawDataHandle->FEDData(fed);
-        auto const nbytes = data.size();
-
-        // skip empty feds
-        if (nbytes < ecal::raw::empty_event_size)
-            continue;
-
-        // copy raw data into plain buffer
-        std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes);
-        // set the offset in bytes from the start
-        inputCPU_.offsets[counter] = currentCummOffset;
-        inputCPU_.feds[counter] = fed;
-
-        // this is the current offset into the vector
-        currentCummOffset += nbytes;
-        ++counter;
-    }
-
-    ecal::raw::entryPoint(
-        inputCPU_, inputGPU_, outputGPU_, scratchGPU_, outputCPU_,
-        conditions, ctx.stream(), counter, currentCummOffset);
+void EcalRawToDigiGPU::acquire(edm::Event const& event,
+                               edm::EventSetup const& setup,
+                               edm::WaitingTaskWithArenaHolder holder) {
+  // Packs the non-empty FED payloads into inputCPU_ and hands them to ecal::raw::entryPoint on the RAII context's stream.
+  cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};  // cudaState_ is reused by produce()
+
+  // conditions: electronics mapping product obtained for this CUDA stream
+  edm::ESHandle<ecal::raw::ElectronicsMappingGPU> eMappingHandle;
+  setup.get<EcalMappingElectronicsRcd>().get(eMappingHandle);
+  auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream());
+
+  // bundle up conditions
+  ecal::raw::ConditionsProducts conditions{eMappingProduct};
+
+  // event data: the raw FED collection
+  edm::Handle<FEDRawDataCollection> rawDataHandle;
+  event.getByToken(rawDataToken_, rawDataHandle);
+
+  // iterate over feds
+  // TODO: another idea
+  //   - loop over all feds to unpack and enqueue a CUDA memcpy for each
+  //   - accumulate the sizes
+  //   - after the loop launch a CUDA memcpy for the sizes
+  //   - enqueue the kernel
+  uint32_t currentCummOffset = 0;  // cumulative byte offset into inputCPU_.data
+  uint32_t counter = 0;            // number of FEDs copied so far
+  for (auto const& fed : fedsToUnpack_) {
+    // NOTE(review): nothing here checks counter or currentCummOffset + nbytes against the inputCPU_ buffer sizes - confirm allocate()
+    auto const& data = rawDataHandle->FEDData(fed);
+    auto const nbytes = data.size();
+
+    // skip feds whose payload is smaller than ecal::raw::empty_event_size
+    if (nbytes < ecal::raw::empty_event_size)
+      continue;
+
+    // copy raw data into plain buffer, right after the previously copied FED
+    std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes);
+    // record where this FED starts (in bytes from the start) and which FED id it is
+    inputCPU_.offsets[counter] = currentCummOffset;
+    inputCPU_.feds[counter] = fed;
+
+    // advance the running offset past this FED's payload
+    currentCummOffset += nbytes;
+    ++counter;
+  }
+  // at this point: counter = number of FEDs copied, currentCummOffset = total number of bytes copied
+  ecal::raw::entryPoint(
+      inputCPU_, inputGPU_, outputGPU_, scratchGPU_, outputCPU_, conditions, ctx.stream(), counter, currentCummOffset);
+}
 
-void EcalRawToDigiGPU::produce(
-        edm::Event& event, 
-        edm::EventSetup const& setup) 
-{
-    cms::cuda::ScopedContextProduce ctx{cudaState_};
-
-    // get the number of channels 
-    auto const nchannelsEB = outputCPU_.nchannels[0];
-    auto const nchannelsEE = outputCPU_.nchannels[1];
-    
-    ecal::DigisCollection digisEB{outputGPU_.idsEB, 
-        outputGPU_.samplesEB, nchannelsEB};
-    ecal::DigisCollection digisEE{outputGPU_.idsEE,
-        outputGPU_.samplesEE, nchannelsEE};
-
-    ctx.emplace(event, digisEBToken_, std::move(digisEB));
-    ctx.emplace(event, digisEEToken_, std::move(digisEE));
+void EcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  cms::cuda::ScopedContextProduce ctx{cudaState_};  // built from the state that acquire() passed to its own context
+
+  // channel counts as stored in outputCPU_.nchannels: index 0 = EB, index 1 = EE
+  auto const nchannelsEB = outputCPU_.nchannels[0];
+  auto const nchannelsEE = outputCPU_.nchannels[1];
+  // NOTE(review): the collections are built from the member outputGPU_ pointers; no copy is visible here - confirm ownership
+  ecal::DigisCollection digisEB{outputGPU_.idsEB, outputGPU_.samplesEB, nchannelsEB};
+  ecal::DigisCollection digisEE{outputGPU_.idsEE, outputGPU_.samplesEE, nchannelsEE};
+
+  ctx.emplace(event, digisEBToken_, std::move(digisEB));
+  ctx.emplace(event, digisEEToken_, std::move(digisEE));
+}
 
 DEFINE_FWK_MODULE(EcalRawToDigiGPU);
diff --git a/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
index c09a963b62a1d..8264c501a896c 100644
--- a/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
+++ b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc
@@ -5,58 +5,53 @@
 
 #include "DataFormats/EcalDetId/interface/EcalElectronicsId.h"
 
-namespace ecal { namespace raw {
-
-// TODO: 0x3FFFFF * 4B ~= 16MB
-// tmp solution for linear mapping of eid -> did
-ElectronicsMappingGPU::ElectronicsMappingGPU(EcalMappingElectronics const& mapping) 
-    : eid2did_(0x3FFFFF)
-{   
-
-    // fill in eb
-    // TODO: EB vector is actually empty
-    auto const& barrelValues = mapping.barrelItems();
-    for (unsigned int i=0; i<barrelValues.size(); i++) {
+namespace ecal {
+  namespace raw {
+
+    // Builds the host-side table eid2did_, indexed by EcalElectronicsId::linearIndex() and holding the DetId raw id.
+    // TODO: tmp solution for linear mapping of eid -> did; 0x3FFFFF entries * 4B ~= 16MB
+    ElectronicsMappingGPU::ElectronicsMappingGPU(EcalMappingElectronics const& mapping) : eid2did_(0x3FFFFF) {  // NOTE(review): valid indices end at 0x3FFFFE - confirm linearIndex() never returns 0x3FFFFF
+      // fill in eb: entry i of barrelItems() is paired with EBDetId::unhashIndex(i)
+      // TODO: EB vector is actually empty
+      auto const& barrelValues = mapping.barrelItems();
+      for (unsigned int i = 0; i < barrelValues.size(); i++) {
         EcalElectronicsId eid{barrelValues[i].electronicsid};
         EBDetId did{EBDetId::unhashIndex(i)};
         eid2did_[eid.linearIndex()] = did.rawId();
-    }
-    
-    // fill in ee
-    auto const& endcapValues = mapping.endcapItems();
-    for (unsigned int i=0; i<endcapValues.size(); i++) {
+      }
+
+      // fill in ee: entry i of endcapItems() is paired with EEDetId::unhashIndex(i)
+      auto const& endcapValues = mapping.endcapItems();
+      for (unsigned int i = 0; i < endcapValues.size(); i++) {
         EcalElectronicsId eid{endcapValues[i].electronicsid};
         EEDetId did{EEDetId::unhashIndex(i)};
         eid2did_[eid.linearIndex()] = did.rawId();
+      }
     }
-}
-
-ElectronicsMappingGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(eid2did) );
-}
-
-ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](ElectronicsMappingGPU::Product& product, cudaStream_t cudaStream) {
+
+    ElectronicsMappingGPU::Product::~Product() {
+      // frees the device buffer that getProduct() allocates with cudaMalloc
+      cudaCheck(cudaFree(eid2did));
+    }
+
+    ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(cudaStream_t cudaStream) const {
+      auto const& product = product_.dataForCurrentDeviceAsync(  // NOTE(review): presumably runs the lambda only when this device's Product is not yet filled - confirm
+          cudaStream, [this](ElectronicsMappingGPU::Product& product, cudaStream_t cudaStream) {
             // malloc
-            cudaCheck( cudaMalloc((void**)&product.eid2did,
-                                  this->eid2did_.size() * sizeof(uint32_t)) );
-
-            // transfer 
-            cudaCheck( cudaMemcpyAsync(product.eid2did,
-                                       this->eid2did_.data(),
-                                       this->eid2did_.size() * sizeof(uint32_t),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
-
-    return product;
-}
-
-}}
+            cudaCheck(cudaMalloc((void**)&product.eid2did, this->eid2did_.size() * sizeof(uint32_t)));
+
+            // transfer: host-to-device copy of the whole table, enqueued asynchronously on cudaStream
+            cudaCheck(cudaMemcpyAsync(product.eid2did,
+                                      this->eid2did_.data(),
+                                      this->eid2did_.size() * sizeof(uint32_t),
+                                      cudaMemcpyHostToDevice,
+                                      cudaStream));
+          });
+
+      return product;
+    }
+
+  }  // namespace raw
+}  // namespace ecal
 
 TYPELOOKUP_DATA_REG(ecal::raw::ElectronicsMappingGPU);
diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
index 8c9f05535b70d..a2e5057bbbf6a 100644
--- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
+++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
@@ -1,476 +1,331 @@
-#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
 #include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h"
+#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
 
-namespace ecal { namespace raw {
+namespace ecal {
+  namespace raw {
 
-__forceinline__ __device__
-void print_raw_buffer(
-        uint8_t const* const buffer, 
-        uint32_t const nbytes, uint32_t const nbytes_per_row = 20) {
-    for (uint32_t i=0; i<nbytes; i++) {
-        if (i % nbytes_per_row == 0 && i>0)
-            printf("\n");
+    __forceinline__ __device__ void print_raw_buffer(uint8_t const* const buffer,  // debug helper: hex dump via device printf
+                                                     uint32_t const nbytes,  // number of bytes of buffer to print
+                                                     uint32_t const nbytes_per_row = 20) {  // bytes printed per output row
+      for (uint32_t i = 0; i < nbytes; i++) {
+        if (i % nbytes_per_row == 0 && i > 0)  // start a new row every nbytes_per_row bytes, but not before the first byte
+          printf("\n");
         printf("%02X ", buffer[i]);
+      }
     }
-}
 
-__forceinline__ __device__
-void print_first3bits(uint64_t const* buffer, uint32_t size) {
-    for (uint32_t i=0; i<size; ++i) {
+    __forceinline__ __device__ void print_first3bits(uint64_t const* buffer, uint32_t size) {  // debug helper; size counts 64-bit words
+      for (uint32_t i = 0; i < size; ++i) {
         uint8_t const b61 = (buffer[i] >> 61) & 0x1;
         uint8_t const b62 = (buffer[i] >> 62) & 0x1;
         uint8_t const b63 = (buffer[i] >> 63) & 0x1;
-        printf("[word: %u] %u%u%u\n", i,
-            b63, b62, b61);
+        printf("[word: %u] %u%u%u\n", i, b63, b62, b61);  // word index, then bits 63, 62, 61 in that order
+      }
+    }
+
+    __forceinline__ __device__ bool is_barrel(uint8_t dccid) {  // true when dccid lies in [MIN_DCCID_EBM, MAX_DCCID_EBP], bounds included
+      return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && dccid <= ElectronicsIdGPU::MAX_DCCID_EBP;
+    }
+
+    __forceinline__ __device__ uint8_t fed2dcc(int fed) { return static_cast<uint8_t>(fed - 600); }  // DCC id = FED id - 600, narrowed to 8 bits
+
+    __forceinline__ __device__ int zside_for_eb(ElectronicsIdGPU const& eid) {  // returns -1 or +1; does not check that eid is in the barrel
+      int dcc = eid.dccId();
+      return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) ? -1 : 1;  // -1 inside the EBM range, +1 for anything else
     }
-}
-
-__forceinline__ __device__
-bool is_barrel(uint8_t dccid) {
-    return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && 
-           dccid <= ElectronicsIdGPU::MAX_DCCID_EBP;
-}
-
-__forceinline__ __device__
-uint8_t fed2dcc(int fed) { return static_cast<uint8_t>(fed - 600); }
-
-__forceinline__ __device__
-int zside_for_eb(ElectronicsIdGPU const& eid) {
-    int dcc = eid.dccId();
-    return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && 
-            dcc <= ElectronicsIdGPU::MAX_DCCID_EBM))
-        ? -1
-        : 1;
-    /*
-    if ((dcc >= MIN_DCCID_EBP && dcc <= MAX_DCCID_EBP))
-        return +1;
-        */
-}
-
-__forceinline__ __device__
-bool is_synced_towerblock(
-        uint16_t const dccbx,
-        uint16_t const bx,
-        uint16_t const dccl1,
-        uint16_t const l1) {
-    bool const bxsync = (bx==0 && dccbx==3564) || (bx==dccbx && dccbx!=3564);
-    bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff));
-    return bxsync && l1sync;
-}
-
-__forceinline__ __device__
-bool right_tower_for_eb(int tower) {
-    // for EB, two types of tower (LVRB top/bottom)
-    if ((tower > 12 && tower < 21) || 
-        (tower > 28 && tower < 37) || 
-        (tower > 44 && tower < 53) ||
-        (tower > 60 && tower < 69))
+
+    __forceinline__ __device__ bool is_synced_towerblock(uint16_t const dccbx,  // bx field of the FED header, 12 bits
+                                                         uint16_t const bx,  // bx field of the tower block header
+                                                         uint16_t const dccl1,  // lv1 field of the FED header, low 12 bits
+                                                         uint16_t const l1) {  // lv1 field of the tower block header
+      bool const bxsync = (bx == 0 && dccbx == 3564) || (bx == dccbx && dccbx != 3564);  // a header bx of 3564 pairs with a tower bx of 0
+      bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff));  // tower value must equal dccl1 - 1, wrapped to 12 bits
+      return bxsync && l1sync;
+    }
+
+    __forceinline__ __device__ bool right_tower_for_eb(int tower) {
+      // for EB, two types of tower (LVRB top/bottom): true for tower ids 13-20, 29-36, 45-52 and 61-68
+      if ((tower > 12 && tower < 21) || (tower > 28 && tower < 37) || (tower > 44 && tower < 53) ||
+          (tower > 60 && tower < 69))
         return true;
-    else
+      else
         return false;
-}
-
-__forceinline__ __device__
-uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) {
-    // as in Geometry/EcalMaping/.../EcalElectronicsMapping
-    auto const dcc = eid.dccId();
-    auto const tower = eid.towerId();
-    auto const strip = eid.stripId();
-    auto const xtal = eid.xtalId();
-
-    int smid = 0;
-    int iphi = 0;
-    bool EBPlus = (zside_for_eb(eid) > 0);
-    bool EBMinus = !EBPlus;
-
-    if (zside_for_eb(eid) < 0) {
+    }
+
+    __forceinline__ __device__ uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) {
+      // as in Geometry/EcalMaping/.../EcalElectronicsMapping
+      auto const dcc = eid.dccId();
+      auto const tower = eid.towerId();
+      auto const strip = eid.stripId();
+      auto const xtal = eid.xtalId();
+
+      int smid = 0;
+      int iphi = 0;
+      bool EBPlus = (zside_for_eb(eid) > 0);
+      bool EBMinus = !EBPlus;
+
+      if (zside_for_eb(eid) < 0) {
         smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM;
         iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi;
         iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi);
-    } else {
+      } else {
         smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP;
         iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi;
         iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1);
-    }
+      }
 
-    bool RightTower = right_tower_for_eb(tower);
-    int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1;
-    if (RightTower) {
+      bool RightTower = right_tower_for_eb(tower);
+      int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1;
+      if (RightTower) {
         ieta += (strip - 1);
         if (strip % 2 == 1) {
-            if (EBMinus)
-                iphi += (xtal - 1) + 1;
-            else
-                iphi += (4 - (xtal - 1)) + 1;
+          if (EBMinus)
+            iphi += (xtal - 1) + 1;
+          else
+            iphi += (4 - (xtal - 1)) + 1;
         } else {
-            if (EBMinus)
-                iphi += (4 - (xtal - 1)) + 1;
-            else
-                iphi += (xtal - 1) + 1;
+          if (EBMinus)
+            iphi += (4 - (xtal - 1)) + 1;
+          else
+            iphi += (xtal - 1) + 1;
         }
-    } else {
+      } else {
         ieta += 4 - (strip - 1);
         if (strip % 2 == 1) {
-            if (EBMinus)
-                iphi += (4 - (xtal - 1)) + 1;
-            else
-                iphi += (xtal - 1) + 1;
+          if (EBMinus)
+            iphi += (4 - (xtal - 1)) + 1;
+          else
+            iphi += (xtal - 1) + 1;
         } else {
-            if (EBMinus)
-                iphi += (xtal - 1) + 1;
-            else
-                iphi += (4 - (xtal - 1)) + 1;
+          if (EBMinus)
+            iphi += (xtal - 1) + 1;
+          else
+            iphi += (4 - (xtal - 1)) + 1;
         }
-    }
-    
-    if (zside_for_eb(eid) < 0)
+      }
+
+      if (zside_for_eb(eid) < 0)
         ieta = -ieta;
 
-    DetId did{DetId::Ecal, EcalBarrel};
-    return  did.rawId() |  
-        ((ieta > 0) 
-            ? (0x10000 | (ieta << 9)) 
-            : ((-ieta) << 9)) | (iphi & 0x1FF);
-}
-
-__forceinline__ __device__
-int adc(uint16_t sample) { return sample & 0xfff; }
-__forceinline__ __device__
-int gainId(uint16_t sample) { return (sample>>12) & 0x3; }
-
-template<int NTHREADS>
-__global__
-void kernel_unpack_test(
-        unsigned char const* __restrict__ data,
-        uint32_t const* __restrict__ offsets,
-        int const* __restrict__ feds,
-        uint16_t *samplesEB,
-        uint16_t *samplesEE,
-        uint32_t *idsEB,
-        uint32_t *idsEE,
-        uint32_t *pChannelsCounterEBEE,
-        uint32_t const* eid2did,
-        uint32_t const nbytesTotal) {
-    // indices
-    auto const ifed = blockIdx.x;
-
-    // FIXME: use only the very first fed
-    //if (ifed!=10) return;
-
-    // offset in bytes
-    auto const offset = offsets[ifed];
-    // fed id
-    auto const fed = feds[ifed];
-    auto const isBarrel = is_barrel(static_cast<uint8_t>(fed - 600));
-    // size
-    auto const size = ifed==gridDim.x-1 ? nbytesTotal - offset : offsets[ifed+1] - offset;
-    auto *samples = isBarrel ? samplesEB : samplesEE;
-    auto *ids = isBarrel ? idsEB : idsEE;
-    auto *pChannelsCounter = isBarrel 
-        ? &pChannelsCounterEBEE[0] 
-        : &pChannelsCounterEBEE[1];
-
-    // FIXME: debugging
-    //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
-
-    // offset to the right raw buffer
-    uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
-
-    // dump first 3 bits for each 64-bit word
-    //print_first3bits(buffer, size / 8);
-
-    //
-    // fed header
-    //
-    //print_raw_buffer(reinterpret_cast<uint8_t const*>(buffer), 8);
-    //printf("\n");
-    auto const fed_header = buffer[0];
-    uint32_t fed_id = (fed_header >> 8) & 0xfff;
-    uint32_t bx = (fed_header >> 20) & 0xfff;
-    uint32_t lv1 = (fed_header >> 32) & 0xffffff;
-    uint8_t trigger_type = (fed_header >> 56) & 0xf;
-    uint8_t const bid_fed_header = (fed_header >> 60) & 0xf;
-    //printf("fed = %d fed_id = %u bx = %u lv1 = %u tt=%hhu  bid = 0x%u\n",
-    //    fed, fed_id, bx, lv1, trigger_type, bid_fed_header);
-
-    //
-    // dcc header: w1
-    //
-    //print_raw_buffer(reinterpret_cast<uint8_t const*>(buffer + 1), 8);
-    //printf("\n");
-    auto const dcc_header = buffer[1];
-    uint32_t event_length = dcc_header & 0xffffff;
-    uint8_t dcc_errors = (dcc_header >> 24) & 0xff;
-    uint32_t run_number = (dcc_header >> 32) & 0xffffff;
-    uint8_t const word_dcc = (dcc_header >> 56) & 0x3f;
-    uint8_t const bid_dcc_header = (dcc_header >> 62) & 0x3;
-    //printf("fed = %d size = %u event_length = %u dcc_errors = %u run_number = %u word_dcc = 0x%u bid_dcc_header = 0x%u\n",
-    //    fed, size, 8*event_length, static_cast<uint32_t>(dcc_errors), run_number, static_cast<uint32_t>(word_dcc), static_cast<uint32_t>(bid_dcc_header));
-
-    // 
-    // dcc header w2
-    //
-    //print_raw_buffer(reinterpret_cast<uint8_t const*>(buffer + 2), 8);
-    //printf("\n");
-    auto const w2 = buffer[2];
-    uint32_t const run_type = w2 & 0xffffffff;
-    uint16_t const det_trigger_type = (w2 >> 32) & 0xffff;
-    uint8_t w2_dcc = (w2 >> 56) & 0x3f;
-    uint8_t w2_bid_dcc = (w2 >> 62) & 0x3;
-    //printf("run_type = %u det_trigger_type = %u w2_dcc = %u w2_bid_dcc = %u\n", 
-    //    run_type, det_trigger_type, w2_dcc, w2_bid_dcc);
-
-    //
-    // dcc header w3
-    //
-    auto const w3 = buffer[3];
-    //print_raw_buffer(reinterpret_cast<uint8_t const*>(&w3), 8);
-    //printf("\n");
-    uint32_t const orbit_number = w3 & 0xffffffff;
-    uint8_t const sr = (w3 >> 32) & 0x1;
-    uint8_t const zs = (w3 >> 33) & 0x1;
-    uint8_t const tzs = (w3 >> 34) & 0x1;
-    uint8_t const sr_chstatus = (w3 >> 36) & 0xf;
-    uint8_t const tcc_chstatus1 = (w3 >> 40) & 0xf;
-    uint8_t const tcc_chstatus2 = (w3 >> 44) & 0xf;
-    uint8_t const tcc_chstatus3 = (w3 >> 48) & 0xf;
-    uint8_t const tcc_chstatus4 = (w3 >> 52) & 0xf;
-    uint8_t const w3_dcc = (w3 >> 56) & 0x3f;
-    uint8_t const w3_bid_dcc = (w3 >> 62) & 0x3;
-    //printf("orbit_number = %u sr = %u zs = %u tzs = %u sr_chstatus = %u\n",
-    //    orbit_number, static_cast<uint32_t>(sr), static_cast<uint32_t>(zs),
-    //    static_cast<uint32_t>(tzs), static_cast<uint32_t>(sr_chstatus));
-    //printf("tcc_chstatus1 = %u tcc_chstatus2 = %u tcc_chstatus3 = %u tcc_chstatus4 = %u\n",
-    //    static_cast<uint32_t>(tcc_chstatus1), static_cast<uint32_t>(tcc_chstatus2),
-    //    static_cast<uint32_t>(tcc_chstatus3), static_cast<uint32_t>(tcc_chstatus4));
-
-    //
-    // w4 - w8 (including 5 64-bit words)
-    //
-    /*
-    for (uint32_t i=0; i<5; i++) {
-        auto const wi = buffer[4 + i];
-        for (uint32_t i=0; i<14; i++) {
-            uint8_t value_i = (wi >> i*4) & 0xf;
-            printf("fe_chstatus_%u = %u  ", i, static_cast<uint32_t>(value_i));
-        }
-        uint8_t wi_dcc = (wi >> 56) & 0x3f;
-        uint8_t wi_bid_dcc = (wi >> 62) & 0x3;
-        printf("wi_dcc = %u wi_bid-dcc = %u\n", 
-            static_cast<uint32_t>(wi_dcc), static_cast<uint32_t>(wi_bid_dcc));
-        printf("\n");
-    }
-    */
-
-    //
-    // TCC block
-    //
-    {
-        auto const w = buffer[9];
-        //print_raw_buffer(reinterpret_cast<uint8_t const*>(&w), 8);
-        //printf("\n");
-        uint8_t const tccid = w & 0xff;
-        uint8_t const bxlocal = (w >> 16) & 0xff;
-        uint8_t const e0 = (w >> 17) & 0x1;
-        uint8_t const w_bfield_0 = (w >> 29) & 0x7;
-        uint16_t const lv1local = (w >> 32) & 0xfff;
-        uint8_t const e1 = (w >> 44) & 0x1;
-        uint8_t const ntt = (w >> 48) & 0x7f;
-        uint8_t const ntimesamples = (w >> 55) & 0xf;
-        uint8_t const le0 = (w >> 59) & 0x1;
-        uint8_t const le1 = (w >> 60) & 0x1;
-        uint8_t const w_bfield_1 = (w >> 61) & 0x7;
-        //printf("tccid = %u bxlocal = %u e0 = %u w_bitfield_0 = %u lv1local = %u\n",
-        //    tccid, bxlocal, e0, w_bfield_0, lv1local);
-        //printf("e1 = %u ntt = %u ntimesamples = %u le0 = %u le1 = %u w_bfield_1 = %u\n",
-        //    e1, ntt, ntimesamples, le0, le1, w_bfield_1);
+      DetId did{DetId::Ecal, EcalBarrel};
+      return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF);
     }
 
-    // 9 for fed + dcc header 
-    // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block
-    // 6 for SR block size
-    //print_first3bits(buffer, size / 8);
-    //auto const* tower_block_start = buffer + 9 + 36 + 6;
-    //print_first3bits(tower_block_start, size / 8 - 10 - 36 - 6);
-
-    //
-    // print Tower block headers
-    //
-    uint8_t ntccblockwords = isBarrel ? 18 : 36;
-    auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6;
-    auto const* trailer = buffer + (size / 8 - 1);
-    auto const* current_tower_block = tower_blocks_start;
-    while (current_tower_block != trailer) {
+    __forceinline__ __device__ int adc(uint16_t sample) { return sample & 0xfff; }  // low 12 bits of the sample word
+
+    __forceinline__ __device__ int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }  // bits 12-13 of the sample word
+
+    template <int NTHREADS>
+    __global__ void kernel_unpack_test(unsigned char const* __restrict__ data,
+                                       uint32_t const* __restrict__ offsets,
+                                       int const* __restrict__ feds,
+                                       uint16_t* samplesEB,
+                                       uint16_t* samplesEE,
+                                       uint32_t* idsEB,
+                                       uint32_t* idsEE,
+                                       uint32_t* pChannelsCounterEBEE,
+                                       uint32_t const* eid2did,
+                                       uint32_t const nbytesTotal) {
+      // indices
+      auto const ifed = blockIdx.x;
+
+      // FIXME: use only the very first fed
+      //if (ifed!=10) return;
+
+      // offset in bytes
+      auto const offset = offsets[ifed];
+      // fed id
+      auto const fed = feds[ifed];
+      auto const isBarrel = is_barrel(static_cast<uint8_t>(fed - 600));
+      // size
+      auto const size = ifed == gridDim.x - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset;
+      auto* samples = isBarrel ? samplesEB : samplesEE;
+      auto* ids = isBarrel ? idsEB : idsEE;
+      auto* pChannelsCounter = isBarrel ? &pChannelsCounterEBEE[0] : &pChannelsCounterEBEE[1];
+
+      // FIXME: debugging
+      //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
+
+      // offset to the right raw buffer
+      uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);
+
+      // dump first 3 bits for each 64-bit word
+      //print_first3bits(buffer, size / 8);
+
+      //
+      // fed header
+      //
+      auto const fed_header = buffer[0];
+      uint32_t bx = (fed_header >> 20) & 0xfff;
+      uint32_t lv1 = (fed_header >> 32) & 0xffffff;
+
+      // 9 for fed + dcc header
+      // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block
+      // 6 for SR block size
+
+      //
+      // print Tower block headers
+      //
+      uint8_t ntccblockwords = isBarrel ? 18 : 36;
+      auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6;
+      auto const* trailer = buffer + (size / 8 - 1);
+      auto const* current_tower_block = tower_blocks_start;
+      while (current_tower_block != trailer) {
         auto const w = *current_tower_block;
         uint8_t ttid = w & 0xff;
-        uint8_t ntimesamples = (w >> 8) & 0x7f;
         uint16_t bxlocal = (w >> 16) & 0xfff;
-        uint8_t e0 = (w >> 28) & 0x1;
-        uint8_t w_bfield_0 = (w >> 30) & 0x3;
         uint16_t lv1local = (w >> 32) & 0xfff;
-        uint8_t e1 = (w >> 44) & 0x1;
         uint16_t block_length = (w >> 48) & 0x1ff;
-        uint16_t w_bfield_1 = (w >> 62) & 0x3;
 
-        // 
         uint16_t const dccbx = bx & 0xfff;
         uint16_t const dccl1 = lv1 & 0xfff;
-        //printf("dccbx = %u bxlocal = %u dccl1 = %u l1local = %u\n",
-        //    dccbx, bxlocal, dccl1, lv1local);
         if (!is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) {
-            current_tower_block += block_length;
-            continue;
+          current_tower_block += block_length;
+          continue;
         }
 
-        //printf("ttid = %u ntimesamples = %u\ bxlocal = %u e0 = %u w_bfield_0 = %u\n", 
-        //    ttid, ntimesamples, bxlocal, e0, w_bfield_0);
-        //printf("lv1local = %u e1 = %u block_length = %u w_bfield-1 = %u\n",
-        //    lv1local, e1, block_length, w_bfield_1);
-
-        // go thru all the channels
+        // go through all the channels
         // get the next channel coordinates
         uint32_t nchannels = (block_length - 1) / 3;
 
         // 1 threads per channel in this block
-        for (uint32_t ich=0; ich<nchannels; ich+=NTHREADS) {
-            auto const i_to_access = ich + threadIdx.x;
-            // threads outside of the range -> leave the loop
-            if (i_to_access>=nchannels) break;
-
-            // inc the channel's counter and get the pos where to store
-            auto const wdata = current_tower_block[1 + i_to_access*3];
-            uint8_t const stripid = wdata & 0x7;
-            uint8_t const xtalid = (wdata >> 4) & 0x7;
-            ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid};
-            auto const didraw = isBarrel 
-                ? compute_ebdetid(eid)
-                : eid2did[eid.linearIndex()];
-            // FIXME: what kind of channels are these guys
-            if (didraw == 0) 
-                continue;
-            
-            // get samples
-            uint16_t sampleValues[10];
-            sampleValues[0] = (wdata >> 16) & 0x3fff;
-            sampleValues[1] = (wdata >> 32) & 0x3fff;
-            sampleValues[2] = (wdata >> 48) & 0x3fff;
-            auto const wdata1 = current_tower_block[2+i_to_access*3];
-            sampleValues[3] = wdata1 & 0x3fff;
-            sampleValues[4] = (wdata1 >> 16) & 0x3fff;
-            sampleValues[5] = (wdata1 >> 32) & 0x3fff;
-            sampleValues[6] = (wdata1 >> 48) & 0x3fff;
-            auto const wdata2 = current_tower_block[3+i_to_access*3];
-            sampleValues[7] = wdata2 & 0x3fff;
-            sampleValues[8] = (wdata2 >> 16) & 0x3fff;
-            sampleValues[9] = (wdata2 >> 32) & 0x3fff;
-            //printf("stripid = %u xtalid = %u\n", stripid, xtalid);
-            
-            // check gain
-            bool isSaturation = true;
-            short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1};
-            for (uint32_t si=0; si<10; si++) {
-                if (gainId(sampleValues[si]) == 0) {
-                    firstGainZeroSampID = si;
-                    firstGainZeroSampADC = adc(sampleValues[si]);
-                    break;
-                }
+        for (uint32_t ich = 0; ich < nchannels; ich += NTHREADS) {
+          auto const i_to_access = ich + threadIdx.x;
+          // threads outside of the range -> leave the loop
+          if (i_to_access >= nchannels)
+            break;
+
+          // inc the channel's counter and get the pos where to store
+          auto const wdata = current_tower_block[1 + i_to_access * 3];
+          uint8_t const stripid = wdata & 0x7;
+          uint8_t const xtalid = (wdata >> 4) & 0x7;
+          ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid};
+          auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()];
+          // FIXME: what kind of channels are these guys
+          if (didraw == 0)
+            continue;
+
+          // get samples
+          uint16_t sampleValues[10];
+          sampleValues[0] = (wdata >> 16) & 0x3fff;
+          sampleValues[1] = (wdata >> 32) & 0x3fff;
+          sampleValues[2] = (wdata >> 48) & 0x3fff;
+          auto const wdata1 = current_tower_block[2 + i_to_access * 3];
+          sampleValues[3] = wdata1 & 0x3fff;
+          sampleValues[4] = (wdata1 >> 16) & 0x3fff;
+          sampleValues[5] = (wdata1 >> 32) & 0x3fff;
+          sampleValues[6] = (wdata1 >> 48) & 0x3fff;
+          auto const wdata2 = current_tower_block[3 + i_to_access * 3];
+          sampleValues[7] = wdata2 & 0x3fff;
+          sampleValues[8] = (wdata2 >> 16) & 0x3fff;
+          sampleValues[9] = (wdata2 >> 32) & 0x3fff;
+          //printf("stripid = %u xtalid = %u\n", stripid, xtalid);
+
+          // check gain
+          bool isSaturation = true;
+          short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1};
+          for (uint32_t si = 0; si < 10; si++) {
+            if (gainId(sampleValues[si]) == 0) {
+              firstGainZeroSampID = si;
+              firstGainZeroSampADC = adc(sampleValues[si]);
+              break;
+            }
+          }
+          if (firstGainZeroSampID != -1) {
+            unsigned int plateauEnd = std::min(10u, (unsigned int)(firstGainZeroSampID + 5));
+            for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) {
+              if (gainId(sampleValues[s]) == 0 && adc(sampleValues[s]) == firstGainZeroSampADC) {
+                ;
+              } else {
+                isSaturation = false;
+                break;
+              }  //it's not saturation
+            }
+            // get rid of channels which are stuck in gain0
+            if (firstGainZeroSampID < 3) {
+              isSaturation = false;
             }
-            if (firstGainZeroSampID!=-1) {
-                unsigned int plateauEnd = std::min(10u ,(unsigned int)(firstGainZeroSampID+5));
-                for (unsigned int s=firstGainZeroSampID; s<plateauEnd; s++) {
-                    if( gainId(sampleValues[s])==0 && 
-                        adc(sampleValues[s])==firstGainZeroSampADC ) {;}
-                    else { isSaturation=false;  break;}  //it's not saturation
-                }     
-                // get rid of channels which are stuck in gain0
-                if(firstGainZeroSampID<3) {isSaturation=false; }
-                if (!isSaturation)
-                    continue;
-            } else { // there is no zero gainId sample
-                // gain switch check
-                short numGain=1;
-                bool gainSwitchError = false;
-                for (unsigned int si=1; si<10; si++) {
-                    if ((gainId(sampleValues[si-1]) > gainId(sampleValues[si])) && 
-                        numGain<5) gainSwitchError=true;
-                    if (gainId(sampleValues[si-1]) == gainId(sampleValues[si])) numGain++;
-                    else numGain=1;
-                }
-                if (gainSwitchError)
-                    continue;
+            if (!isSaturation)
+              continue;
+          } else {  // there is no zero gainId sample
+            // gain switch check
+            short numGain = 1;
+            bool gainSwitchError = false;
+            for (unsigned int si = 1; si < 10; si++) {
+              if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5)
+                gainSwitchError = true;
+              if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si]))
+                numGain++;
+              else
+                numGain = 1;
             }
-            
-            auto const pos = atomicAdd(pChannelsCounter, 1);
-        
-            // store to global
-            ids[pos] = didraw;
-            samples[pos*10] = sampleValues[0];
-            samples[pos*10 + 1] = sampleValues[1];
-            samples[pos*10 + 2] = sampleValues[2];
-            samples[pos*10 + 3] = sampleValues[3];
-            samples[pos*10 + 4] = sampleValues[4];
-            samples[pos*10 + 5] = sampleValues[5];
-            samples[pos*10 + 6] = sampleValues[6];
-            samples[pos*10 + 7] = sampleValues[7];
-            samples[pos*10 + 8] = sampleValues[8];
-            samples[pos*10 + 9] = sampleValues[9];
+            if (gainSwitchError)
+              continue;
+          }
+
+          auto const pos = atomicAdd(pChannelsCounter, 1);
+
+          // store to global
+          ids[pos] = didraw;
+          samples[pos * 10] = sampleValues[0];
+          samples[pos * 10 + 1] = sampleValues[1];
+          samples[pos * 10 + 2] = sampleValues[2];
+          samples[pos * 10 + 3] = sampleValues[3];
+          samples[pos * 10 + 4] = sampleValues[4];
+          samples[pos * 10 + 5] = sampleValues[5];
+          samples[pos * 10 + 6] = sampleValues[6];
+          samples[pos * 10 + 7] = sampleValues[7];
+          samples[pos * 10 + 8] = sampleValues[8];
+          samples[pos * 10 + 9] = sampleValues[9];
         }
 
         current_tower_block += block_length;
+      }
     }
-}
-
-void entryPoint(
-        InputDataCPU const& inputCPU, 
-        InputDataGPU& inputGPU,
-        OutputDataGPU& outputGPU,
-        ScratchDataGPU& scratchGPU,
-        OutputDataCPU& outputCPU,
-        ConditionsProducts const& conditions,
-        cudaStream_t cudaStream,
-        uint32_t const nfedsWithData,
-        uint32_t const nbytesTotal) {
-    // transfer
-    cudaCheck( cudaMemcpyAsync(inputGPU.data,
-                               inputCPU.data.data(),
-                               nbytesTotal * sizeof(unsigned char),
-                               cudaMemcpyHostToDevice,
-                               cudaStream) );
-    cudaCheck( cudaMemcpyAsync(inputGPU.offsets,
-                               inputCPU.offsets.data(),
-                               nfedsWithData * sizeof(uint32_t),
-                               cudaMemcpyHostToDevice,
-                               cudaStream) );
-    cudaCheck( cudaMemsetAsync(scratchGPU.pChannelsCounter,
-                               0,
-                               sizeof(uint32_t) * 2, // EB + EE
-                               cudaStream) );
-    cudaCheck( cudaMemcpyAsync(inputGPU.feds,
-                               inputCPU.feds.data(),
-                               nfedsWithData * sizeof(int),
-                               cudaMemcpyHostToDevice,
-                               cudaStream) );
-
-    kernel_unpack_test<32><<<nfedsWithData,32, 0, cudaStream>>>(
-        inputGPU.data,
-        inputGPU.offsets,
-        inputGPU.feds,
-        outputGPU.samplesEB,
-        outputGPU.samplesEE,
-        outputGPU.idsEB,
-        outputGPU.idsEE,
-        scratchGPU.pChannelsCounter,
-        conditions.eMappingProduct.eid2did,
-        nbytesTotal
-    );
-    cudaCheck( cudaGetLastError() );
-
-    // transfer the counters for how many eb and ee channels we got
-    cudaCheck( cudaMemcpyAsync(outputCPU.nchannels.data(),
-                               scratchGPU.pChannelsCounter,
-                               sizeof(uint32_t) * 2,
-                               cudaMemcpyDeviceToHost,
-                               cudaStream) );
-}
-
-}}
+
+    // Host-side driver of the ECAL raw -> digi unpacking. Everything is enqueued asynchronously on
+    // cudaStream; no synchronization is done here, so outputCPU.nchannels may only be read once
+    // the stream has completed. nfedsWithData is used as the number of blocks of the unpacking
+    // kernel and as the number of entries copied from inputCPU.offsets and inputCPU.feds.
+    void entryPoint(InputDataCPU const& inputCPU,
+                    InputDataGPU& inputGPU,
+                    OutputDataGPU& outputGPU,
+                    ScratchDataGPU& scratchGPU,
+                    OutputDataCPU& outputCPU,
+                    ConditionsProducts const& conditions,
+                    cudaStream_t cudaStream,
+                    uint32_t const nfedsWithData,
+                    uint32_t const nbytesTotal) {
+      // transfer the raw data, the per-FED offsets and the FED ids to the device
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
+      cudaCheck(cudaMemcpyAsync(inputGPU.offsets,
+                                inputCPU.offsets.data(),
+                                nfedsWithData * sizeof(uint32_t),
+                                cudaMemcpyHostToDevice,
+                                cudaStream));
+      cudaCheck(cudaMemcpyAsync(
+          inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+      // reset the two channel counters before the kernel increments them
+      cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounter, 0, sizeof(uint32_t) * 2, cudaStream));  // EB + EE
+
+      // a grid of 0 blocks is an invalid launch configuration: with no FEDs to unpack skip the
+      // kernel, the zeroed counters are still copied back below
+      if (nfedsWithData > 0) {
+        kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(
+            inputGPU.data, inputGPU.offsets, inputGPU.feds,
+            outputGPU.samplesEB, outputGPU.samplesEE, outputGPU.idsEB, outputGPU.idsEE,
+            scratchGPU.pChannelsCounter, conditions.eMappingProduct.eid2did, nbytesTotal);
+        cudaCheck(cudaGetLastError());
+      }
+
+      // transfer the counters for how many eb and ee channels we got
+      cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(),
+                                scratchGPU.pChannelsCounter,
+                                sizeof(uint32_t) * 2,
+                                cudaMemcpyDeviceToHost,
+                                cudaStream));
+    }
+
+  }  // namespace raw
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
index e0cca70f93795..4d50b758d39f3 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
@@ -16,268 +16,288 @@
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 
 int main(int argc, char *argv[]) {
-    if (argc<3) {
-        std::cout << "run with: ./validateGPU <path to input file> <output file>\n";
-        exit(0);
-    }
+  if (argc < 3) {
+    std::cout << "run with: ./validateGPU <path to input file> <output file>\n";
+    exit(0);
+  }
 
-    edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB=nullptr;
-    edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE=nullptr;
-    edm::Wrapper<EBUncalibratedRecHitCollection> *wcpuEB = nullptr;
-    edm::Wrapper<EEUncalibratedRecHitCollection> *wcpuEE = nullptr;
+  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB = nullptr;
+  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE = nullptr;
+  edm::Wrapper<EBUncalibratedRecHitCollection> *wcpuEB = nullptr;
+  edm::Wrapper<EEUncalibratedRecHitCollection> *wcpuEE = nullptr;
 
-    std::string fileName = argv[1];
-    std::string outFileName = argv[2];
+  std::string fileName = argv[1];
+  std::string outFileName = argv[2];
 
-    // output
-    TFile rfout{outFileName.c_str(), "recreate"};
+  // output
+  TFile rfout{outFileName.c_str(), "recreate"};
 
-    int nbins = 300;
-    float last = 3000.;
+  int nbins = 300;
+  float last = 3000.;
 
-    int nbins_chi2 = 1000;
-    float last_chi2 = 1000.;
+  int nbins_chi2 = 1000;
+  float last_chi2 = 1000.;
 
-    int nbins_delta = 201;  // use an odd number to center around 0
-    float delta = 0.2;
+  int nbins_delta = 201;  // use an odd number to center around 0
+  float delta = 0.2;
 
-    auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last);
-    auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
-    auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
-    auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
-    auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
-    auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last);
+  auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
+  auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
+  auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
+  auto hSOIAmplitudesEBGPUCPUratio =
+      new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hSOIAmplitudesEEGPUCPUratio =
+      new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
 
-    auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
-    auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
-    auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2);
-    auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2);
+  auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2);
 
-    auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
-    auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
-    auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-    auto hSOIAmplitudesEEdeltavsCPU = new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hSOIAmplitudesEBGPUvsCPU =
+      new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  auto hSOIAmplitudesEEGPUvsCPU =
+      new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  auto hSOIAmplitudesEBdeltavsCPU =
+      new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hSOIAmplitudesEEdeltavsCPU =
+      new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
 
-    auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
-    auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
-    auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
-    auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  auto hChi2EBGPUvsCPU =
+      new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+  auto hChi2EEGPUvsCPU =
+      new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+  auto hChi2EBdeltavsCPU =
+      new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  auto hChi2EEdeltavsCPU =
+      new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
 
-    // input
-    std::cout << "validating file " << fileName << std::endl;
-    TFile rf{fileName.c_str()};
-    TTree *rt = (TTree*)rf.Get("Events");
-    rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB);
-    rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE);
-    rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB);
-    rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE);
+  // input
+  std::cout << "validating file " << fileName << std::endl;
+  TFile rf{fileName.c_str()};
+  TTree *rt = (TTree *)rf.Get("Events");
+  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.",
+                       &wgpuEB);
+  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.",
+                       &wgpuEE);
+  rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB);
+  rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE);
 
-    constexpr float eps_diff = 1e-3;
+  constexpr float eps_diff = 1e-3;
 
-    // accumulate
-    auto const nentries = rt->GetEntries();
-    std::cout << "#events to validate over: " << nentries << std::endl;
-    for (int ie=0; ie<nentries; ++ie) {
-        rt->GetEntry(ie);
+  // accumulate
+  auto const nentries = rt->GetEntries();
+  std::cout << "#events to validate over: " << nentries << std::endl;
+  for (int ie = 0; ie < nentries; ++ie) {
+    rt->GetEntry(ie);
 
-        const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" };
-        auto cpu_eb_size = wcpuEB->bareProduct().size();
-        auto cpu_ee_size = wcpuEE->bareProduct().size();
-        auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size();
-        auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size();
-        if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
-          std::cerr << ie << ordinal[ie % 10] << " entry:\n"
-                    << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n"
-                    << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl;
-          continue;
-        }
+    const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"};
+    auto cpu_eb_size = wcpuEB->bareProduct().size();
+    auto cpu_ee_size = wcpuEE->bareProduct().size();
+    auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size();
+    auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size();
+    if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
+      std::cerr << ie << ordinal[ie % 10] << " entry:\n"
+                << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size
+                << " (gpu)\n"
+                << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size
+                << " (gpu)" << std::endl;
+      continue;
+    }
 
-        assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size());
-        assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size());
-        auto const neb = wcpuEB->bareProduct().size();
-        auto const nee = wcpuEE->bareProduct().size();
+    assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size());
+    assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size());
+    auto const neb = wcpuEB->bareProduct().size();
+    auto const nee = wcpuEE->bareProduct().size();
 
-        for (uint32_t i=0; i<neb; ++i) {
-            auto const did_gpu = wgpuEB->bareProduct().did[i];
-            auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i];
-            auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu});
-            if (cpu_iter == wcpuEB->bareProduct().end()) {
-                std::cerr << ie << ordinal[ie % 10] << " entry\n"
-                          << "  Did not find a DetId " << did_gpu
-                          << " in a CPU collection\n";
-                continue;
-            }
-            auto const soi_amp_cpu = cpu_iter->amplitude();
-            auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
-            auto const chi2_cpu = cpu_iter->chi2();
+    for (uint32_t i = 0; i < neb; ++i) {
+      auto const did_gpu = wgpuEB->bareProduct().did[i];
+      auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i];
+      auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu});
+      if (cpu_iter == wcpuEB->bareProduct().end()) {
+        std::cerr << ie << ordinal[ie % 10] << " entry\n"
+                  << "  Did not find a DetId " << did_gpu << " in a CPU collection\n";
+        continue;
+      }
+      auto const soi_amp_cpu = cpu_iter->amplitude();
+      auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
+      auto const chi2_cpu = cpu_iter->chi2();
 
-            hSOIAmplitudesEBGPU->Fill(soi_amp_gpu);
-            hSOIAmplitudesEBCPU->Fill(soi_amp_cpu);
-            hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-            hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
-            hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
-            hChi2EBGPU->Fill(chi2_gpu);
-            hChi2EBCPU->Fill(chi2_cpu);
-            hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-            hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+      hSOIAmplitudesEBGPU->Fill(soi_amp_gpu);
+      hSOIAmplitudesEBCPU->Fill(soi_amp_cpu);
+      hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
+      hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
+      hSOIAmplitudesEBGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu);
+      hChi2EBGPU->Fill(chi2_gpu);
+      hChi2EBCPU->Fill(chi2_cpu);
+      hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+      hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
 
-            if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
-                (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
-            {
-                printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-                    ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
-                if (std::isnan(chi2_gpu))
-                  printf("*** nan ***\n");
-            }
-        }
+      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
+          std::isnan(chi2_gpu)) {
+        printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+               ie,
+               i,
+               soi_amp_gpu,
+               soi_amp_cpu,
+               chi2_gpu,
+               chi2_cpu);
+        if (std::isnan(chi2_gpu))
+          printf("*** nan ***\n");
+      }
+    }
 
-        for (uint32_t i=0; i<nee; ++i) {
-            auto const did_gpu = wgpuEE->bareProduct().did[i];
-            auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i];
-            auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu});
-            if (cpu_iter == wcpuEE->bareProduct().end()) {
-                std::cerr << ie << ordinal[ie % 10] << " entry\n"
-                          << "  did not find a DetId " << did_gpu
-                          << " in a CPU collection\n";
-                continue;
-            }
-            auto const soi_amp_cpu = cpu_iter->amplitude();
-            auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
-            auto const chi2_cpu = cpu_iter->chi2();
+    for (uint32_t i = 0; i < nee; ++i) {
+      auto const did_gpu = wgpuEE->bareProduct().did[i];
+      auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i];
+      auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu});
+      if (cpu_iter == wcpuEE->bareProduct().end()) {
+        std::cerr << ie << ordinal[ie % 10] << " entry\n"
+                  << "  did not find a DetId " << did_gpu << " in a CPU collection\n";
+        continue;
+      }
+      auto const soi_amp_cpu = cpu_iter->amplitude();
+      auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
+      auto const chi2_cpu = cpu_iter->chi2();
 
-            hSOIAmplitudesEEGPU->Fill(soi_amp_gpu);
-            hSOIAmplitudesEECPU->Fill(soi_amp_cpu);
-            hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-            hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
-            hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
-            hChi2EEGPU->Fill(chi2_gpu);
-            hChi2EECPU->Fill(chi2_cpu);
-            hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-            hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+      hSOIAmplitudesEEGPU->Fill(soi_amp_gpu);
+      hSOIAmplitudesEECPU->Fill(soi_amp_cpu);
+      hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
+      hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
+      hSOIAmplitudesEEGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu);
+      hChi2EEGPU->Fill(chi2_gpu);
+      hChi2EECPU->Fill(chi2_cpu);
+      hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+      hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
 
-            if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
-                (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
-            {
-                printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-                    ie, static_cast<int>(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
-                if (std::isnan(chi2_gpu))
-                  printf("*** nan ***\n");
-            }
-        }
+      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
+          std::isnan(chi2_gpu)) {
+        printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+               ie,
+               static_cast<int>(neb + i),
+               soi_amp_gpu,
+               soi_amp_cpu,
+               chi2_gpu,
+               chi2_cpu);
+        if (std::isnan(chi2_gpu))
+          printf("*** nan ***\n");
+      }
     }
+  }
 
-    {
-      TCanvas c("plots", "plots", 4200, 6200);
-      c.Divide(2, 4);
-
-      c.cd(1);
-      {
-          gPad->SetLogy();
-          hSOIAmplitudesEBCPU->SetLineColor(kBlack);
-          hSOIAmplitudesEBCPU->SetLineWidth(1.);
-          hSOIAmplitudesEBCPU->Draw("");
-          hSOIAmplitudesEBGPU->SetLineColor(kBlue);
-          hSOIAmplitudesEBGPU->SetLineWidth(1.);
-          hSOIAmplitudesEBGPU->Draw("sames");
-          gPad->Update();
-          auto stats = (TPaveStats*)hSOIAmplitudesEBGPU->FindObject("stats");
-          auto y2 = stats->GetY2NDC();
-          auto y1 = stats->GetY1NDC();
-          stats->SetY2NDC(y1);
-          stats->SetY1NDC(y1 - (y2-y1));
-      }
-      c.cd(2);
-      {
-          gPad->SetLogy();
-          hSOIAmplitudesEECPU->SetLineColor(kBlack);
-          hSOIAmplitudesEECPU->SetLineWidth(1.);
-          hSOIAmplitudesEECPU->Draw("");
-          hSOIAmplitudesEEGPU->SetLineColor(kBlue);
-          hSOIAmplitudesEEGPU->SetLineWidth(1.);
-          hSOIAmplitudesEEGPU->Draw("sames");
-          gPad->Update();
-          auto stats = (TPaveStats*)hSOIAmplitudesEEGPU->FindObject("stats");
-          auto y2 = stats->GetY2NDC();
-          auto y1 = stats->GetY1NDC();
-          stats->SetY2NDC(y1);
-          stats->SetY1NDC(y1 - (y2-y1));
-      }
-      c.cd(3);
-      hSOIAmplitudesEBGPUvsCPU->Draw("COLZ");
-      c.cd(4);
-      hSOIAmplitudesEEGPUvsCPU->Draw("COLZ");
-      c.cd(5);
-      hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
-      c.cd(6);
-      hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
-      c.cd(7);
-      {
-          gPad->SetLogy();
-          hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack);
-          hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.);
-          hSOIAmplitudesEBGPUCPUratio->Draw("");
-      }
-      c.cd(8);
-      {
-          gPad->SetLogy();
-          hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack);
-          hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.);
-          hSOIAmplitudesEEGPUCPUratio->Draw("");
-      }
+  {
+    TCanvas c("plots", "plots", 4200, 6200);
+    c.Divide(2, 4);
 
-      c.SaveAs("ecal-amplitudes.pdf");
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hSOIAmplitudesEBCPU->SetLineColor(kBlack);
+      hSOIAmplitudesEBCPU->SetLineWidth(1.);
+      hSOIAmplitudesEBCPU->Draw("");
+      hSOIAmplitudesEBGPU->SetLineColor(kBlue);
+      hSOIAmplitudesEBGPU->SetLineWidth(1.);
+      hSOIAmplitudesEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hSOIAmplitudesEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(2);
+    {
+      gPad->SetLogy();
+      hSOIAmplitudesEECPU->SetLineColor(kBlack);
+      hSOIAmplitudesEECPU->SetLineWidth(1.);
+      hSOIAmplitudesEECPU->Draw("");
+      hSOIAmplitudesEEGPU->SetLineColor(kBlue);
+      hSOIAmplitudesEEGPU->SetLineWidth(1.);
+      hSOIAmplitudesEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hSOIAmplitudesEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
+    c.cd(3);
+    hSOIAmplitudesEBGPUvsCPU->Draw("COLZ");
+    c.cd(4);
+    hSOIAmplitudesEEGPUvsCPU->Draw("COLZ");
+    c.cd(5);
+    hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
+    c.cd(6);
+    hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
+    c.cd(7);
     {
-      TCanvas c("plots", "plots", 4200, 6200);
-      c.Divide(2, 3);
+      gPad->SetLogy();
+      hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack);
+      hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.);
+      hSOIAmplitudesEBGPUCPUratio->Draw("");
+    }
+    c.cd(8);
+    {
+      gPad->SetLogy();
+      hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack);
+      hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.);
+      hSOIAmplitudesEEGPUCPUratio->Draw("");
+    }
 
-      c.cd(1);
-      {
-          gPad->SetLogy();
-          hChi2EBCPU->SetLineColor(kBlack);
-          hChi2EBCPU->SetLineWidth(1.);
-          hChi2EBCPU->Draw("");
-          hChi2EBGPU->SetLineColor(kBlue);
-          hChi2EBGPU->SetLineWidth(1.);
-          hChi2EBGPU->Draw("sames");
-          gPad->Update();
-          auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats");
-          auto y2 = stats->GetY2NDC();
-          auto y1 = stats->GetY1NDC();
-          stats->SetY2NDC(y1);
-          stats->SetY1NDC(y1 - (y2-y1));
-      }
-      c.cd(2);
-      {
-          gPad->SetLogy();
-          hChi2EECPU->SetLineColor(kBlack);
-          hChi2EECPU->SetLineWidth(1.);
-          hChi2EECPU->Draw("");
-          hChi2EEGPU->SetLineColor(kBlue);
-          hChi2EEGPU->SetLineWidth(1.);
-          hChi2EEGPU->Draw("sames");
-          gPad->Update();
-          auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats");
-          auto y2 = stats->GetY2NDC();
-          auto y1 = stats->GetY1NDC();
-          stats->SetY2NDC(y1);
-          stats->SetY1NDC(y1 - (y2-y1));
-      }
-      c.cd(3);
-      hChi2EBGPUvsCPU->Draw("COLZ");
-      c.cd(4);
-      hChi2EEGPUvsCPU->Draw("COLZ");
-      c.cd(5);
-      hChi2EBdeltavsCPU->Draw("COLZ");
-      c.cd(6);
-      hChi2EEdeltavsCPU->Draw("COLZ");
+    c.SaveAs("ecal-amplitudes.pdf");
+  }
+  {
+    TCanvas c("plots", "plots", 4200, 6200);
+    c.Divide(2, 3);
 
-      c.SaveAs("ecal-chi2.pdf");
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hChi2EBCPU->SetLineColor(kBlack);
+      hChi2EBCPU->SetLineWidth(1.);
+      hChi2EBCPU->Draw("");
+      hChi2EBGPU->SetLineColor(kBlue);
+      hChi2EBGPU->SetLineWidth(1.);
+      hChi2EBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
+    c.cd(2);
+    {
+      gPad->SetLogy();
+      hChi2EECPU->SetLineColor(kBlack);
+      hChi2EECPU->SetLineWidth(1.);
+      hChi2EECPU->Draw("");
+      hChi2EEGPU->SetLineColor(kBlue);
+      hChi2EEGPU->SetLineWidth(1.);
+      hChi2EEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
+    c.cd(3);
+    hChi2EBGPUvsCPU->Draw("COLZ");
+    c.cd(4);
+    hChi2EEGPUvsCPU->Draw("COLZ");
+    c.cd(5);
+    hChi2EBdeltavsCPU->Draw("COLZ");
+    c.cd(6);
+    hChi2EEdeltavsCPU->Draw("COLZ");
+
+    c.SaveAs("ecal-chi2.pdf");
+  }
 
-    rf.Close();
-    rfout.Write();
-    rfout.Close();
+  rf.Close();
+  rfout.Write();
+  rfout.Close();
 
-    return 0;
+  return 0;
 }
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
index 83a3e2b39ed0b..d095a0f2181ef 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
@@ -16,84 +16,68 @@
 #include "inplace_fnnls.h"
 #include "KernelHelpers.h"
 
-namespace ecal { namespace multifit {
-
-///
-/// assume kernel launch configuration is 
-/// (MAXSAMPLES * nchannels, blocks)
-/// 
-__global__
-void kernel_prep_1d_and_initialize(
-                    EcalPulseShape const* shapes_in,
-                    uint16_t const* digis_in_eb,
-                    uint32_t const* dids_eb,
-                    uint16_t const* digis_in_ee,
-                    uint32_t const* dids_ee,
-                    SampleVector* amplitudes,
-                    SampleVector* amplitudesForMinimization,
-                    SampleGainVector* gainsNoise,
-                    float const* mean_x1,
-                    float const* mean_x12,
-                    float const* rms_x12,
-                    float const* mean_x6,
-                    float const* gain6Over1,
-                    float const* gain12Over6,
-                    bool* hasSwitchToGain6,
-                    bool* hasSwitchToGain1,
-                    bool* isSaturated,
-                    ::ecal::reco::StorageScalarType* energies,
-                    ::ecal::reco::StorageScalarType* chi2,
-                    ::ecal::reco::StorageScalarType* g_pedestal,
-                    uint32_t *dids_out,
-                    uint32_t *flags,
-                    char* acState,
-                    BXVectorType *bxs,
-                    uint32_t const offsetForHashes,
-                    uint32_t const offsetForInputs,
-                    bool const gainSwitchUseMaxSampleEB,
-                    bool const gainSwitchUseMaxSampleEE,
-                    int const nchannels) {
-    constexpr bool dynamicPedestal = false;  //---- default to false, ok
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-    constexpr int sample_max = 5;
-    constexpr int full_pulse_max = 9;
-    int const tx = threadIdx.x + blockIdx.x*blockDim.x;
-    int const nchannels_per_block = blockDim.x / nsamples;
-    int const total_threads = nchannels * nsamples;
-    int const ch = tx / nsamples;
-    // for accessing input arrays
-    int const inputCh = ch >= offsetForInputs
-        ? ch - offsetForInputs
-        : ch;
-    int const inputTx = ch >= offsetForInputs
-        ? tx - offsetForInputs*10
-        : tx;
-    // eb is first and then ee
-    auto const* digis_in = ch >= offsetForInputs
-        ? digis_in_ee
-        : digis_in_eb;
-    auto const* dids = ch >= offsetForInputs
-        ? dids_ee
-        : dids_eb;
-    int const sample = threadIdx.x % nsamples;
-
-    if (ch < nchannels) {
+namespace ecal {
+  namespace multifit {
+
+    ///
+    /// assume kernel launch configuration is
+    /// (MAXSAMPLES * nchannels, blocks)
+    ///
+    __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+                                                  uint16_t const* digis_in_eb,
+                                                  uint32_t const* dids_eb,
+                                                  uint16_t const* digis_in_ee,
+                                                  uint32_t const* dids_ee,
+                                                  SampleVector* amplitudes,
+                                                  SampleVector* amplitudesForMinimization,
+                                                  SampleGainVector* gainsNoise,
+                                                  float const* mean_x1,
+                                                  float const* mean_x12,
+                                                  float const* rms_x12,
+                                                  float const* mean_x6,
+                                                  float const* gain6Over1,
+                                                  float const* gain12Over6,
+                                                  bool* hasSwitchToGain6,
+                                                  bool* hasSwitchToGain1,
+                                                  bool* isSaturated,
+                                                  ::ecal::reco::StorageScalarType* energies,
+                                                  ::ecal::reco::StorageScalarType* chi2,
+                                                  ::ecal::reco::StorageScalarType* g_pedestal,
+                                                  uint32_t* dids_out,
+                                                  uint32_t* flags,
+                                                  char* acState,
+                                                  BXVectorType* bxs,
+                                                  uint32_t const offsetForHashes,
+                                                  uint32_t const offsetForInputs,
+                                                  bool const gainSwitchUseMaxSampleEB,
+                                                  bool const gainSwitchUseMaxSampleEE,
+                                                  int const nchannels) {
+      constexpr bool dynamicPedestal = false;  //---- default to false, ok
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+      constexpr int sample_max = 5;
+      constexpr int full_pulse_max = 9;
+      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+      int const nchannels_per_block = blockDim.x / nsamples;
+      int const ch = tx / nsamples;
+      // for accessing input arrays
+      int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx;
+      // eb is first and then ee
+      auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb;
+      auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      int const sample = threadIdx.x % nsamples;
+
+      if (ch < nchannels) {
         // array of 10 x channels per block
         // TODO: any other way of doing simple reduction
         // assume bool is 1 byte, should be quite safe
         extern __shared__ char shared_mem[];
-        bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(
-            shared_mem);
-        bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + 
-            nchannels_per_block*nsamples;
-        bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + 
-            nchannels_per_block*nsamples;
-        bool* shr_isSaturated = shr_hasSwitchToGain0 + 
-            nchannels_per_block*nsamples;
-        bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + 
-            nchannels_per_block*nsamples;
-        char* shr_counts = reinterpret_cast<char*>(
-            shr_hasSwitchToGain0_tmp) + nchannels_per_block*nsamples;
+        bool* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
+        bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples;
+        bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples;
+        bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples;
+        bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples;
+        char* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples;
 
         //
         // indices
@@ -101,10 +85,7 @@ void kernel_prep_1d_and_initialize(
         auto const did = DetId{dids[inputCh]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
         // TODO offset for ee, 0 for eb
-        auto const hashedId = isBarrel
-            ? hashedIndexEB(did.rawId())
-            : offsetForHashes + hashedIndexEE(did.rawId());
-
+        auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
 
         //
         // pulse shape template
@@ -113,7 +94,7 @@ void kernel_prep_1d_and_initialize(
             isample+=nsamples)
             shapes_out[ch](isample + 7) = shapes_in[hashedId].pdfval[isample];
             */
-        
+
         // will be used in the future for setting state
         auto const rmsForChecking = rms_x12[hashedId];
 
@@ -133,13 +114,12 @@ void kernel_prep_1d_and_initialize(
         shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x];
         shr_counts[threadIdx.x] = 0;
         __syncthreads();
-        
+
         // non-divergent branch (except for the last 4 threads)
-        if (threadIdx.x<=blockDim.x-5) {
-            #pragma unroll
-            for (int i=0; i<5; i++)
-                shr_counts[threadIdx.x] += 
-                    shr_hasSwitchToGain0[threadIdx.x+i];
+        if (threadIdx.x <= blockDim.x - 5) {
+#pragma unroll
+          for (int i = 0; i < 5; i++)
+            shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i];
         }
         shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5;
 
@@ -148,102 +128,89 @@ void kernel_prep_1d_and_initialize(
         // TODO
         //
         if (sample < 5) {
-            shr_hasSwitchToGain6[threadIdx.x] = 
-                shr_hasSwitchToGain6[threadIdx.x] ||
-                shr_hasSwitchToGain6[threadIdx.x + 5];
-            shr_hasSwitchToGain1[threadIdx.x] =
-                shr_hasSwitchToGain1[threadIdx.x] ||
-                shr_hasSwitchToGain1[threadIdx.x + 5];
-            
-            // duplication of hasSwitchToGain0 in order not to
-            // introduce another syncthreads
-            shr_hasSwitchToGain0_tmp[threadIdx.x] = 
-                shr_hasSwitchToGain0_tmp[threadIdx.x] || 
-                shr_hasSwitchToGain0_tmp[threadIdx.x+5];
+          shr_hasSwitchToGain6[threadIdx.x] =
+              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5];
+          shr_hasSwitchToGain1[threadIdx.x] =
+              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5];
+
+          // duplication of hasSwitchToGain0 in order not to
+          // introduce another syncthreads
+          shr_hasSwitchToGain0_tmp[threadIdx.x] =
+              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5];
         }
         __syncthreads();
-        
-        if (sample<2) {
-            // note, both threads per channel take value [3] twice to avoid another if
-            shr_hasSwitchToGain6[threadIdx.x] = 
-                shr_hasSwitchToGain6[threadIdx.x] ||
-                shr_hasSwitchToGain6[threadIdx.x+2] || 
-                shr_hasSwitchToGain6[threadIdx.x+3];
-            shr_hasSwitchToGain1[threadIdx.x] =
-                shr_hasSwitchToGain1[threadIdx.x] ||
-                shr_hasSwitchToGain1[threadIdx.x+2] || 
-                shr_hasSwitchToGain1[threadIdx.x+3];
-
-            shr_hasSwitchToGain0_tmp[threadIdx.x] = 
-                shr_hasSwitchToGain0_tmp[threadIdx.x] ||
-                shr_hasSwitchToGain0_tmp[threadIdx.x+2] || 
-                shr_hasSwitchToGain0_tmp[threadIdx.x+3];
-
-            // sample < 2 -> first 2 threads of each channel will be used here
-            // => 0 -> will compare 3 and 4 and put into 0
-            // => 1 -> will compare 4 and 5 and put into 1
-            shr_isSaturated[threadIdx.x] = 
-                shr_isSaturated[threadIdx.x+3] || shr_isSaturated[threadIdx.x+4];
+
+        if (sample < 2) {
+          // note, both threads per channel take value [3] twice to avoid another if
+          shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] ||
+                                              shr_hasSwitchToGain6[threadIdx.x + 2] ||
+                                              shr_hasSwitchToGain6[threadIdx.x + 3];
+          shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] ||
+                                              shr_hasSwitchToGain1[threadIdx.x + 2] ||
+                                              shr_hasSwitchToGain1[threadIdx.x + 3];
+
+          shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] ||
+                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 2] ||
+                                                  shr_hasSwitchToGain0_tmp[threadIdx.x + 3];
+
+          // sample < 2 -> first 2 threads of each channel will be used here
+          // => 0 -> will compare 3 and 4 and put into 0
+          // => 1 -> will compare 4 and 5 and put into 1
+          shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4];
         }
         __syncthreads();
 
         bool check_hasSwitchToGain0 = false;
 
-        if (sample==0) {
-            shr_hasSwitchToGain6[threadIdx.x] = 
-                shr_hasSwitchToGain6[threadIdx.x] || 
-                shr_hasSwitchToGain6[threadIdx.x+1];
-            shr_hasSwitchToGain1[threadIdx.x] = 
-                shr_hasSwitchToGain1[threadIdx.x] ||
-                shr_hasSwitchToGain1[threadIdx.x+1];
-            shr_hasSwitchToGain0_tmp[threadIdx.x] =
-                shr_hasSwitchToGain0_tmp[threadIdx.x] ||
-                shr_hasSwitchToGain0_tmp[threadIdx.x+1];
-
-            hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
-            hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
-
-            // set only for the threadIdx.x corresponding to sample==0
-            check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
-
-            shr_isSaturated[threadIdx.x+3] = 
-                shr_isSaturated[threadIdx.x] || 
-                shr_isSaturated[threadIdx.x+1];
-            isSaturated[ch] = shr_isSaturated[threadIdx.x+3];
+        if (sample == 0) {
+          shr_hasSwitchToGain6[threadIdx.x] =
+              shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1];
+          shr_hasSwitchToGain1[threadIdx.x] =
+              shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1];
+          shr_hasSwitchToGain0_tmp[threadIdx.x] =
+              shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1];
+
+          hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
+          hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
+
+          // set only for the threadIdx.x corresponding to sample==0
+          check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
+
+          shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1];
+          isSaturated[ch] = shr_isSaturated[threadIdx.x + 3];
         }
 
         // TODO: w/o this sync, there is a race
         // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample
         // check if we can remove it
         __syncthreads();
-        
+
         // TODO: divergent branch
-        if (gainId==0 || gainId==3) {
-            pedestal = mean_x1[hashedId];
-            gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
-            gainsNoise[ch](sample) = 2;
-        } else if (gainId==1) {
-            pedestal = mean_x12[hashedId];
-            gainratio = 1.;
-            gainsNoise[ch](sample) = 0;
-        } else if (gainId==2) {
-            pedestal = mean_x6[hashedId];
-            gainratio = gain12Over6[hashedId];
-            gainsNoise[ch](sample)  = 1;
+        if (gainId == 0 || gainId == 3) {
+          pedestal = mean_x1[hashedId];
+          gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
+          gainsNoise[ch](sample) = 2;
+        } else if (gainId == 1) {
+          pedestal = mean_x12[hashedId];
+          gainratio = 1.;
+          gainsNoise[ch](sample) = 0;
+        } else if (gainId == 2) {
+          pedestal = mean_x6[hashedId];
+          gainratio = gain12Over6[hashedId];
+          gainsNoise[ch](sample) = 1;
         }
-        
+
         // TODO: compile time constant -> branch should be non-divergent
         if (dynamicPedestal)
-            amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
+          amplitude = static_cast<SampleVector::Scalar>(adc) * gainratio;
         else
-            amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
+          amplitude = (static_cast<SampleVector::Scalar>(adc) - pedestal) * gainratio;
         amplitudes[ch][sample] = amplitude;
 
 #ifdef ECAL_RECO_CUDA_DEBUG
-        printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude,
-            pedestal, gainratio);
-        if (adc==0)
-            printf("adc is zero\n");
+        printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio);
+        if (adc == 0)
+          printf("adc is zero\n");
 #endif
 
         //
@@ -252,325 +219,287 @@ void kernel_prep_1d_and_initialize(
         amplitudesForMinimization[ch](sample) = 0;
         bxs[ch](sample) = sample - 5;
 
-        // select the thread for the max sample 
+        // select the thread for the max sample
         //---> hardcoded above to be 5th sample, ok
         if (sample == sample_max) {
-            //
-            // initialization
-            //
-            acState[ch] = static_cast<char>(MinimizationState::NotFinished);
-            energies[ch] = 0;
-            chi2[ch] = 0;
-            g_pedestal[ch] = 0;
-            uint32_t flag = 0;
-            dids_out[ch] = did.rawId();
-
-            // start of this channel in shared mem
-            int const chStart = threadIdx.x - sample_max;
-            // thread for the max sample in shared mem
-            int const threadMax = threadIdx.x;
-            auto const gainSwitchUseMaxSample = isBarrel
-                ? gainSwitchUseMaxSampleEB
-                : gainSwitchUseMaxSampleEE;
-            
-            // this flag setting is applied to all of the cases
-            if (shr_hasSwitchToGain6[chStart])
-                flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
-            if (shr_hasSwitchToGain1[chStart])
-                flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
-
-            // this corresponds to cpu branching on lastSampleBeforeSaturation
-            // likely false
-            if (check_hasSwitchToGain0) {
-                // assign for the case some sample having gainId == 0
-                //energies[ch] = amplitudes[ch][sample_max];
-                energies[ch] = amplitude;
-
-                // check if samples before sample_max have true
-                bool saturated_before_max = false;
-                #pragma unroll
-                for (char ii=0; ii<5; ii++)
-                    saturated_before_max = saturated_before_max ||
-                        shr_hasSwitchToGain0[chStart + ii];
-
-                // if saturation is in the max sample and not in the first 5
-                if (!saturated_before_max && 
-                    shr_hasSwitchToGain0[threadMax])
-                    energies[ch] = 49140; // 4095 * 12
-                    //---- AM FIXME : no pedestal subtraction???  
-                    //It should be "(4095. - pedestal) * gainratio"
-
-                // set state flag to terminate further processing of this channel
-                acState[ch] = static_cast<char>(MinimizationState::Precomputed); 
-                flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
-                flags[ch] = flag;
-                return;
-            }
-
-            // according to cpu version
-//            auto max_amplitude = amplitudes[ch][sample_max]; 
-            auto const max_amplitude = amplitude;
-            // according to cpu version
-            auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max-7]; 
-            // note, no syncing as the same thread will be accessing here
-            bool hasGainSwitch = shr_hasSwitchToGain6[chStart]
-                || shr_hasSwitchToGain1[chStart]
-                || shr_isSaturated[chStart+3];
-
-            // pedestal is final unconditionally
-            g_pedestal[ch] = pedestal;
-            if (hasGainSwitch && gainSwitchUseMaxSample) {
-                // thread for sample=0 will access the right guys
-                energies[ch] = max_amplitude / shape_value;
-                acState[ch] = static_cast<char>(MinimizationState::Precomputed);
-                flags[ch] = flag;
-                return;
-            }
-            
-            // this happens cause sometimes rms_x12 is 0...
-            // needs to be checkec why this is the case
-            // general case here is that noisecov is a Zero matrix
-            if (rmsForChecking == 0) {
-                acState[ch] = static_cast<char>(MinimizationState::Precomputed);
-                flags[ch] = flag;
-                return;
-            }
-
-            // for the case when no shortcuts were taken
+          //
+          // initialization
+          //
+          acState[ch] = static_cast<char>(MinimizationState::NotFinished);
+          energies[ch] = 0;
+          chi2[ch] = 0;
+          g_pedestal[ch] = 0;
+          uint32_t flag = 0;
+          dids_out[ch] = did.rawId();
+
+          // start of this channel in shared mem
+          int const chStart = threadIdx.x - sample_max;
+          // thread for the max sample in shared mem
+          int const threadMax = threadIdx.x;
+          auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
+
+          // this flag setting is applied to all of the cases
+          if (shr_hasSwitchToGain6[chStart])
+            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
+          if (shr_hasSwitchToGain1[chStart])
+            flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
+
+          // this corresponds to cpu branching on lastSampleBeforeSaturation
+          // likely false
+          if (check_hasSwitchToGain0) {
+            // assign for the case some sample having gainId == 0
+            //energies[ch] = amplitudes[ch][sample_max];
+            energies[ch] = amplitude;
+
+            // check if samples before sample_max have true
+            bool saturated_before_max = false;
+#pragma unroll
+            for (char ii = 0; ii < 5; ii++)
+              saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
+
+            // if saturation is in the max sample and not in the first 5
+            if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
+              energies[ch] = 49140;  // 4095 * 12
+                                     //---- AM FIXME : no pedestal subtraction???
+                                     //It should be "(4095. - pedestal) * gainratio"
+
+            // set state flag to terminate further processing of this channel
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
             flags[ch] = flag;
+            return;
+          }
+
+          // according to cpu version
+          //            auto max_amplitude = amplitudes[ch][sample_max];
+          auto const max_amplitude = amplitude;
+          // according to cpu version
+          auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7];
+          // note, no syncing as the same thread will be accessing here
+          bool hasGainSwitch =
+              shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
+
+          // pedestal is final unconditionally
+          g_pedestal[ch] = pedestal;
+          if (hasGainSwitch && gainSwitchUseMaxSample) {
+            // thread for sample=0 will access the right guys
+            energies[ch] = max_amplitude / shape_value;
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flags[ch] = flag;
+            return;
+          }
+
+          // this happens cause sometimes rms_x12 is 0...
+          // needs to be checked why this is the case
+          // general case here is that noisecov is a Zero matrix
+          if (rmsForChecking == 0) {
+            acState[ch] = static_cast<char>(MinimizationState::Precomputed);
+            flags[ch] = flag;
+            return;
+          }
+
+          // for the case when no shortcuts were taken
+          flags[ch] = flag;
         }
+      }
     }
-}
 
-///
-/// assume kernel launch configuration is 
-/// ([MAXSAMPLES, MAXSAMPLES], nchannels)
-///
-__global__
-void kernel_prep_2d(SampleGainVector const* gainNoise,
-                    uint32_t const* dids_eb,
-                    uint32_t const* dids_ee,
-                    float const* rms_x12,
-                    float const* rms_x6,
-                    float const* rms_x1,
-                    float const* gain12Over6,
-                    float const* gain6Over1,
-                    double const* G12SamplesCorrelationEB,
-                    double const* G6SamplesCorrelationEB,
-                    double const* G1SamplesCorrelationEB,
-                    double const* G12SamplesCorrelationEE,
-                    double const* G6SamplesCorrelationEE,
-                    double const* G1SamplesCorrelationEE,
-                    SampleMatrix* noisecov,
-                    PulseMatrixType* pulse_matrix,
-                    EcalPulseShape const* pulse_shape,
-                    bool const* hasSwitchToGain6,
-                    bool const* hasSwitchToGain1,
-                    bool const* isSaturated,
-                    uint32_t const offsetForHashes,
-                    uint32_t const offsetForInputs) {
-    int const ch = blockIdx.x;
-    int const tx = threadIdx.x;
-    int const ty = threadIdx.y;
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-    constexpr float addPedestalUncertainty = 0.f;
-    constexpr bool dynamicPedestal = false;
-    constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
-    constexpr int template_samples = EcalPulseShape::TEMPLATESAMPLES;
-
-    // to access input arrays (ids and digis only)
-    int const inputCh = ch >= offsetForInputs
-        ? ch - offsetForInputs
-        : ch;
-    auto const* dids = ch >= offsetForInputs
-        ? dids_ee
-        : dids_eb;
-
-    bool tmp0 = hasSwitchToGain6[ch];
-    bool tmp1 = hasSwitchToGain1[ch];
-    auto const did = DetId{dids[inputCh]};
-    auto const isBarrel = did.subdetId() == EcalBarrel;
-    auto const hashedId = isBarrel
-        ? hashedIndexEB(did.rawId())
-        : offsetForHashes + hashedIndexEE(did.rawId());
-    auto const G12SamplesCorrelation = isBarrel
-        ? G12SamplesCorrelationEB
-        : G12SamplesCorrelationEE;
-    auto const* G6SamplesCorrelation = isBarrel
-        ? G6SamplesCorrelationEB
-        : G6SamplesCorrelationEE;
-    auto const* G1SamplesCorrelation = isBarrel
-        ? G1SamplesCorrelationEB
-        : G1SamplesCorrelationEE;
-    bool tmp2 = isSaturated[ch];
-    bool hasGainSwitch = tmp0 || tmp1 || tmp2;
-    auto const vidx = ecal::abs(ty - tx);
-
-    // non-divergent branch for all threads per block
-    if (hasGainSwitch) {
+    ///
+    /// assume kernel launch configuration is
+    /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
+    ///
+    __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
+                                   uint32_t const* dids_eb,
+                                   uint32_t const* dids_ee,
+                                   float const* rms_x12,
+                                   float const* rms_x6,
+                                   float const* rms_x1,
+                                   float const* gain12Over6,
+                                   float const* gain6Over1,
+                                   double const* G12SamplesCorrelationEB,
+                                   double const* G6SamplesCorrelationEB,
+                                   double const* G1SamplesCorrelationEB,
+                                   double const* G12SamplesCorrelationEE,
+                                   double const* G6SamplesCorrelationEE,
+                                   double const* G1SamplesCorrelationEE,
+                                   SampleMatrix* noisecov,
+                                   PulseMatrixType* pulse_matrix,
+                                   EcalPulseShape const* pulse_shape,
+                                   bool const* hasSwitchToGain6,
+                                   bool const* hasSwitchToGain1,
+                                   bool const* isSaturated,
+                                   uint32_t const offsetForHashes,
+                                   uint32_t const offsetForInputs) {
+      int const ch = blockIdx.x;
+      int const tx = threadIdx.x;
+      int const ty = threadIdx.y;
+      constexpr float addPedestalUncertainty = 0.f;
+      constexpr bool dynamicPedestal = false;
+      constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
+
+      // to access input arrays (ids and digis only)
+      int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+      bool tmp0 = hasSwitchToGain6[ch];
+      bool tmp1 = hasSwitchToGain1[ch];
+      auto const did = DetId{dids[inputCh]};
+      auto const isBarrel = did.subdetId() == EcalBarrel;
+      auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+      auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
+      auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
+      auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
+      bool tmp2 = isSaturated[ch];
+      bool hasGainSwitch = tmp0 || tmp1 || tmp2;
+      auto const vidx = ecal::abs(ty - tx);
+
+      // non-divergent branch for all threads per block
+      if (hasGainSwitch) {
         // TODO: did not include simplified noise model
         float noise_value = 0;
 
         // non-divergent branch - all threads per block
-        // TODO: all of these constants indicate that 
-        // that these parts could be splitted into completely different 
+        // TODO: all of these constants indicate
+        // that these parts could be split into completely different
         // kernels and run one of them only depending on the config
         if (simplifiedNoiseModelForGainSwitch) {
-            int isample_max = 5; // according to cpu defs
-            int gainidx = gainNoise[ch][isample_max];
-
-            // non-divergent branches
-            if (gainidx==0)
-                //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx);
-                noise_value = rms_x12[hashedId]*rms_x12[hashedId]
-                    * G12SamplesCorrelation[vidx];
-            if (gainidx==1) 
-//                noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch]
-//                    *noisecorrs[1](ty, tx);
-                noise_value = gain12Over6[hashedId]*gain12Over6[hashedId] 
-                    * rms_x6[hashedId]*rms_x6[hashedId]
-                    * G6SamplesCorrelation[vidx];
-            if (gainidx==2)
-//                noise_value = gain12Over6[ch]*gain12Over6[ch]
-//                    * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch]
-//                    * noisecorrs[2](ty, tx);
-                noise_value = gain12Over6[hashedId]*gain12Over6[hashedId]
-                    * gain6Over1[hashedId]*gain6Over1[hashedId] 
-                    * rms_x1[hashedId]*rms_x1[hashedId]
-                    * G1SamplesCorrelation[vidx];
-            if (!dynamicPedestal && addPedestalUncertainty>0.f)
-                noise_value += addPedestalUncertainty*addPedestalUncertainty;
+          int isample_max = 5;  // according to cpu defs
+          int gainidx = gainNoise[ch][isample_max];
+
+          // non-divergent branches
+          if (gainidx == 0)
+            //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx);
+            noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx];
+          if (gainidx == 1)
+            //                noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch]
+            //                    *noisecorrs[1](ty, tx);
+            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+                          G6SamplesCorrelation[vidx];
+          if (gainidx == 2)
+            //                noise_value = gain12Over6[ch]*gain12Over6[ch]
+            //                    * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch]
+            //                    * noisecorrs[2](ty, tx);
+            noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] *
+                          rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx];
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f)
+            noise_value += addPedestalUncertainty * addPedestalUncertainty;
         } else {
-            int gainidx=0;
-            char mask = gainidx;
-            int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
-//            noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch]
-//                *pedestal*noisecorrs[0](ty, tx);
-            noise_value += /* gainratio is 1*/ rms_x12[hashedId]*rms_x12[hashedId]
-                * pedestal* G12SamplesCorrelation[vidx];
-            // non-divergent branch
-            if (!dynamicPedestal && addPedestalUncertainty>0.f) {
-                noise_value += /* gainratio is 1 */
-                    addPedestalUncertainty*addPedestalUncertainty*pedestal;
-            }
-
-            //
-            gainidx=1;
-            mask = gainidx;
-            pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
-//            noise_value += gain12Over6[ch]*gain12Over6[ch]
-//                *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx);
-            noise_value += gain12Over6[hashedId]*gain12Over6[hashedId]
-                *rms_x6[hashedId]*rms_x6[hashedId]*pedestal
-                * G6SamplesCorrelation[vidx];
-            // non-divergent branch
-            if (!dynamicPedestal && addPedestalUncertainty>0.f) {
-                noise_value += gain12Over6[hashedId]*gain12Over6[hashedId]
-                    *addPedestalUncertainty*addPedestalUncertainty
-                    *pedestal;
-            }
-            
-            //
-            gainidx=2;
-            mask = gainidx;
-            pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
-            float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
-//            noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch]
-//                *pedestal*noisecorrs[2](ty, tx);
-            noise_value += tmp*tmp * rms_x1[hashedId]*rms_x1[hashedId]
-                *pedestal* G1SamplesCorrelation[vidx];
-            // non-divergent branch
-            if (!dynamicPedestal && addPedestalUncertainty>0.f) {
-                noise_value += tmp*tmp * addPedestalUncertainty*addPedestalUncertainty
-                    * pedestal;
-            }
+          int gainidx = 0;
+          char mask = gainidx;
+          int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          //            noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch]
+          //                *pedestal*noisecorrs[0](ty, tx);
+          noise_value +=
+              /* gainratio is 1*/ rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += /* gainratio is 1 */
+                addPedestalUncertainty * addPedestalUncertainty * pedestal;
+          }
+
+          //
+          gainidx = 1;
+          mask = gainidx;
+          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          //            noise_value += gain12Over6[ch]*gain12Over6[ch]
+          //                *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx);
+          noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+                         pedestal * G6SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty *
+                           addPedestalUncertainty * pedestal;
+          }
+
+          //
+          gainidx = 2;
+          mask = gainidx;
+          pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+          float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
+          //            noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch]
+          //                *pedestal*noisecorrs[2](ty, tx);
+          noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx];
+          // non-divergent branch
+          if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+            noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal;
+          }
         }
 
         noisecov[ch](ty, tx) = noise_value;
-    } else {
+      } else {
         auto rms = rms_x12[hashedId];
-        float noise_value = rms*rms * G12SamplesCorrelation[vidx];
-        if (!dynamicPedestal && addPedestalUncertainty>0.f) {
-            //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
-            noise_value += addPedestalUncertainty*addPedestalUncertainty;
+        float noise_value = rms * rms * G12SamplesCorrelation[vidx];
+        if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+          //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
+          noise_value += addPedestalUncertainty * addPedestalUncertainty;
         }
         noisecov[ch](ty, tx) = noise_value;
+      }
+
+      // pulse matrix
+      //    int const bx = tx - 5; // -5 -4 -3 ... 3 4
+      //    int bx = (*bxs)(tx);
+      //    int const offset = 7 - 3 - bx;
+      int const posToAccess = 9 - tx + ty;  // see cpu for reference
+      float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0;
+      pulse_matrix[ch](ty, tx) = value;
     }
 
-    // pulse matrix
-//    int const bx = tx - 5; // -5 -4 -3 ... 3 4
-//    int bx = (*bxs)(tx);
-//    int const offset = 7 - 3 - bx;
-    int const posToAccess = 9 - tx + ty; // see cpu for reference
-    float const value = posToAccess>=7 
-        ? pulse_shape[hashedId].pdfval[posToAccess-7]
-        : 0;
-    pulse_matrix[ch](ty, tx) = value;
-}
-
-__global__
-void kernel_permute_results(
-        SampleVector *amplitudes,
-        BXVectorType const*activeBXs,
-        ::ecal::reco::StorageScalarType *energies,
-        char const* acState,
-        int const nchannels) {
-    // constants
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int const tx = threadIdx.x + blockIdx.x * blockDim.x;
-    int const ch = tx / nsamples;
-    int const iii = tx % nsamples; // this is to address activeBXs
-
-    if (ch >= nchannels) return;
-    
-    // channels that have amplitude precomputed do not need results to be permuted
-    auto const state = static_cast<MinimizationState>(acState[ch]);
-    if (static_cast<MinimizationState>(acState[ch]) ==
-        MinimizationState::Precomputed)
+    __global__ void kernel_permute_results(SampleVector* amplitudes,
+                                           BXVectorType const* activeBXs,
+                                           ::ecal::reco::StorageScalarType* energies,
+                                           char const* acState,
+                                           int const nchannels) {
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+      int const ch = tx / nsamples;
+      int const iii = tx % nsamples;  // this is to address activeBXs
+
+      if (ch >= nchannels)
         return;
 
-    // configure shared memory and cp into it
-    extern __shared__ char smem[];
-    SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(
-        smem);
-    values[threadIdx.x] = amplitudes[ch](iii);
-    __syncthreads();
+      // channels that have amplitude precomputed do not need results to be permuted
+      auto const state = static_cast<MinimizationState>(acState[ch]);
+      if (state == MinimizationState::Precomputed)
+        return;
 
-    // get the sample for this bx
-    auto const sample = static_cast<int>(activeBXs[ch](iii)) + 5;
+      // set up shared memory and copy into it; NOTE(review): threads that returned above never reach the barrier below
+      extern __shared__ char smem[];
+      SampleVector::Scalar* values = reinterpret_cast<SampleVector::Scalar*>(smem);
+      values[threadIdx.x] = amplitudes[ch](iii);
+      __syncthreads();
 
-    // store back to global
-    amplitudes[ch](sample) = values[threadIdx.x];
+      // get the sample for this bx
+      auto const sample = static_cast<int>(activeBXs[ch](iii)) + 5;
 
-    // store sample 5 separately
-    // only for the case when minimization was performed
-    // not for cases with precomputed amplitudes
-    if (sample == 5)
+      // store back to global
+      amplitudes[ch](sample) = values[threadIdx.x];
+
+      // store sample 5 separately
+      // only for the case when minimization was performed
+      // not for cases with precomputed amplitudes
+      if (sample == 5)
         energies[ch] = values[threadIdx.x];
-}
+    }
 
 ///
 /// Build an Ecal RecHit.
 /// TODO: Use SoA data structures on the host directly
-/// the reason for removing this from minimize kernel is to isolate the minimize + 
+/// the reason for removing this from the minimize kernel is to isolate the minimization; moreover,
 /// again, building an aos rec hit involves strides... -> bad memory access pattern
 ///
 #ifdef RUN_BUILD_AOS_RECHIT
-__global__
-void kernel_build_rechit(
-    float const* energies,
-    float const* chi2s,
-    uint32_t* dids,
-    EcalUncalibratedRecHit* rechits,
-    int nchannels) {
-    int idx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (idx < nchannels) {
-        rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx],
-            0, 0, chi2s[idx], 0};
+    __global__ void kernel_build_rechit(
+        float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels) {
+      int idx = threadIdx.x + blockDim.x * blockIdx.x;
+      if (idx < nchannels) {
+        rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], 0, 0, chi2s[idx], 0};
+      }
     }
-}
 #endif
 
-}}
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
index fb6b396089151..ddcfa254e43e1 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
@@ -16,179 +16,163 @@
 #include "AmplitudeComputationKernels.h"
 #include "AmplitudeComputationCommonKernels.h"
 
-namespace ecal { namespace multifit {
-
-void eigen_solve_submatrix(SampleMatrix& mat, 
-                           SampleVector& invec, 
-                           SampleVector& outvec, unsigned NP) {
-    using namespace Eigen;
-    switch( NP ) { // pulse matrix is always square.
-    case 10: {   
-        Matrix<SampleMatrix::Scalar,10,10> temp = mat.topLeftCorner<10,10>();
-        outvec.head<10>() = temp.ldlt().solve(invec.head<10>());
-        break;
-    }   
-    case 9: {
-        Matrix<SampleMatrix::Scalar,9,9> temp = mat.topLeftCorner<9,9>();
-        outvec.head<9>() = temp.ldlt().solve(invec.head<9>());
-        break;
-    }   
-    case 8: {   
-        Matrix<SampleMatrix::Scalar,8,8> temp = mat.topLeftCorner<8,8>();
-        outvec.head<8>() = temp.ldlt().solve(invec.head<8>());
-        break;
-    }   
-    case 7: {   
-        Matrix<SampleMatrix::Scalar,7,7> temp = mat.topLeftCorner<7,7>();
-        outvec.head<7>() = temp.ldlt().solve(invec.head<7>());
-        break;
-    }   
-    case 6: {   
-        Matrix<SampleMatrix::Scalar,6,6> temp = mat.topLeftCorner<6,6>();
-        outvec.head<6>() = temp.ldlt().solve(invec.head<6>());
-        break;
-    }   
-    case 5: {   
-        Matrix<SampleMatrix::Scalar,5,5> temp = mat.topLeftCorner<5,5>();
-        outvec.head<5>() = temp.ldlt().solve(invec.head<5>());
-        break;
-    }   
-    case 4: {   
-        Matrix<SampleMatrix::Scalar,4,4> temp = mat.topLeftCorner<4,4>();
-        outvec.head<4>() = temp.ldlt().solve(invec.head<4>());
-        break;
-    }   
-    case 3: {   
-        Matrix<SampleMatrix::Scalar,3,3> temp = mat.topLeftCorner<3,3>();
-        outvec.head<3>() = temp.ldlt().solve(invec.head<3>());
-        break;
-    }   
-    case 2: {   
-        Matrix<SampleMatrix::Scalar,2,2> temp = mat.topLeftCorner<2,2>();
-        outvec.head<2>() = temp.ldlt().solve(invec.head<2>());
-        break;
-    }   
-    case 1: {   
-        Matrix<SampleMatrix::Scalar,1,1> temp = mat.topLeftCorner<1,1>();
-        outvec.head<1>() = temp.ldlt().solve(invec.head<1>());
-        break;
-    }    
-    default:
-        return;
+namespace ecal {
+  namespace multifit {
+
+    void eigen_solve_submatrix(SampleMatrix& mat, SampleVector& invec, SampleVector& outvec, unsigned NP) {
+      using namespace Eigen;
+      switch (NP) {  // pulse matrix is always square.
+        case 10: {
+          Matrix<SampleMatrix::Scalar, 10, 10> temp = mat.topLeftCorner<10, 10>();
+          outvec.head<10>() = temp.ldlt().solve(invec.head<10>());
+          break;
+        }
+        case 9: {
+          Matrix<SampleMatrix::Scalar, 9, 9> temp = mat.topLeftCorner<9, 9>();
+          outvec.head<9>() = temp.ldlt().solve(invec.head<9>());
+          break;
+        }
+        case 8: {
+          Matrix<SampleMatrix::Scalar, 8, 8> temp = mat.topLeftCorner<8, 8>();
+          outvec.head<8>() = temp.ldlt().solve(invec.head<8>());
+          break;
+        }
+        case 7: {
+          Matrix<SampleMatrix::Scalar, 7, 7> temp = mat.topLeftCorner<7, 7>();
+          outvec.head<7>() = temp.ldlt().solve(invec.head<7>());
+          break;
+        }
+        case 6: {
+          Matrix<SampleMatrix::Scalar, 6, 6> temp = mat.topLeftCorner<6, 6>();
+          outvec.head<6>() = temp.ldlt().solve(invec.head<6>());
+          break;
+        }
+        case 5: {
+          Matrix<SampleMatrix::Scalar, 5, 5> temp = mat.topLeftCorner<5, 5>();
+          outvec.head<5>() = temp.ldlt().solve(invec.head<5>());
+          break;
+        }
+        case 4: {
+          Matrix<SampleMatrix::Scalar, 4, 4> temp = mat.topLeftCorner<4, 4>();
+          outvec.head<4>() = temp.ldlt().solve(invec.head<4>());
+          break;
+        }
+        case 3: {
+          Matrix<SampleMatrix::Scalar, 3, 3> temp = mat.topLeftCorner<3, 3>();
+          outvec.head<3>() = temp.ldlt().solve(invec.head<3>());
+          break;
+        }
+        case 2: {
+          Matrix<SampleMatrix::Scalar, 2, 2> temp = mat.topLeftCorner<2, 2>();
+          outvec.head<2>() = temp.ldlt().solve(invec.head<2>());
+          break;
+        }
+        case 1: {
+          Matrix<SampleMatrix::Scalar, 1, 1> temp = mat.topLeftCorner<1, 1>();
+          outvec.head<1>() = temp.ldlt().solve(invec.head<1>());
+          break;
+        }
+        default:
+          return;
+      }
     }
-}
-
-template<typename MatrixType>
-__device__ __forceinline__
-bool update_covariance(
-        EcalPulseCovariance const& pulse_covariance,
-        MatrixType& inverse_cov,
-        SampleVector const& amplitudes) {
-    constexpr int nsamples = SampleVector::RowsAtCompileTime;
-    constexpr int npulses = BXVectorType::RowsAtCompileTime;
-
-    #pragma unroll
-    for (unsigned int ipulse=0; ipulse<npulses; ipulse++) {
+
+    template <typename MatrixType>
+    __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance,
+                                                      MatrixType& inverse_cov,
+                                                      SampleVector const& amplitudes) {
+      constexpr int nsamples = SampleVector::RowsAtCompileTime;
+      constexpr int npulses = BXVectorType::RowsAtCompileTime;
+
+#pragma unroll
+      for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {
         auto const amplitude = amplitudes.coeff(ipulse);
-        if (amplitude == 0) 
-            continue;
+        if (amplitude == 0)
+          continue;
 
         // FIXME: ipulse - 5 -> ipulse - firstOffset
         int bx = ipulse - 5;
-        int first_sample_t = std::max(0, bx+3);
+        int first_sample_t = std::max(0, bx + 3);
         int offset = -3 - bx;
 
         auto const value_sq = amplitude * amplitude;
 
-        unsigned int nsample_pulse = nsamples - first_sample_t;
-
-        for (int col=first_sample_t; col<nsamples; col++) {
-            for (int row=col; row<nsamples; row++) {
-                inverse_cov(row, col) += value_sq * 
-                    __ldg(&pulse_covariance.covval[row + offset][col + offset]);
-            }
+        for (int col = first_sample_t; col < nsamples; col++) {
+          for (int row = col; row < nsamples; row++) {
+            inverse_cov(row, col) += value_sq * __ldg(&pulse_covariance.covval[row + offset][col + offset]);
+          }
         }
+      }
+
+      return true;
     }
 
-    return true;
-}
-
-///
-/// launch ctx parameters are (nchannels / block, blocks)
-/// TODO: trivial impl for now, there must be a way to improve
-///
-/// Conventions:
-///   - amplitudes -> solution vector, what we are fitting for
-///   - samples -> raw detector responses
-///   - passive constraint - satisfied constraint
-///   - active constraint - unsatisfied (yet) constraint
-///
-__global__
-void kernel_minimize(
-        uint32_t const* dids_eb,
-        uint32_t const* dids_ee,
-        SampleMatrix const* __restrict__ noisecov,
-        EcalPulseCovariance const* __restrict__ pulse_covariance,
-        BXVectorType *bxs,
-        SampleVector const* __restrict__ samples,
-        SampleVector* amplitudes,
-        PulseMatrixType const* __restrict__ pulse_matrix, 
-        ::ecal::reco::StorageScalarType* chi2s,
-        ::ecal::reco::StorageScalarType* energies,
-        char *acState,
-        int nchannels,
-        int max_iterations,
-        uint32_t const offsetForHashes,
-        uint32_t const offsetForInputs) {
-    // FIXME: ecal has 10 samples and 10 pulses....
-    // but this needs to be properly treated and renamed everywhere
-    constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
-    constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime;
-    static_assert(NSAMPLES == NPULSES);
-
-    using DataType = SampleVector::Scalar;
-
-    extern __shared__ char shrmem[];
-    DataType *shrMatrixLForFnnlsStorage = 
-        reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * threadIdx.x;
-    DataType *shrAtAStorage = 
-        reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * (
-            threadIdx.x + blockDim.x);
-
-    // FIXME: remove eitehr idx or ch -> they are teh same thing
-    int idx = threadIdx.x + blockDim.x*blockIdx.x;
-    auto const ch = idx;
-    if (idx < nchannels) {
-        if (static_cast<MinimizationState>(acState[idx]) == 
-            MinimizationState::Precomputed)
-            return;
+    ///
+    /// launch ctx parameters are (nchannels / block, blocks)
+    /// TODO: trivial impl for now, there must be a way to improve
+    ///
+    /// Conventions:
+    ///   - amplitudes -> solution vector, what we are fitting for
+    ///   - samples -> raw detector responses
+    ///   - passive constraint - satisfied constraint
+    ///   - active constraint - unsatisfied (yet) constraint
+    ///
+    __global__ void kernel_minimize(uint32_t const* dids_eb,
+                                    uint32_t const* dids_ee,
+                                    SampleMatrix const* __restrict__ noisecov,
+                                    EcalPulseCovariance const* __restrict__ pulse_covariance,
+                                    BXVectorType* bxs,
+                                    SampleVector const* __restrict__ samples,
+                                    SampleVector* amplitudes,
+                                    PulseMatrixType const* __restrict__ pulse_matrix,
+                                    ::ecal::reco::StorageScalarType* chi2s,
+                                    ::ecal::reco::StorageScalarType* energies,
+                                    char* acState,
+                                    int nchannels,
+                                    int max_iterations,
+                                    uint32_t const offsetForHashes,
+                                    uint32_t const offsetForInputs) {
+      // FIXME: ecal has 10 samples and 10 pulses....
+      // but this needs to be properly treated and renamed everywhere
+      constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
+      constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime;
+      static_assert(NSAMPLES == NPULSES);
+
+      using DataType = SampleVector::Scalar;
+
+      extern __shared__ char shrmem[];
+      DataType* shrMatrixLForFnnlsStorage =
+          reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * threadIdx.x;
+      DataType* shrAtAStorage =
+          reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
+
+      // FIXME: remove either idx or ch -> they are the same thing
+      int idx = threadIdx.x + blockDim.x * blockIdx.x;
+      auto const ch = idx;
+      if (idx < nchannels) {
+        if (static_cast<MinimizationState>(acState[idx]) == MinimizationState::Precomputed)
+          return;
 
         // get the hash
-        int const inputCh = ch >= offsetForInputs
-            ? ch - offsetForInputs
-            : ch;
-        auto const* dids = ch >= offsetForInputs
-            ? dids_ee
-            : dids_eb;
+        int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+        auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
         auto const did = DetId{dids[inputCh]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
-        auto const hashedId = isBarrel
-            ? hashedIndexEB(did.rawId())
-            : offsetForHashes + hashedIndexEE(did.rawId());
+        auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
 
         // inits
         int iter = 0;
         int npassive = 0;
 
         ColumnVector<NPULSES, int> pulseOffsets;
-        #pragma unroll
-        for (int i=0; i<NPULSES; ++i)
-            pulseOffsets(i) = i;
+#pragma unroll
+        for (int i = 0; i < NPULSES; ++i)
+          pulseOffsets(i) = i;
 
         ColumnVector<NPULSES, DataType> resultAmplitudes;
-        #pragma unroll
-        for (int counter=0; counter<NPULSES; counter++)
-            resultAmplitudes(counter) = 0;
+#pragma unroll
+        for (int counter = 0; counter < NPULSES; counter++)
+          resultAmplitudes(counter) = 0;
 
         // inits
         //SampleDecompLLT covariance_decomposition;
@@ -197,100 +181,95 @@ void kernel_minimize(
 
         // loop until ocnverge
         while (true) {
-            if (iter >= max_iterations)
-                break;
-
-            //inverse_cov = noisecov[idx];
-            //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
-            DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
-            MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
-            int counter = 0;
-            #pragma unroll
-            for (int col=0; col<NSAMPLES; col++)
-                #pragma unroll
-                for (int row=col; row<NSAMPLES; row++)
-                    covMatrixStorage[counter++] = __ldg(
-                        &noisecov[idx].coeffRef(row, col));
-
-            update_covariance(
-                pulse_covariance[hashedId],
-                covMatrix,
-                resultAmplitudes);
-
-            // compute actual covariance decomposition
-            //covariance_decomposition.compute(inverse_cov);
-            //auto const& matrixL = covariance_decomposition.matrixL();
-            DataType matrixLStorage[MapSymM<DataType, NSAMPLES>::total];
-            MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
-            compute_decomposition_unrolled(matrixL, covMatrix);
-
-            // L * A = P
-            ColMajorMatrix<NSAMPLES, NPULSES> A;
-            solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
-
-            // L b = s
-            float reg_b[NSAMPLES];
-            solve_forward_subst_vector(reg_b, samples[idx], matrixL);
-
-            // FIXME: shared mem
-            //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
-            MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
-            //SampleMatrix AtA;
-            SampleVector Atb;
-            #pragma unroll
-            for (int icol=0; icol<NPULSES; icol++) {
-                float reg_ai[NSAMPLES];
-
-                // load column icol
-                #pragma unroll
-                for (int counter=0; counter<NSAMPLES; counter++)
-                    reg_ai[counter] = A(counter, icol);
-
-                // compute diagoanl
-                float sum = 0.f;
-                #pragma unroll
-                for (int counter=0; counter<NSAMPLES; counter++)
-                    sum += reg_ai[counter] * reg_ai[counter];
-
-                // store
-                AtA(icol, icol) = sum;
-
-                // go thru the other columns
-                #pragma unroll
-                for (int j=icol+1; j<NPULSES; j++) {
-                    // load column j
-                    float reg_aj[NSAMPLES];
-                    #pragma unroll
-                    for (int counter=0; counter<NSAMPLES; counter++)
-                        reg_aj[counter] = A(counter, j);
-
-                    // accum
-                    float sum = 0.f;
-                    #pragma unroll
-                    for (int counter=0; counter<NSAMPLES; counter++)
-                        sum += reg_aj[counter] * reg_ai[counter];
-
-                    // store
-                    //AtA(icol, j) = sum;
-                    AtA(j, icol) = sum;
-                }
-
-                // Atb accum
-                float sum_atb = 0.f;
-                #pragma unroll
-                for (int counter=0; counter<NSAMPLES; counter++)
-                    sum_atb += reg_ai[counter] * reg_b[counter];
-
-                // store atb
-                Atb(icol) = sum_atb;
+          if (iter >= max_iterations)
+            break;
+
+          //inverse_cov = noisecov[idx];
+          //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
+          DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
+          MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
+          int counter = 0;
+#pragma unroll
+          for (int col = 0; col < NSAMPLES; col++)
+#pragma unroll
+            for (int row = col; row < NSAMPLES; row++)
+              covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col));
+
+          update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
+
+          // compute actual covariance decomposition
+          //covariance_decomposition.compute(inverse_cov);
+          //auto const& matrixL = covariance_decomposition.matrixL();
+          DataType matrixLStorage[MapSymM<DataType, NSAMPLES>::total];
+          MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
+          compute_decomposition_unrolled(matrixL, covMatrix);
+
+          // L * A = P
+          ColMajorMatrix<NSAMPLES, NPULSES> A;
+          solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
+
+          // L b = s
+          float reg_b[NSAMPLES];
+          solve_forward_subst_vector(reg_b, samples[idx], matrixL);
+
+          // FIXME: shared mem
+          //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
+          MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
+          //SampleMatrix AtA;
+          SampleVector Atb;
+#pragma unroll
+          for (int icol = 0; icol < NPULSES; icol++) {
+            float reg_ai[NSAMPLES];
+
+// load column icol
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              reg_ai[counter] = A(counter, icol);
+
+            // compute diagonal
+            float sum = 0.f;
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum += reg_ai[counter] * reg_ai[counter];
+
+            // store
+            AtA(icol, icol) = sum;
+
+// go thru the other columns
+#pragma unroll
+            for (int j = icol + 1; j < NPULSES; j++) {
+              // load column j
+              float reg_aj[NSAMPLES];
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                reg_aj[counter] = A(counter, j);
+
+              // accum
+              float sum = 0.f;
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                sum += reg_aj[counter] * reg_ai[counter];
+
+              // store
+              //AtA(icol, j) = sum;
+              AtA(j, icol) = sum;
             }
-            
-            // FIXME: shared mem
-            //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
-            MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
 
-            fnnls(
-                AtA,
+            // Atb accum
+            float sum_atb = 0.f;
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              sum_atb += reg_ai[counter] * reg_b[counter];
+
+            // store atb
+            Atb(icol) = sum_atb;
+          }
+
+          // FIXME: shared mem
+          //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
+          MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
+
+          fnnls(AtA,
                 Atb,
                 //amplitudes[idx],
                 resultAmplitudes,
@@ -298,128 +277,124 @@ void kernel_minimize(
                 pulseOffsets,
                 matrixLForFnnls,
                 1e-11,
-                500
-                );
-                
-            {    
-                DataType accum[NSAMPLES];
-                // load accum
-                #pragma unroll
-                for (int counter=0; counter<NSAMPLES; counter++)
-                    accum[counter] = -samples[idx](counter);
-
-                // iterate
-                for (int icol=0; icol<NPULSES; icol++) {
-                    DataType pm_col[NSAMPLES];
-
-                    // preload a column of pulse matrix
-                    #pragma unroll
-                    for (int counter=0; counter<NSAMPLES; counter++)
-                        pm_col[counter] = __ldg(
-                            &pulse_matrix[idx].coeffRef(counter, icol));
-
-                    // accum
-                    #pragma unroll
-                    for (int counter=0; counter<NSAMPLES; counter++)
-                        accum[counter] += resultAmplitudes[icol] * pm_col[counter];
-                }
-
-                DataType reg_L[NSAMPLES];
-                DataType accumSum = 0;
-
-                // preload a column and load column 0 of cholesky
-                #pragma unroll
-                for (int i=0; i<NSAMPLES; i++)
-                    reg_L[i] = matrixL(i, 0);
-
-                // compute x0 and store it
-                auto x_prev = accum[0] / reg_L[0];
-                accumSum += x_prev * x_prev;
-
-                // iterate
-                #pragma unroll
-                for (int iL=1; iL<NSAMPLES; iL++) {
-                    // update accum
-                    #pragma unroll
-                    for (int counter=iL; counter<NSAMPLES; counter++)
-                        accum[counter] -= x_prev * reg_L[counter];
-
-                    // load the next column of cholesky
-                    #pragma unroll
-                    for (int counter=iL; counter<NSAMPLES; counter++)
-                        reg_L[counter] = matrixL(counter, iL);
-
-                    // compute the next x for M(iL, icol)
-                    x_prev = accum[iL] / reg_L[iL];
-
-                    // store teh result value
-                    accumSum += x_prev * x_prev;
-                }
-
-                chi2_now = accumSum;
+                500);
+
+          {
+            DataType accum[NSAMPLES];
+// load accum
+#pragma unroll
+            for (int counter = 0; counter < NSAMPLES; counter++)
+              accum[counter] = -samples[idx](counter);
+
+            // iterate
+            for (int icol = 0; icol < NPULSES; icol++) {
+              DataType pm_col[NSAMPLES];
+
+// preload a column of pulse matrix
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                pm_col[counter] = __ldg(&pulse_matrix[idx].coeffRef(counter, icol));
+
+// accum
+#pragma unroll
+              for (int counter = 0; counter < NSAMPLES; counter++)
+                accum[counter] += resultAmplitudes[icol] * pm_col[counter];
+            }
+
+            DataType reg_L[NSAMPLES];
+            DataType accumSum = 0;
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+            for (int i = 0; i < NSAMPLES; i++)
+              reg_L[i] = matrixL(i, 0);
+
+            // compute x0 and store it
+            auto x_prev = accum[0] / reg_L[0];
+            accumSum += x_prev * x_prev;
+
+// iterate
+#pragma unroll
+            for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+              for (int counter = iL; counter < NSAMPLES; counter++)
+                accum[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+              for (int counter = iL; counter < NSAMPLES; counter++)
+                reg_L[counter] = matrixL(counter, iL);
+
+              // compute the next x for M(iL, icol)
+              x_prev = accum[iL] / reg_L[iL];
+
+              // store the result value
+              accumSum += x_prev * x_prev;
             }
 
-            auto deltachi2 = chi2_now - chi2;
-            chi2 = chi2_now;
+            chi2_now = accumSum;
+          }
+
+          auto deltachi2 = chi2_now - chi2;
+          chi2 = chi2_now;
 
-            if (ecal::abs(deltachi2) < 1e-3)
-                break;
+          if (ecal::abs(deltachi2) < 1e-3)
+            break;
 
-            //---- AM: TEST
-            //---- it was 3 lines above, now here as in the CPU version
-            ++iter;
+          //---- AM: TEST
+          //---- it was 3 lines above, now here as in the CPU version
+          ++iter;
         }
 
         // store to global output values
         // FIXME: amplitudes are used in global directly
         chi2s[idx] = chi2;
         energies[idx] = resultAmplitudes(5);
-        #pragma unroll
-        for (int counter=0; counter<NPULSES; counter++)
-            amplitudes[idx](counter) = resultAmplitudes(counter);
+#pragma unroll
+        for (int counter = 0; counter < NPULSES; counter++)
+          amplitudes[idx](counter) = resultAmplitudes(counter);
+      }
     }
-}
-
-namespace v1 {
-
-void minimization_procedure(
-        EventInputDataGPU const& eventInputGPU,
-        EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch,
-        ConditionsProducts const& conditions,
-        ConfigurationParameters const& configParameters,
-        cudaStream_t cudaStream) {
-    using DataType = SampleVector::Scalar;
-    unsigned int totalChannels = eventInputGPU.ebDigis.ndigis
-        + eventInputGPU.eeDigis.ndigis;
-//    unsigned int threads_min = conf.threads.x;
-    // TODO: configure from python
-    unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
-    unsigned int blocks_min = threads_min > totalChannels
-        ? 1
-        : (totalChannels + threads_min - 1) / threads_min;
-    uint32_t const offsetForHashes = conditions.offsetForHashes;
-    uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
-    auto const nbytesShared = 2 * threads_min * 
-        MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
-    kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
-        eventInputGPU.ebDigis.ids,
-        eventInputGPU.eeDigis.ids,
-        scratch.noisecov,
-        conditions.pulseCovariances.values,
-        scratch.activeBXs,
-        scratch.samples,
-        (SampleVector*)eventOutputGPU.amplitudesAll,
-        scratch.pulse_matrix,
-        eventOutputGPU.chi2,
-        eventOutputGPU.amplitude,
-        scratch.acState,
-        totalChannels,
-        50,
-        offsetForHashes,
-        offsetForInputs);
-    cudaCheck(cudaGetLastError());
-}
-
-}
-
-}}
+
+    namespace v1 {
+
+      void minimization_procedure(EventInputDataGPU const& eventInputGPU,
+                                  EventOutputDataGPU& eventOutputGPU,
+                                  EventDataForScratchGPU& scratch,
+                                  ConditionsProducts const& conditions,
+                                  ConfigurationParameters const& configParameters,
+                                  cudaStream_t cudaStream) {
+        using DataType = SampleVector::Scalar;
+        unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis;
+        //    unsigned int threads_min = conf.threads.x;
+        // TODO: configure from python
+        unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
+        unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
+        uint32_t const offsetForHashes = conditions.offsetForHashes;
+        uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
+        auto const nbytesShared =
+            2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
+        kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
+            eventInputGPU.ebDigis.ids,
+            eventInputGPU.eeDigis.ids,
+            scratch.noisecov,
+            conditions.pulseCovariances.values,
+            scratch.activeBXs,
+            scratch.samples,
+            (SampleVector*)eventOutputGPU.amplitudesAll,
+            scratch.pulse_matrix,
+            eventOutputGPU.chi2,
+            eventOutputGPU.amplitude,
+            scratch.acState,
+            totalChannels,
+            50,
+            offsetForHashes,
+            offsetForInputs);
+        cudaCheck(cudaGetLastError());
+      }
+
+    }  // namespace v1
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
index bcb199b133c0d..d5980d8a757aa 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc
@@ -3,57 +3,50 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values) 
-    : gain12Over6_(values.size())
-    , gain6Over1_(values.size())
-{
-    // fill in eb
-    auto const& barrelValues = values.barrelItems();
-    for (unsigned int i=0; i<barrelValues.size(); i++) {
-        gain12Over6_[i] = barrelValues[i].gain12Over6();
-        gain6Over1_[i] = barrelValues[i].gain6Over1();
-    }
-    
-    // fill in ee
-    auto const& endcapValues = values.endcapItems();
-    auto const offset = barrelValues.size();
-    for (unsigned int i=0; i<endcapValues.size(); i++) {
-        gain12Over6_[offset + i] = endcapValues[i].gain12Over6();
-        gain6Over1_[offset + i] = endcapValues[i].gain6Over1();
-    }
+EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values)
+    : gain12Over6_(values.size()), gain6Over1_(values.size()) {
+  // fill in eb
+  auto const& barrelValues = values.barrelItems();
+  for (unsigned int i = 0; i < barrelValues.size(); i++) {
+    gain12Over6_[i] = barrelValues[i].gain12Over6();
+    gain6Over1_[i] = barrelValues[i].gain6Over1();
+  }
+
+  // fill in ee
+  auto const& endcapValues = values.endcapItems();
+  auto const offset = barrelValues.size();
+  for (unsigned int i = 0; i < endcapValues.size(); i++) {
+    gain12Over6_[offset + i] = endcapValues[i].gain12Over6();
+    gain6Over1_[offset + i] = endcapValues[i].gain6Over1();
+  }
 }
 
 EcalGainRatiosGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(gain12Over6) );
-    cudaCheck( cudaFree(gain6Over1) );
+  // deallocation
+  cudaCheck(cudaFree(gain12Over6));
+  cudaCheck(cudaFree(gain6Over1));
 }
 
-EcalGainRatiosGPU::Product const& EcalGainRatiosGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](EcalGainRatiosGPU::Product& product, cudaStream_t cudaStream) {
-            // malloc
-            cudaCheck( cudaMalloc((void**)&product.gain12Over6,
-                                  this->gain12Over6_.size() * sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.gain6Over1,
-                                  this->gain6Over1_.size() * sizeof(float)) );
-            // transfer 
-            cudaCheck( cudaMemcpyAsync(product.gain12Over6,
-                                       this->gain12Over6_.data(),
-                                       this->gain12Over6_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.gain6Over1,
-                                       this->gain6Over1_.data(),
-                                       this->gain6Over1_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
+EcalGainRatiosGPU::Product const& EcalGainRatiosGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalGainRatiosGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(cudaMalloc((void**)&product.gain12Over6, this->gain12Over6_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.gain6Over1, this->gain6Over1_.size() * sizeof(float)));
+        // transfer
+        cudaCheck(cudaMemcpyAsync(product.gain12Over6,
+                                  this->gain12Over6_.data(),
+                                  this->gain12Over6_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.gain6Over1,
+                                  this->gain6Over1_.data(),
+                                  this->gain6Over1_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
 
-    return product;
+  return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalGainRatiosGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
index 401ad8c454737..9e3284cd9c7c8 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc
@@ -3,103 +3,92 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals) 
-    : mean_x12_(pedestals.size())
-    , rms_x12_(pedestals.size())
-    , mean_x6_(pedestals.size())
-    , rms_x6_(pedestals.size())
-    , mean_x1_(pedestals.size())
-    , rms_x1_(pedestals.size())
-{   
+EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals)
+    : mean_x12_(pedestals.size()),
+      rms_x12_(pedestals.size()),
+      mean_x6_(pedestals.size()),
+      rms_x6_(pedestals.size()),
+      mean_x1_(pedestals.size()),
+      rms_x1_(pedestals.size()) {
+  // fill in eb
+  auto const& barrelValues = pedestals.barrelItems();
+  for (unsigned int i = 0; i < barrelValues.size(); i++) {
+    mean_x12_[i] = barrelValues[i].mean_x12;
+    rms_x12_[i] = barrelValues[i].rms_x12;
+    mean_x6_[i] = barrelValues[i].mean_x6;
+    rms_x6_[i] = barrelValues[i].rms_x6;
+    mean_x1_[i] = barrelValues[i].mean_x1;
+    rms_x1_[i] = barrelValues[i].rms_x1;
+  }
 
-    // fill in eb
-    auto const& barrelValues = pedestals.barrelItems();
-    for (unsigned int i=0; i<barrelValues.size(); i++) {
-        mean_x12_[i] = barrelValues[i].mean_x12;
-        rms_x12_[i] = barrelValues[i].rms_x12;
-        mean_x6_[i] = barrelValues[i].mean_x6;
-        rms_x6_[i] = barrelValues[i].rms_x6;
-        mean_x1_[i] = barrelValues[i].mean_x1;
-        rms_x1_[i] = barrelValues[i].rms_x1;
-    }
-    
-    // fill in ee
-    auto const& endcapValues = pedestals.endcapItems();
-    auto const offset = barrelValues.size();
-    for (unsigned int i=0; i<endcapValues.size(); i++) {
-        mean_x12_[offset + i] = endcapValues[i].mean_x12;
-        rms_x12_[offset + i] = endcapValues[i].rms_x12;
-        mean_x6_[offset + i] = endcapValues[i].mean_x6;
-        rms_x6_[offset + i] = endcapValues[i].rms_x6;
-        mean_x1_[offset + i] = endcapValues[i].mean_x1;
-        rms_x1_[offset + i] = endcapValues[i].rms_x1;
-    }
+  // fill in ee
+  auto const& endcapValues = pedestals.endcapItems();
+  auto const offset = barrelValues.size();
+  for (unsigned int i = 0; i < endcapValues.size(); i++) {
+    mean_x12_[offset + i] = endcapValues[i].mean_x12;
+    rms_x12_[offset + i] = endcapValues[i].rms_x12;
+    mean_x6_[offset + i] = endcapValues[i].mean_x6;
+    rms_x6_[offset + i] = endcapValues[i].rms_x6;
+    mean_x1_[offset + i] = endcapValues[i].mean_x1;
+    rms_x1_[offset + i] = endcapValues[i].rms_x1;
+  }
 }
 
 EcalPedestalsGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(mean_x12) );
-    cudaCheck( cudaFree(rms_x12) );
-    cudaCheck( cudaFree(mean_x6) );
-    cudaCheck( cudaFree(rms_x6) );
-    cudaCheck( cudaFree(mean_x1) );
-    cudaCheck( cudaFree(rms_x1) );
+  // deallocation
+  cudaCheck(cudaFree(mean_x12));
+  cudaCheck(cudaFree(rms_x12));
+  cudaCheck(cudaFree(mean_x6));
+  cudaCheck(cudaFree(rms_x6));
+  cudaCheck(cudaFree(mean_x1));
+  cudaCheck(cudaFree(rms_x1));
 }
 
-EcalPedestalsGPU::Product const& EcalPedestalsGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](EcalPedestalsGPU::Product& product, cudaStream_t cudaStream) {
-            // malloc
-            cudaCheck( cudaMalloc((void**)&product.mean_x12,
-                                  this->mean_x12_.size() * sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.rms_x12,
-                                  this->mean_x12_.size() * sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.mean_x6,
-                                  this->mean_x12_.size() * sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.rms_x6,
-                                  this->mean_x12_.size() * sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.mean_x1,
-                                  this->mean_x12_.size() * sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.rms_x1,
-                                  this->mean_x12_.size() * sizeof(float)) );
+EcalPedestalsGPU::Product const& EcalPedestalsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalPedestalsGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(cudaMalloc((void**)&product.mean_x12, this->mean_x12_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.rms_x12, this->mean_x12_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.mean_x6, this->mean_x12_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.rms_x6, this->mean_x12_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.mean_x1, this->mean_x12_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.rms_x1, this->mean_x12_.size() * sizeof(float)));
 
-            // transfer 
-            cudaCheck( cudaMemcpyAsync(product.mean_x12,
-                                       this->mean_x12_.data(),
-                                       this->mean_x12_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.rms_x12,
-                                       this->rms_x12_.data(),
-                                       this->rms_x12_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.mean_x6,
-                                       this->mean_x6_.data(),
-                                       this->mean_x6_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.rms_x6,
-                                       this->rms_x6_.data(),
-                                       this->rms_x6_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.mean_x1,
-                                       this->mean_x1_.data(),
-                                       this->mean_x1_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.rms_x1,
-                                       this->rms_x1_.data(),
-                                       this->rms_x1_.size() * sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
+        // transfer
+        cudaCheck(cudaMemcpyAsync(product.mean_x12,
+                                  this->mean_x12_.data(),
+                                  this->mean_x12_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.rms_x12,
+                                  this->rms_x12_.data(),
+                                  this->rms_x12_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.mean_x6,
+                                  this->mean_x6_.data(),
+                                  this->mean_x6_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.rms_x6,
+                                  this->rms_x6_.data(),
+                                  this->rms_x6_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.mean_x1,
+                                  this->mean_x1_.data(),
+                                  this->mean_x1_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.rms_x1,
+                                  this->rms_x1_.data(),
+                                  this->rms_x1_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
 
-    return product;
+  return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalPedestalsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
index 121a5b9e684f7..bbeda99652e22 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc
@@ -3,48 +3,40 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values) 
-    : valuesEB_{values.barrelItems()}
-    , valuesEE_{values.endcapItems()}
-{}
+EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
 
 EcalPulseCovariancesGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(values) );
+  // deallocation
+  cudaCheck(cudaFree(values));
 }
 
-EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) {
-            // malloc
-            cudaCheck( cudaMalloc((void**)&product.values,
-                                  (this->valuesEE_.size() + this->valuesEB_.size()) 
-                                  * sizeof(EcalPulseCovariance)) );
-           
-            // offset in terms of sizeof(EcalPulseCovariance)
-            uint32_t offset = this->valuesEB_.size();
-
-            // transfer eb 
-            cudaCheck( cudaMemcpyAsync(product.values,
-                                       this->valuesEB_.data(),
-                                       this->valuesEB_.size() * 
-                                       sizeof(EcalPulseCovariance),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-
-            // transfer ee starting at values + offset
-            cudaCheck( cudaMemcpyAsync(product.values + offset,
-                                       this->valuesEE_.data(),
-                                       this->valuesEE_.size() * 
-                                       sizeof(EcalPulseCovariance),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
-
-    return product;
+EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(cudaMalloc((void**)&product.values,
+                             (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseCovariance)));
+
+        // offset in terms of sizeof(EcalPulseCovariance)
+        uint32_t offset = this->valuesEB_.size();
+
+        // transfer eb
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->valuesEB_.data(),
+                                  this->valuesEB_.size() * sizeof(EcalPulseCovariance),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+
+        // transfer ee starting at values + offset
+        cudaCheck(cudaMemcpyAsync(product.values + offset,
+                                  this->valuesEE_.data(),
+                                  this->valuesEE_.size() * sizeof(EcalPulseCovariance),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalPulseCovariancesGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
index 8e8f00795d225..aee122a01627d 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc
@@ -3,48 +3,40 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values) 
-    : valuesEB_{values.barrelItems()}
-    , valuesEE_{values.endcapItems()}
-{}
+EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
 
 EcalPulseShapesGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(values) );
+  // release the device buffer that getProduct() allocated with cudaMalloc
+  cudaCheck(cudaFree(values));
 }
 
-EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) {
-            // malloc
-            cudaCheck( cudaMalloc((void**)&product.values,
-                                  (this->valuesEE_.size() + this->valuesEB_.size()) 
-                                  * sizeof(EcalPulseShape)) );
-           
-            // offset in terms of sizeof(EcalPulseShape) - plain c array
-            uint32_t offset = this->valuesEB_.size();
-
-            // transfer eb 
-            cudaCheck( cudaMemcpyAsync(product.values,
-                                       this->valuesEB_.data(),
-                                       this->valuesEB_.size() * 
-                                       sizeof(EcalPulseShape),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-
-            // transfer ee starting at values + offset
-            cudaCheck( cudaMemcpyAsync(product.values + offset,
-                                       this->valuesEE_.data(),
-                                       this->valuesEE_.size() * 
-                                       sizeof(EcalPulseShape),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
-
-    return product;
+EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) {
+        // allocate one device buffer large enough for the EB and EE values together
+        cudaCheck(cudaMalloc((void**)&product.values,
+                             (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseShape)));
+
+        // EE values are stored after the EB ones; offset is in units of sizeof(EcalPulseShape) - plain c array
+        uint32_t offset = this->valuesEB_.size();
+
+        // enqueue the async host-to-device copy of the EB values to the start of the buffer
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->valuesEB_.data(),
+                                  this->valuesEB_.size() * sizeof(EcalPulseShape),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+
+        // enqueue the async host-to-device copy of the EE values, starting at values + offset
+        cudaCheck(cudaMemcpyAsync(product.values + offset,
+                                  this->valuesEE_.data(),
+                                  this->valuesEE_.size() * sizeof(EcalPulseShape),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalPulseShapesGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
index 7294c759aaa0d..2a98067f51d9e 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc
@@ -3,91 +3,74 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU(
-        EcalSamplesCorrelation const& values) 
-    : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation}
-    , EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation}
-    , EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation}
-    , EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation}
-    , EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation}
-    , EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation}
-{}
+EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU(EcalSamplesCorrelation const& values)
+    : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation},
+      EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation},
+      EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation},
+      EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation},
+      EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation},
+      EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation} {}
 
 EcalSamplesCorrelationGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(EBG12SamplesCorrelation) );
-    cudaCheck( cudaFree(EBG6SamplesCorrelation) );
-    cudaCheck( cudaFree(EBG1SamplesCorrelation) );
-    cudaCheck( cudaFree(EEG12SamplesCorrelation) );
-    cudaCheck( cudaFree(EEG6SamplesCorrelation) );
-    cudaCheck( cudaFree(EEG1SamplesCorrelation) );
+  // release the six device arrays that getProduct() allocated with cudaMalloc
+  cudaCheck(cudaFree(EBG12SamplesCorrelation));
+  cudaCheck(cudaFree(EBG6SamplesCorrelation));
+  cudaCheck(cudaFree(EBG1SamplesCorrelation));
+  cudaCheck(cudaFree(EEG12SamplesCorrelation));
+  cudaCheck(cudaFree(EEG6SamplesCorrelation));
+  cudaCheck(cudaFree(EEG1SamplesCorrelation));
 }
 
-EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) {
-            // malloc
-            cudaCheck( cudaMalloc((void**)&product.EBG12SamplesCorrelation,
-                                  this->EBG12SamplesCorrelation_.size() * 
-                                  sizeof(double)) );
-            cudaCheck( cudaMalloc((void**)&product.EBG6SamplesCorrelation,
-                                  this->EBG6SamplesCorrelation_.size() * 
-                                  sizeof(double)) );
-            cudaCheck( cudaMalloc((void**)&product.EBG1SamplesCorrelation,
-                                  this->EBG1SamplesCorrelation_.size() * 
-                                  sizeof(double)) );
-            cudaCheck( cudaMalloc((void**)&product.EEG12SamplesCorrelation,
-                                  this->EEG12SamplesCorrelation_.size() * 
-                                  sizeof(double)) );
-            cudaCheck( cudaMalloc((void**)&product.EEG6SamplesCorrelation,
-                                  this->EEG6SamplesCorrelation_.size() * 
-                                  sizeof(double)) );
-            cudaCheck( cudaMalloc((void**)&product.EEG1SamplesCorrelation,
-                                  this->EEG1SamplesCorrelation_.size() * 
-                                  sizeof(double)) );
-            // transfer 
-            cudaCheck( cudaMemcpyAsync(product.EBG12SamplesCorrelation,
-                                       this->EBG12SamplesCorrelation_.data(),
-                                       this->EBG12SamplesCorrelation_.size() * 
-                                       sizeof(double),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EBG6SamplesCorrelation,
-                                       this->EBG6SamplesCorrelation_.data(),
-                                       this->EBG6SamplesCorrelation_.size() * 
-                                       sizeof(double),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EBG1SamplesCorrelation,
-                                       this->EBG1SamplesCorrelation_.data(),
-                                       this->EBG1SamplesCorrelation_.size() * 
-                                       sizeof(double),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EEG12SamplesCorrelation,
-                                       this->EEG12SamplesCorrelation_.data(),
-                                       this->EEG12SamplesCorrelation_.size() * 
-                                       sizeof(double),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EEG6SamplesCorrelation,
-                                       this->EEG6SamplesCorrelation_.data(),
-                                       this->EEG6SamplesCorrelation_.size() * 
-                                       sizeof(double),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EEG1SamplesCorrelation,
-                                       this->EEG1SamplesCorrelation_.data(),
-                                       this->EEG1SamplesCorrelation_.size() * 
-                                       sizeof(double),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
+EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) {
+        // allocate six separate device arrays of double, one per host-side correlation vector
+        cudaCheck(cudaMalloc((void**)&product.EBG12SamplesCorrelation,
+                             this->EBG12SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EBG6SamplesCorrelation, this->EBG6SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EBG1SamplesCorrelation, this->EBG1SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(cudaMalloc((void**)&product.EEG12SamplesCorrelation,
+                             this->EEG12SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EEG6SamplesCorrelation, this->EEG6SamplesCorrelation_.size() * sizeof(double)));
+        cudaCheck(
+            cudaMalloc((void**)&product.EEG1SamplesCorrelation, this->EEG1SamplesCorrelation_.size() * sizeof(double)));
+        // enqueue async host-to-device copies of all six vectors on cudaStream
+        cudaCheck(cudaMemcpyAsync(product.EBG12SamplesCorrelation,
+                                  this->EBG12SamplesCorrelation_.data(),
+                                  this->EBG12SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EBG6SamplesCorrelation,
+                                  this->EBG6SamplesCorrelation_.data(),
+                                  this->EBG6SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EBG1SamplesCorrelation,
+                                  this->EBG1SamplesCorrelation_.data(),
+                                  this->EBG1SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EEG12SamplesCorrelation,
+                                  this->EEG12SamplesCorrelation_.data(),
+                                  this->EEG12SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EEG6SamplesCorrelation,
+                                  this->EEG6SamplesCorrelation_.data(),
+                                  this->EEG6SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EEG1SamplesCorrelation,
+                                  this->EEG1SamplesCorrelation_.data(),
+                                  this->EEG1SamplesCorrelation_.size() * sizeof(double),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
 
-    return product;
+  return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalSamplesCorrelationGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
index 277661b030c68..9ab0a6302a9c4 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc
@@ -3,76 +3,59 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU(
-        EcalTimeBiasCorrections const& values) 
-    : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins}
-    , EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins}
-    , EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins}
-    , EETimeCorrShiftBins_{values.EETimeCorrShiftBins}
-{}
+EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const& values)
+    : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins},
+      EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins},
+      EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins},
+      EETimeCorrShiftBins_{values.EETimeCorrShiftBins} {}
 
 EcalTimeBiasCorrectionsGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(EBTimeCorrAmplitudeBins) );
-    cudaCheck( cudaFree(EBTimeCorrShiftBins) );
-    cudaCheck( cudaFree(EETimeCorrAmplitudeBins) );
-    cudaCheck( cudaFree(EETimeCorrShiftBins) );
+  // release the four device arrays that getProduct() allocated with cudaMalloc
+  cudaCheck(cudaFree(EBTimeCorrAmplitudeBins));
+  cudaCheck(cudaFree(EBTimeCorrShiftBins));
+  cudaCheck(cudaFree(EETimeCorrAmplitudeBins));
+  cudaCheck(cudaFree(EETimeCorrShiftBins));
 }
 
-EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) {
-            // to get the size of vectors later on
-            // should be removed and host conditions' objects used directly
-            product.EBTimeCorrAmplitudeBinsSize = 
-                this->EBTimeCorrAmplitudeBins_.size();
-            product.EETimeCorrAmplitudeBinsSize = 
-                this->EETimeCorrAmplitudeBins_.size();
+EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) {
+        // store the amplitude-bin counts in the product so the vector sizes are available later on
+        // TODO: should be removed and host conditions' objects used directly
+        product.EBTimeCorrAmplitudeBinsSize = this->EBTimeCorrAmplitudeBins_.size();
+        product.EETimeCorrAmplitudeBinsSize = this->EETimeCorrAmplitudeBins_.size();
 
-            // malloc
-            cudaCheck( cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins,
-                                  this->EBTimeCorrAmplitudeBins_.size() * 
-                                  sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.EBTimeCorrShiftBins,
-                                  this->EBTimeCorrShiftBins_.size() * 
-                                  sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.EETimeCorrAmplitudeBins,
-                                  this->EETimeCorrAmplitudeBins_.size() * 
-                                  sizeof(float)) );
-            cudaCheck( cudaMalloc((void**)&product.EETimeCorrShiftBins,
-                                  this->EETimeCorrShiftBins_.size() * 
-                                  sizeof(float)) );
-            // transfer 
-            cudaCheck( cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins,
-                                       this->EBTimeCorrAmplitudeBins_.data(),
-                                       this->EBTimeCorrAmplitudeBins_.size() * 
-                                       sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EBTimeCorrShiftBins,
-                                       this->EBTimeCorrShiftBins_.data(),
-                                       this->EBTimeCorrShiftBins_.size() * 
-                                       sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EETimeCorrAmplitudeBins,
-                                       this->EETimeCorrAmplitudeBins_.data(),
-                                       this->EETimeCorrAmplitudeBins_.size() * 
-                                       sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.EETimeCorrShiftBins,
-                                       this->EETimeCorrShiftBins_.data(),
-                                       this->EETimeCorrShiftBins_.size() * 
-                                       sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
+        // allocate four separate device arrays of float, one per host-side vector
+        cudaCheck(cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins,
+                             this->EBTimeCorrAmplitudeBins_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.EBTimeCorrShiftBins, this->EBTimeCorrShiftBins_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.EETimeCorrAmplitudeBins,
+                             this->EETimeCorrAmplitudeBins_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.EETimeCorrShiftBins, this->EETimeCorrShiftBins_.size() * sizeof(float)));
+        // enqueue async host-to-device copies of all four vectors on cudaStream
+        cudaCheck(cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins,
+                                  this->EBTimeCorrAmplitudeBins_.data(),
+                                  this->EBTimeCorrAmplitudeBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EBTimeCorrShiftBins,
+                                  this->EBTimeCorrShiftBins_.data(),
+                                  this->EBTimeCorrShiftBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EETimeCorrAmplitudeBins,
+                                  this->EETimeCorrAmplitudeBins_.data(),
+                                  this->EETimeCorrAmplitudeBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.EETimeCorrShiftBins,
+                                  this->EETimeCorrShiftBins_.data(),
+                                  this->EETimeCorrShiftBins_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
 
-    return product;
+  return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalTimeBiasCorrectionsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
index 1da155b2539f2..d724a33f1d4e1 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc
@@ -3,47 +3,38 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU(
-        EcalTimeCalibConstants const& values) 
-    : valuesEB_{values.barrelItems()}
-    , valuesEE_{values.endcapItems()}
-{}
+EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
 
 EcalTimeCalibConstantsGPU::Product::~Product() {
-    // deallocation
-    cudaCheck( cudaFree(values) );
+  // release the device buffer that getProduct() allocated with cudaMalloc
+  cudaCheck(cudaFree(values));
 }
 
-EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct(
-        cudaStream_t cudaStream) const
-{
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-        [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
-            // malloc
-            cudaCheck( cudaMalloc((void**)&product.values,
-                                  (this->valuesEB_.size() + this->valuesEE_.size()) * 
-                                  sizeof(float)) );
-
-            // offset in floats, not bytes
-            auto const offset = this->valuesEB_.size();
-
-            // transfer 
-            cudaCheck( cudaMemcpyAsync(product.values,
-                                       this->valuesEB_.data(),
-                                       this->valuesEB_.size() * 
-                                       sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-            cudaCheck( cudaMemcpyAsync(product.values + offset,
-                                       this->valuesEE_.data(),
-                                       this->valuesEE_.size() * 
-                                       sizeof(float),
-                                       cudaMemcpyHostToDevice,
-                                       cudaStream) );
-        }
-    );
-
-    return product;
+EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
+        // allocate one device buffer of floats large enough for the EB and EE values together
+        cudaCheck(
+            cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float)));
+
+        // EE values are stored after the EB ones; offset is counted in floats, not bytes
+        auto const offset = this->valuesEB_.size();
+
+        // enqueue async host-to-device copies: EB at the start of the buffer, EE at values + offset
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->valuesEB_.data(),
+                                  this->valuesEB_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.values + offset,
+                                  this->valuesEE_.data(),
+                                  this->valuesEE_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
 }
 
 TYPELOOKUP_DATA_REG(EcalTimeCalibConstantsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
index b67bb74235e4a..c8d2926b29afc 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
@@ -25,107 +25,99 @@
 
 //#define ECAL_RECO_CUDA_DEBUG
 
-namespace ecal { namespace multifit {
-   
-void entryPoint(
-        EventInputDataGPU const& eventInputGPU,
-        EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch,
-        ConditionsProducts const& conditions, 
-        ConfigurationParameters const& configParameters,
-        cudaStream_t cudaStream) {
-    using digis_type = std::vector<uint16_t>;
-    using dids_type = std::vector<uint32_t>;
-    // accodring to the cpu setup  //----> hardcoded
-    bool const gainSwitchUseMaxSampleEB = true;
-    // accodring to the cpu setup  //----> hardcoded
-    bool const gainSwitchUseMaxSampleEE = false;
-    
-    uint32_t const offsetForHashes = conditions.offsetForHashes;
-    uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
-    unsigned int totalChannels = eventInputGPU.ebDigis.ndigis +
-        eventInputGPU.eeDigis.ndigis;
-    
-    // 
-    // 1d preparation kernel
-    //
-    unsigned int nchannels_per_block = 32;
-    unsigned int threads_1d = 10 * nchannels_per_block;
-    unsigned int blocks_1d = threads_1d > 10*totalChannels 
-        ? 1 : (totalChannels*10 + threads_1d - 1) / threads_1d;
-    int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES * (
-        sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char)
-        + sizeof(bool)
-    );
-    kernel_prep_1d_and_initialize<<<blocks_1d, threads_1d, 
-                                    shared_bytes, cudaStream>>>(
-        conditions.pulseShapes.values, 
-        eventInputGPU.ebDigis.data, 
-        eventInputGPU.ebDigis.ids,
-        eventInputGPU.eeDigis.data,
-        eventInputGPU.eeDigis.ids,
-        scratch.samples,
-        (SampleVector*)eventOutputGPU.amplitudesAll,
-        scratch.gainsNoise,
-        conditions.pedestals.mean_x1,
-        conditions.pedestals.mean_x12,
-        conditions.pedestals.rms_x12,
-        conditions.pedestals.mean_x6,
-        conditions.gainRatios.gain6Over1,
-        conditions.gainRatios.gain12Over6,
-        scratch.hasSwitchToGain6,
-        scratch.hasSwitchToGain1,
-        scratch.isSaturated,
-        eventOutputGPU.amplitude,
-        eventOutputGPU.chi2,
-        eventOutputGPU.pedestal,
-        eventOutputGPU.did,
-        eventOutputGPU.flags,
-        scratch.acState,
-        scratch.activeBXs,
-        offsetForHashes,
-        offsetForInputs,
-        gainSwitchUseMaxSampleEB,
-        gainSwitchUseMaxSampleEE,
-        totalChannels);
-    cudaCheck(cudaGetLastError());
+namespace ecal {
+  namespace multifit {
 
-    //
-    // 2d preparation kernel
-    //
-    int blocks_2d = totalChannels;
-    dim3 threads_2d{10, 10};
-    kernel_prep_2d<<<blocks_2d, threads_2d, 0, cudaStream>>>(
-        scratch.gainsNoise,
-        eventInputGPU.ebDigis.ids,
-        eventInputGPU.eeDigis.ids,
-        conditions.pedestals.rms_x12,
-        conditions.pedestals.rms_x6,
-        conditions.pedestals.rms_x1,
-        conditions.gainRatios.gain12Over6,
-        conditions.gainRatios.gain6Over1,
-        conditions.samplesCorrelation.EBG12SamplesCorrelation,
-        conditions.samplesCorrelation.EBG6SamplesCorrelation,
-        conditions.samplesCorrelation.EBG1SamplesCorrelation,
-        conditions.samplesCorrelation.EEG12SamplesCorrelation,
-        conditions.samplesCorrelation.EEG6SamplesCorrelation,
-        conditions.samplesCorrelation.EEG1SamplesCorrelation,
-        scratch.noisecov,
-        scratch.pulse_matrix,
-        conditions.pulseShapes.values,
-        scratch.hasSwitchToGain6,
-        scratch.hasSwitchToGain1,
-        scratch.isSaturated,
-        offsetForHashes,
-        offsetForInputs);
-    cudaCheck(cudaGetLastError());
-    
-    // run minimization kernels
-    v1::minimization_procedure(
-        eventInputGPU, eventOutputGPU,
-        scratch, conditions, configParameters, cudaStream);
+    void entryPoint(EventInputDataGPU const& eventInputGPU,
+                    EventOutputDataGPU& eventOutputGPU,
+                    EventDataForScratchGPU& scratch,
+                    ConditionsProducts const& conditions,
+                    ConfigurationParameters const& configParameters,
+                    cudaStream_t cudaStream) {
+      using digis_type = std::vector<uint16_t>;
+      using dids_type = std::vector<uint32_t>;
+      // according to the cpu setup  //----> hardcoded
+      bool const gainSwitchUseMaxSampleEB = true;
+      // according to the cpu setup  //----> hardcoded
+      bool const gainSwitchUseMaxSampleEE = false;
 
-    if (configParameters.shouldRunTimingComputation) {
-        
+      uint32_t const offsetForHashes = conditions.offsetForHashes;
+      uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
+      unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis;
+
+      //
+      // 1d preparation kernel
+      //
+      unsigned int nchannels_per_block = 32;
+      unsigned int threads_1d = 10 * nchannels_per_block;
+      unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 1 : (totalChannels * 10 + threads_1d - 1) / threads_1d;
+      int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES *
+                         (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool));
+      kernel_prep_1d_and_initialize<<<blocks_1d, threads_1d, shared_bytes, cudaStream>>>(
+          conditions.pulseShapes.values,
+          eventInputGPU.ebDigis.data,
+          eventInputGPU.ebDigis.ids,
+          eventInputGPU.eeDigis.data,
+          eventInputGPU.eeDigis.ids,
+          scratch.samples,
+          (SampleVector*)eventOutputGPU.amplitudesAll,
+          scratch.gainsNoise,
+          conditions.pedestals.mean_x1,
+          conditions.pedestals.mean_x12,
+          conditions.pedestals.rms_x12,
+          conditions.pedestals.mean_x6,
+          conditions.gainRatios.gain6Over1,
+          conditions.gainRatios.gain12Over6,
+          scratch.hasSwitchToGain6,
+          scratch.hasSwitchToGain1,
+          scratch.isSaturated,
+          eventOutputGPU.amplitude,
+          eventOutputGPU.chi2,
+          eventOutputGPU.pedestal,
+          eventOutputGPU.did,
+          eventOutputGPU.flags,
+          scratch.acState,
+          scratch.activeBXs,
+          offsetForHashes,
+          offsetForInputs,
+          gainSwitchUseMaxSampleEB,
+          gainSwitchUseMaxSampleEE,
+          totalChannels);
+      cudaCheck(cudaGetLastError());
+
+      //
+      // 2d preparation kernel
+      //
+      int blocks_2d = totalChannels;
+      dim3 threads_2d{10, 10};
+      kernel_prep_2d<<<blocks_2d, threads_2d, 0, cudaStream>>>(scratch.gainsNoise,
+                                                               eventInputGPU.ebDigis.ids,
+                                                               eventInputGPU.eeDigis.ids,
+                                                               conditions.pedestals.rms_x12,
+                                                               conditions.pedestals.rms_x6,
+                                                               conditions.pedestals.rms_x1,
+                                                               conditions.gainRatios.gain12Over6,
+                                                               conditions.gainRatios.gain6Over1,
+                                                               conditions.samplesCorrelation.EBG12SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EBG6SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EBG1SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG12SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG6SamplesCorrelation,
+                                                               conditions.samplesCorrelation.EEG1SamplesCorrelation,
+                                                               scratch.noisecov,
+                                                               scratch.pulse_matrix,
+                                                               conditions.pulseShapes.values,
+                                                               scratch.hasSwitchToGain6,
+                                                               scratch.hasSwitchToGain1,
+                                                               scratch.isSaturated,
+                                                               offsetForHashes,
+                                                               offsetForInputs);
+      cudaCheck(cudaGetLastError());
+
+      // run minimization kernels
+      v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
+
+      if (configParameters.shouldRunTimingComputation) {
         //
         // TODO: this guy can run concurrently with other kernels,
         // there is no dependence on the order of execution
@@ -133,9 +125,8 @@ void entryPoint(
         unsigned int threads_time_init = threads_1d;
         unsigned int blocks_time_init = blocks_1d;
         int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar);
-        kernel_time_computation_init<<<blocks_time_init, threads_time_init,
-                                       sharedBytesInit, cudaStream>>>(
-            eventInputGPU.ebDigis.data, 
+        kernel_time_computation_init<<<blocks_time_init, threads_time_init, sharedBytesInit, cudaStream>>>(
+            eventInputGPU.ebDigis.data,
             eventInputGPU.ebDigis.ids,
             eventInputGPU.eeDigis.data,
             eventInputGPU.eeDigis.ids,
@@ -156,24 +147,21 @@ void entryPoint(
             offsetForInputs,
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
             conditions.sampleMask.getEcalSampleMaskRecordEE(),
-            totalChannels
-        );
+            totalChannels);
         cudaCheck(cudaGetLastError());
 
-        // 
-        // TODO: small kernel only for EB. It needs to be checked if 
+        //
+        // TODO: small kernel only for EB. It needs to be checked if
         /// fusing such small kernels is beneficial in here
         //
         // we are running only over EB digis
         // therefore we need to create threads/blocks only for that
         unsigned int const threadsFixMGPA = threads_1d;
-        unsigned int const blocksFixMGPA = 
+        unsigned int const blocksFixMGPA =
             threadsFixMGPA > 10 * eventInputGPU.ebDigis.ndigis
                 ? 1
-                : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) 
-                    / threadsFixMGPA;
-        kernel_time_compute_fixMGPAslew<<<blocksFixMGPA, threadsFixMGPA, 
-                                          0, cudaStream>>>(
+                : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) / threadsFixMGPA;
+        kernel_time_compute_fixMGPAslew<<<blocksFixMGPA, threadsFixMGPA, 0, cudaStream>>>(
             eventInputGPU.ebDigis.data,
             eventInputGPU.eeDigis.data,
             scratch.sample_values,
@@ -181,37 +169,32 @@ void entryPoint(
             scratch.useless_sample_values,
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
             totalChannels,
-            offsetForInputs
-        );
+            offsetForInputs);
         cudaCheck(cudaGetLastError());
 
         //
-        // 
         //
-        int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block *
-            4 * sizeof(SampleVector::Scalar);
+        //
+        int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar);
         auto const threads_nullhypot = threads_1d;
         auto const blocks_nullhypot = blocks_1d;
-        kernel_time_compute_nullhypot<<<blocks_nullhypot, threads_nullhypot, 
-                                        sharedBytes, cudaStream>>>(
+        kernel_time_compute_nullhypot<<<blocks_nullhypot, threads_nullhypot, sharedBytes, cudaStream>>>(
             scratch.sample_values,
             scratch.sample_value_errors,
             scratch.useless_sample_values,
             scratch.chi2sNullHypot,
             scratch.sum0sNullHypot,
             scratch.sumAAsNullHypot,
-            totalChannels
-        );
+            totalChannels);
         cudaCheck(cudaGetLastError());
 
         unsigned int nchannels_per_block_makeratio = 10;
         unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
         unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels
-            ? 1
-            : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
+                                            ? 1
+                                            : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
         int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar);
-        kernel_time_compute_makeratio<<<blocks_makeratio, threads_makeratio,
-                                        sharedBytesMakeRatio, cudaStream>>>(
+        kernel_time_compute_makeratio<<<blocks_makeratio, threads_makeratio, sharedBytesMakeRatio, cudaStream>>>(
             scratch.sample_values,
             scratch.sample_value_errors,
             eventInputGPU.ebDigis.ids,
@@ -229,15 +212,14 @@ void entryPoint(
             scratch.accTimeMax,
             scratch.accTimeWgt,
             scratch.tcState,
-            configParameters.timeFitParametersSizeEB, 
+            configParameters.timeFitParametersSizeEB,
             configParameters.timeFitParametersSizeEE,
             configParameters.timeFitLimitsFirstEB,
             configParameters.timeFitLimitsFirstEE,
             configParameters.timeFitLimitsSecondEB,
             configParameters.timeFitLimitsSecondEE,
             totalChannels,
-            offsetForInputs
-        );
+            offsetForInputs);
         cudaCheck(cudaGetLastError());
 
         //
@@ -245,43 +227,40 @@ void entryPoint(
         //
         auto const threads_findamplchi2 = threads_1d;
         auto const blocks_findamplchi2 = blocks_1d;
-        int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * 
-            sizeof(SampleVector::Scalar);
+        int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar);
         kernel_time_compute_findamplchi2_and_finish<<<blocks_findamplchi2,
-                                           threads_findamplchi2,
-                                           sharedBytesFindAmplChi2, cudaStream>>>(
-            scratch.sample_values,
-            scratch.sample_value_errors,
-            eventInputGPU.ebDigis.ids,
-            eventInputGPU.eeDigis.ids,
-            scratch.useless_sample_values,
-            scratch.tMaxAlphaBetas,
-            scratch.tMaxErrorAlphaBetas,
-            scratch.accTimeMax,
-            scratch.accTimeWgt,
-            configParameters.amplitudeFitParametersEB,
-            configParameters.amplitudeFitParametersEE,
-            scratch.sumAAsNullHypot,
-            scratch.sum0sNullHypot,
-            scratch.chi2sNullHypot,
-            scratch.tcState,
-            scratch.ampMaxAlphaBeta,
-            scratch.ampMaxError,
-            scratch.timeMax,
-            scratch.timeError,
-            totalChannels,
-            offsetForInputs
-        );
+                                                      threads_findamplchi2,
+                                                      sharedBytesFindAmplChi2,
+                                                      cudaStream>>>(scratch.sample_values,
+                                                                    scratch.sample_value_errors,
+                                                                    eventInputGPU.ebDigis.ids,
+                                                                    eventInputGPU.eeDigis.ids,
+                                                                    scratch.useless_sample_values,
+                                                                    scratch.tMaxAlphaBetas,
+                                                                    scratch.tMaxErrorAlphaBetas,
+                                                                    scratch.accTimeMax,
+                                                                    scratch.accTimeWgt,
+                                                                    configParameters.amplitudeFitParametersEB,
+                                                                    configParameters.amplitudeFitParametersEE,
+                                                                    scratch.sumAAsNullHypot,
+                                                                    scratch.sum0sNullHypot,
+                                                                    scratch.chi2sNullHypot,
+                                                                    scratch.tcState,
+                                                                    scratch.ampMaxAlphaBeta,
+                                                                    scratch.ampMaxError,
+                                                                    scratch.timeMax,
+                                                                    scratch.timeError,
+                                                                    totalChannels,
+                                                                    offsetForInputs);
         cudaCheck(cudaGetLastError());
-        
+
         //
         //
         //
         auto const threads_timecorr = 32;
-        auto const blocks_timecorr = threads_timecorr > totalChannels
-            ? 1 : (totalChannels + threads_timecorr-1) / threads_timecorr;
-        kernel_time_correction_and_finalize<<<blocks_timecorr, threads_timecorr,
-                                              0, cudaStream>>>(
+        auto const blocks_timecorr =
+            threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr;
+        kernel_time_correction_and_finalize<<<blocks_timecorr, threads_timecorr, 0, cudaStream>>>(
             eventOutputGPU.amplitude,
             eventInputGPU.ebDigis.data,
             eventInputGPU.ebDigis.ids,
@@ -318,18 +297,18 @@ void entryPoint(
             configParameters.outOfTimeThreshG61mEE,
             offsetForHashes,
             offsetForInputs,
-            totalChannels
-        );
+            totalChannels);
         cudaCheck(cudaGetLastError());
-    }
+      }
 
-        /*
+      /*
     cudaEventRecord(end_event, 0);
     cudaEventSynchronize(end_event);
     float ms;
     cudaEventElapsedTime(&ms, start_event, end_event);
     std::cout << "elapsed time = " << ms << std::endl;
     */
-}
+    }
 
-}}
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
index 6b60f4fc35560..b85f002464f65 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
@@ -3,88 +3,74 @@
 #include "DataFormats/EcalDetId/interface/EBDetId.h"
 #include "DataFormats/EcalDetId/interface/EEDetId.h"
 
-namespace ecal { namespace multifit {
-
-namespace internal {
-
-namespace barrel {
-
-__device__
-__forceinline__
-bool positiveZ(uint32_t id) { return id & 0x10000; }
-
-__device__
-__forceinline__
-uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; }
-
-__device__
-__forceinline__
-uint32_t iphi(uint32_t id) { return id & 0x1FF; }
-
-}
-
-}
-
-__device__ 
-uint32_t hashedIndexEB(uint32_t id) {
-    using namespace internal::barrel;
-    return (EBDetId::MAX_IETA + 
-            (positiveZ(id) ? ietaAbs(id)-1 : -ietaAbs(id)) ) * EBDetId::MAX_IPHI + 
-            iphi(id)-1;
-}
-
-namespace internal {
-
-namespace endcap {
-
-__device__
-__forceinline__
-uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; }
-
-__device__
-__forceinline__
-uint32_t iy(uint32_t id) { return id & 0x7F; }
-
-__device__
-__forceinline__
-bool positiveZ(uint32_t id) { return id & 0x4000; }
-
-// these constants come from EE Det Id 
-__constant__ 
-const unsigned short kxf[] = {
-  41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 16, 51, 16,
-  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 6,  51, 6,  51, 6,  51, 6,  51,
-  6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,  51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,
-  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60,
-  1,  59, 1,  58, 4,  56, 4,  51, 4,  51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,
-  51, 6,  51, 6,  51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51,
-  21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
-
-__constant__
-const unsigned short kdi[] = {
-  0,    10,   20,   30,   40,   50,   60,   75,   90,   105,  120,  145,  170,  195,  220,  245,  270,  300,  330,
-  360,  390,  420,  450,  480,  510,  540,  570,  605,  640,  675,  710,  747,  784,  821,  858,  895,  932,  969,
-  1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, 1545, 1590, 1635, 1680, 1725, 1770,
-  1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635,
-  2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428,
-  3467, 3506, 3545, 3584, 3623, 3662, 3701, 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172,
-  4212, 4253, 4294, 4336, 4378, 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014,
-  5059, 5104, 5149, 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866,
-  5908, 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, 6614,
-  6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, 7129, 7154, 7179,
-  7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
-
-}
-
-}
-
-__device__
-uint32_t hashedIndexEE(uint32_t id) {
-    using namespace internal::endcap;
-
-    const uint32_t jx ( ix(id) ) ;
-    const uint32_t jd ( 2*( iy(id) - 1 ) + ( jx - 1 )/50 ) ;
-    return (  ( positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd] ) ;
-}
-
-}}
+namespace ecal {
+  namespace multifit {
+
+    namespace internal {
+
+      namespace barrel {
+        // Bit-field accessors on a raw 32-bit barrel id. positiveZ: true when bit 16 (mask 0x10000) is set.
+        __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; }
+        // ietaAbs: the 7-bit field occupying bits 9..15 of the id.
+        __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; }
+        // iphi: the lowest 9 bits (bits 0..8) of the id.
+        __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; }
+
+      }  // namespace barrel
+
+    }  // namespace internal
+    // Flattened barrel index: row * MAX_IPHI + (iphi - 1), row = MAX_IETA + (positiveZ ? ietaAbs - 1 : -ietaAbs).
+    __device__ uint32_t hashedIndexEB(uint32_t id) {
+      using namespace internal::barrel;  // NOTE: -ietaAbs(id) below negates a uint32_t (wraps modulo 2^32)
+      return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
+    }
+
+    namespace internal {
+
+      namespace endcap {
+        // Bit-field accessors on a raw 32-bit endcap id. ix: the 7-bit field occupying bits 7..13.
+        __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; }
+        // iy: the lowest 7 bits (bits 0..6) of the id.
+        __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; }
+        // positiveZ: true when bit 14 (mask 0x4000) of the id is set.
+        __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; }
+
+        // these constants come from EE Det Id; hashedIndexEE() indexes both tables with jd and subtracts kxf[jd] from ix
+        __constant__ const unsigned short kxf[] = {
+            41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21,
+            51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51,
+            6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,
+            51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62,
+            1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60, 1,  59, 1,  58, 4,  56, 4,  51, 4,
+            51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51,
+            9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21,
+            51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
+        // kdi[jd] is the per-entry base offset that hashedIndexEE() adds before the (ix - kxf[jd]) term
+        __constant__ const unsigned short kdi[] = {
+            0,    10,   20,   30,   40,   50,   60,   75,   90,   105,  120,  145,  170,  195,  220,  245,  270,
+            300,  330,  360,  390,  420,  450,  480,  510,  540,  570,  605,  640,  675,  710,  747,  784,  821,
+            858,  895,  932,  969,  1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500,
+            1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265,
+            2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030,
+            3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701,
+            3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378,
+            4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149,
+            5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908,
+            5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577,
+            6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104,
+            7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
+
+      }  // namespace endcap
+
+    }  // namespace internal
+    // Flattened endcap index from a raw id: kdi[jd] + ix - kxf[jd], plus EEDetId::kEEhalf when positiveZ(id).
+    __device__ uint32_t hashedIndexEE(uint32_t id) {
+      using namespace internal::endcap;
+      // jd picks the table entry: two per iy; (jx - 1) / 50 is 0 for ix in [1, 50] and 1 for ix in [51, 100]
+      const uint32_t jx(ix(id));
+      const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50);  // NOTE(review): no range check; assumes iy >= 1, ix >= 1
+      return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]);
+    }
+
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
index 4c538a2e352ad..5089676ed0c9f 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
@@ -17,50 +17,46 @@
 
 //#define ECAL_RECO_CUDA_DEBUG
 
-namespace ecal { namespace multifit {
-
-__device__
-__forceinline__
-bool use_sample(unsigned int sample_mask, unsigned int sample) {
-    return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
-}
-
-__global__
-void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
-                                   SampleVector::Scalar const* sample_value_errors,
-                                   bool const* useless_sample_values,
-                                   SampleVector::Scalar* chi2s,
-                                   SampleVector::Scalar* sum0s,
-                                   SampleVector::Scalar* sumAAs,
-                                   int const nchannels) {
-    using ScalarType = SampleVector::Scalar;
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int tx = threadIdx.x + blockDim.x*blockIdx.x;
-    int ltx = threadIdx.x;
-    int ch = tx / nsamples;
-    int nchannels_per_block = blockDim.x / nsamples;
-
-    // TODO: make sure that this branch plays nicely with __syncthreads inside
-    // can there be a deadlock even if the thread is inactive
-    if (ch < nchannels) {
-        // 
+namespace ecal {
+  namespace multifit {
+
+    __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) {
+      return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
+    }
+
+    __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  bool const* useless_sample_values,
+                                                  SampleVector::Scalar* chi2s,
+                                                  SampleVector::Scalar* sum0s,
+                                                  SampleVector::Scalar* sumAAs,
+                                                  const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      int tx = threadIdx.x + blockDim.x * blockIdx.x;
+      int ltx = threadIdx.x;
+      int ch = tx / nsamples;
+      int nchannels_per_block = blockDim.x / nsamples;
+
+      // TODO: the __syncthreads() calls below are inside this branch; make sure that is safe, i.e. that
+      // threads failing the check (inactive ones) skipping the barrier cannot deadlock the rest of the block
+      if (ch < nchannels) {
+        //
         int sample = tx % nsamples;
 
         // shared mem inits
         extern __shared__ char sdata[];
         char* s_sum0 = sdata;
-        SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(
-            s_sum0 + nchannels_per_block*nsamples);
-        SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block*nsamples;
-        SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block*nsamples;
+        SampleVector::Scalar* s_sum1 = reinterpret_cast<SampleVector::Scalar*>(s_sum0 + nchannels_per_block * nsamples);
+        SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples;
+        SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples;
 
         // TODO make sure no div by 0
-        auto const inv_error = useless_sample_values[tx] 
-            ? 0.0 
-            : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
-        auto const sample_value = sample_values[tx];
+        const auto inv_error =
+            useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
+        const auto sample_value = sample_values[tx];
         s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
         s_sum1[ltx] = inv_error;
         s_sumA[ltx] = sample_value * inv_error;
@@ -68,380 +64,349 @@ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
         __syncthreads();
 
         // 5 threads for [0, 4] samples
-        if (sample<5) {
-            s_sum0[ltx] += s_sum0[ltx+5];
-            s_sum1[ltx] += s_sum1[ltx+5];
-            s_sumA[ltx] += s_sumA[ltx+5];
-            s_sumAA[ltx] += s_sumAA[ltx+5];
+        if (sample < 5) {
+          s_sum0[ltx] += s_sum0[ltx + 5];
+          s_sum1[ltx] += s_sum1[ltx + 5];
+          s_sumA[ltx] += s_sumA[ltx + 5];
+          s_sumAA[ltx] += s_sumAA[ltx + 5];
         }
         __syncthreads();
 
-        if (sample<2) {
-            // note double counting of sample 3
-            s_sum0[ltx] += s_sum0[ltx+2] + s_sum0[ltx+3];
-            s_sum1[ltx] += s_sum1[ltx+2] + s_sum1[ltx+3];
-            s_sumA[ltx] += s_sumA[ltx+2] + s_sumA[ltx+3];
-            s_sumAA[ltx] += s_sumAA[ltx+2] + s_sumAA[ltx+3];
+        if (sample < 2) {
+          // note double counting of sample 3
+          s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
+          s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
+          s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
+          s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
         }
         __syncthreads();
 
         if (sample == 0) {
-            // note, subtract to remove the double counting of sample == 3
-            //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3];
-            //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3];
-            //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3];
-            //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3];
-            auto const sum0 = s_sum0[ltx] + s_sum0[ltx+1] - s_sum0[ltx+3];
-            auto const sum1 = s_sum1[ltx] + s_sum1[ltx+1] - s_sum1[ltx+3];
-            auto const sumA = s_sumA[ltx] + s_sumA[ltx+1] - s_sumA[ltx+3];
-            auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx+1] - s_sumAA[ltx+3];
-            auto const chi2 = sum0>0 
-                ? (sumAA - sumA * sumA / sum1) / sum0
-                : static_cast<ScalarType>(0);
-            chi2s[ch] = chi2;
-            sum0s[ch] = sum0;
-            sumAAs[ch] = sumAA;
+          // note, subtract to remove the double counting of sample == 3
+          //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3];
+          //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3];
+          //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3];
+          //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3];
+          const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
+          const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
+          const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
+          const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
+          const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
+          chi2s[ch] = chi2;
+          sum0s[ch] = sum0;
+          sumAAs[ch] = sumAA;
 
 #ifdef DEBUG_TC_NULLHYPOT
-            if (ch == 0) {
-                printf("chi2 = %f sum0 = %d sumAA = %f\n",
-                    chi2, static_cast<int>(sum0), sumAA);
-            }
+          if (ch == 0) {
+            printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
+          }
 #endif
         }
+      }
     }
-}
-
-constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
-constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
-
-//#define DEBUG_TC_MAKERATIO
-//
-// launch ctx parameters are 
-// 45 threads per channel, X channels per block, Y blocks
-// 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
-// TODO: it might be much beter to use 32 threads per channel instead of 45
-// to simplify the synchronization
-//
-__global__
-void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
-                                   SampleVector::Scalar const* sample_value_errors,
-                                   uint32_t const* dids_eb,
-                                   uint32_t const* dids_ee,
-                                   bool const* useless_sample_values,
-                                   char const* pedestal_nums,
-                                   ConfigurationParameters::type const* amplitudeFitParametersEB,
-                                   ConfigurationParameters::type const* amplitudeFitParametersEE,
-                                   ConfigurationParameters::type const* timeFitParametersEB,
-                                   ConfigurationParameters::type const* timeFitParametersEE,
-                                   SampleVector::Scalar const* sumAAsNullHypot,
-                                   SampleVector::Scalar const* sum0sNullHypot,
-                                   SampleVector::Scalar* tMaxAlphaBetas,
-                                   SampleVector::Scalar* tMaxErrorAlphaBetas,
-                                   SampleVector::Scalar* g_accTimeMax,
-                                   SampleVector::Scalar* g_accTimeWgt,
-                                   TimeComputationState* g_state,
-                                   unsigned int const timeFitParameters_sizeEB,
-                                   unsigned int const timeFitParameters_sizeEE,
-                                   ConfigurationParameters::type const timeFitLimits_firstEB,
-                                   ConfigurationParameters::type const timeFitLimits_firstEE,
-                                   ConfigurationParameters::type const timeFitLimits_secondEB,
-                                   ConfigurationParameters::type const timeFitLimits_secondEE,
-                                   int const nchannels,
-                                   uint32_t const offsetForInputs) {
-    using ScalarType = SampleVector::Scalar;
-
-    // constants
-    constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int const gtx = threadIdx.x + blockDim.x*blockIdx.x;
-    int const ch = gtx / nthreads_per_channel;
-    int const lch = threadIdx.x / nthreads_per_channel;
-    int const ltx = threadIdx.x % nthreads_per_channel;
-    int const ch_start = ch*nsamples;
-    int const lch_start = lch*nthreads_per_channel;
-    int const nchannels_per_block = blockDim.x / nthreads_per_channel;
-    auto const* dids = ch >= offsetForInputs
-        ? dids_ee
-        : dids_eb;
-    int const inputCh = ch >= offsetForInputs
-        ? ch - offsetForInputs
-        : ch;
-    
-    // rmeove inactive threads
-    // TODO: need to understand if this is 100% safe in presence of syncthreads
-    if (ch >= nchannels) return;
-
-    auto const did = DetId{dids[inputCh]};
-    auto const isBarrel = did.subdetId() == EcalBarrel;
-    auto const* amplitudeFitParameters = isBarrel
-        ? amplitudeFitParametersEB
-        : amplitudeFitParametersEE;
-    auto const* timeFitParameters = isBarrel
-        ? timeFitParametersEB
-        : timeFitParametersEE;
-    auto const timeFitParameters_size = isBarrel
-        ? timeFitParameters_sizeEB
-        : timeFitParameters_sizeEE;
-    auto const timeFitLimits_first = isBarrel
-        ? timeFitLimits_firstEB
-        : timeFitLimits_firstEE;
-    auto const timeFitLimits_second = isBarrel
-        ? timeFitLimits_secondEB
-        : timeFitLimits_secondEE;
-
-    extern __shared__ char smem[];
-    ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
-    ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
-    ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
-    ScalarType* shrTimeMax = shr_time_max + blockDim.x;
-    ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
-
-    // map tx -> (sample_i, sample_j)
-    int sample_i, sample_j = 0;
-    if (ltx>=0 && ltx<=8) {
+    // Wrappers fixing the template argument of unsafe_expf / unsafe_logf (presumably approximation degree; confirm)
+    constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
+    constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
+
+    //#define DEBUG_TC_MAKERATIO
+    //
+    // launch ctx parameters are
+    // 45 threads per channel, X channels per block, Y blocks
+    // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+    // TODO: it might be much better to use 32 threads per channel instead of 45
+    // to simplify the synchronization
+    //
+    __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+                                                  SampleVector::Scalar const* sample_value_errors,
+                                                  uint32_t const* dids_eb,
+                                                  uint32_t const* dids_ee,
+                                                  bool const* useless_sample_values,
+                                                  char const* pedestal_nums,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEB,
+                                                  ConfigurationParameters::type const* amplitudeFitParametersEE,
+                                                  ConfigurationParameters::type const* timeFitParametersEB,
+                                                  ConfigurationParameters::type const* timeFitParametersEE,
+                                                  SampleVector::Scalar const* sumAAsNullHypot,
+                                                  SampleVector::Scalar const* sum0sNullHypot,
+                                                  SampleVector::Scalar* tMaxAlphaBetas,
+                                                  SampleVector::Scalar* tMaxErrorAlphaBetas,
+                                                  SampleVector::Scalar* g_accTimeMax,
+                                                  SampleVector::Scalar* g_accTimeWgt,
+                                                  TimeComputationState* g_state,
+                                                  unsigned const int timeFitParameters_sizeEB,
+                                                  unsigned const int timeFitParameters_sizeEE,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEB,
+                                                  ConfigurationParameters::type const timeFitLimits_firstEE,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEB,
+                                                  ConfigurationParameters::type const timeFitLimits_secondEE,
+                                                  const int nchannels,
+                                                  uint32_t const offsetForInputs) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nthreads_per_channel = 45;  // n=10, n(n-1)/2
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockDim.x * blockIdx.x;
+      const int ch = gtx / nthreads_per_channel;
+      const int ltx = threadIdx.x % nthreads_per_channel;
+      const int ch_start = ch * nsamples;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+      // remove inactive threads
+      // TODO: need to understand if this is 100% safe in presence of syncthreads
+      if (ch >= nchannels)
+        return;
+
+      const auto did = DetId{dids[inputCh]};
+      const auto isBarrel = did.subdetId() == EcalBarrel;
+      const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+      const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE;
+      const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE;
+      const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
+      const auto timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
+
+      extern __shared__ char smem[];
+      ScalarType* shr_chi2s = reinterpret_cast<ScalarType*>(smem);
+      ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
+      ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
+      ScalarType* shrTimeMax = shr_time_max + blockDim.x;
+      ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
+
+      // map tx -> (sample_i, sample_j)
+      int sample_i, sample_j = 0;
+      if (ltx >= 0 && ltx <= 8) {
         sample_i = 0;
-        sample_j = 1+ltx;
-    } else if (ltx<=16) {
+        sample_j = 1 + ltx;
+      } else if (ltx <= 16) {
         sample_i = 1;
-        sample_j = 2+ltx-9;
-    } else if (ltx<=23) {
+        sample_j = 2 + ltx - 9;
+      } else if (ltx <= 23) {
         sample_i = 2;
         sample_j = 3 + ltx - 17;
-    } else if (ltx<=29) {
+      } else if (ltx <= 29) {
         sample_i = 3;
         sample_j = 4 + ltx - 24;
-    } else if (ltx<=34) {
+      } else if (ltx <= 34) {
         sample_i = 4;
         sample_j = 5 + ltx - 30;
-    } else if (ltx<=38) {
+      } else if (ltx <= 38) {
         sample_i = 5;
         sample_j = 6 + ltx - 35;
-    } else if (ltx<=41) {
+      } else if (ltx <= 41) {
         sample_i = 6;
         sample_j = 7 + ltx - 39;
-    } else if (ltx<=43) {
+      } else if (ltx <= 43) {
         sample_i = 7;
         sample_j = 8 + ltx - 42;
-    } else if (ltx <= 44) {
+      } else if (ltx <= 44) {
         sample_i = 8;
         sample_j = 9;
-    } else
+      } else
         assert(false);
 
-    auto const tx_i = ch_start + sample_i;
-    auto const tx_j = ch_start + sample_j;
+      const auto tx_i = ch_start + sample_i;
+      const auto tx_j = ch_start + sample_j;
 
-    //
-    // note, given the way we partition the block, with 45 threads per channel
-    // we will end up with inactive threads which need to be dragged along
-    // through the synching point
-    // 
-    /*
+      //
+      // note, given the way we partition the block, with 45 threads per channel
+      // we will end up with inactive threads which need to be dragged along
+      // through the synching point
+      //
+      /*
     bool const condToExit = ch >= nchannels
         ? true
         : useless_sample_values[tx_i] 
           || useless_sample_values[tx_j]
           || sample_values[tx_i]<=1 || sample_values[tx_j]<=1;
           */
-    bool const condForUselessSamples = useless_sample_values[tx_i] 
-        || useless_sample_values[tx_j]
-        || sample_values[tx_i]<=1 || sample_values[tx_j]<=1;
-
-    //
-    // see cpu implementation for explanation
-    // 
-    ScalarType chi2 = std::numeric_limits<ScalarType>::max();
-    ScalarType tmax = 0;
-    ScalarType tmaxerr = 0;
-    shrTimeMax[threadIdx.x] = 0;
-    shrTimeWgt[threadIdx.x] = 0;
-    bool internalCondForSkipping1 = true;
-    bool internalCondForSkipping2 = true;
-    if (!condForUselessSamples) {
-        auto const rtmp = sample_values[tx_i] / sample_values[tx_j];
-        auto const invampl_i = 1.0 / sample_values[tx_i];
-        auto const relErr2_i = sample_value_errors[tx_i]*sample_value_errors[tx_i]*
-            invampl_i*invampl_i;
-        auto const invampl_j = 1.0 / sample_values[tx_j];
-        auto const relErr2_j = sample_value_errors[tx_j]*sample_value_errors[tx_j]*
-            invampl_j*invampl_j;
-        auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
-        auto err2 = sample_value_errors[tx_j]*
-            (sample_values[tx_i] - sample_values[tx_j])*(invampl_j*invampl_j);
+      bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
+                                         sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
+
+      //
+      // see cpu implementation for explanation
+      //
+      ScalarType chi2 = std::numeric_limits<ScalarType>::max();
+      ScalarType tmax = 0;
+      ScalarType tmaxerr = 0;
+      shrTimeMax[threadIdx.x] = 0;
+      shrTimeWgt[threadIdx.x] = 0;
+      bool internalCondForSkipping1 = true;
+      bool internalCondForSkipping2 = true;
+      if (!condForUselessSamples) {
+        const auto rtmp = sample_values[tx_i] / sample_values[tx_j];
+        const auto invampl_i = 1.0 / sample_values[tx_i];
+        const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
+        const auto invampl_j = 1.0 / sample_values[tx_j];
+        const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
+        const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
+        auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
         // TODO non-divergent branch for a block if each block has 1 channel
         // otherwise non-divergent for groups of 45 threads
         // at this point, pedestal_nums[ch] can be either 0, 1 or 2
-        if (pedestal_nums[ch]==2)
-            err2 *= err2 * 0.5;
-        auto const err3 = (0.289*0.289) * (invampl_j*invampl_j);
-        auto const total_error = std::sqrt(err1 + err2 + err3);
+        if (pedestal_nums[ch] == 2)
+          err2 *= err2 * 0.5;
+        const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
+        const auto total_error = std::sqrt(err1 + err2 + err3);
 
-        auto const alpha = amplitudeFitParameters[0];
-        auto const beta = amplitudeFitParameters[1];
-        auto const alphabeta = alpha * beta;
-        auto const invalphabeta = 1.0 / alphabeta;
+        const auto alpha = amplitudeFitParameters[0];
+        const auto beta = amplitudeFitParameters[1];
+        const auto alphabeta = alpha * beta;
+        const auto invalphabeta = 1.0 / alphabeta;
 
         // variables instead of a struct
-        auto const ratio_index = sample_i;
-        auto const ratio_step = sample_j - sample_i;
-        auto const ratio_value = rtmp;
-        auto const ratio_error = total_error;
-
-        auto const rlim_i_j = fast_expf(
-            static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
-        internalCondForSkipping1 = !(total_error<1.0 && rtmp>0.001 && rtmp<rlim_i_j);
+        const auto ratio_index = sample_i;
+        const auto ratio_step = sample_j - sample_i;
+        const auto ratio_value = rtmp;
+        const auto ratio_error = total_error;
+
+        const auto rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
+        internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j);
         if (!internalCondForSkipping1) {
-            //
-            // precompute.
-            // in cpu version this was done conditionally
-            // however easier to do it here (precompute) and then just filter out
-            // if not needed
-            // 
-            auto const l_timeFitLimits_first = timeFitLimits_first;
-            auto const l_timeFitLimits_second = timeFitLimits_second;
-            if (ratio_step == 1
-                && ratio_value >= l_timeFitLimits_first
-                && ratio_value <= l_timeFitLimits_second) {
-
-                auto const time_max_i = static_cast<ScalarType>(ratio_index);
-                auto u = timeFitParameters[timeFitParameters_size - 1];
+          //
+          // precompute.
+          // in cpu version this was done conditionally
+          // however easier to do it here (precompute) and then just filter out
+          // if not needed
+          //
+          const auto l_timeFitLimits_first = timeFitLimits_first;
+          const auto l_timeFitLimits_second = timeFitLimits_second;
+          if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
+            const auto time_max_i = static_cast<ScalarType>(ratio_index);
+            auto u = timeFitParameters[timeFitParameters_size - 1];
 #pragma unroll
-                for (int k=timeFitParameters_size-2; k>=0; k--)
-                    u = u*ratio_value + timeFitParameters[k];
-
-                auto du = (timeFitParameters_size - 1) *
-                    (timeFitParameters[timeFitParameters_size - 1]);
-                for (int k=timeFitParameters_size - 2; k>=1; k--)
-                    du = du*ratio_value + k*timeFitParameters[k];
-
-                auto const error2 = ratio_error * ratio_error * du * du;
-                auto const time_max = error2 > 0
-                    ? (time_max_i - u) / error2
-                    : static_cast<ScalarType>(0);
-                auto const time_wgt = error2 > 0
-                    ? 1.0 / error2
-                    : static_cast<ScalarType>(0);
-
-                // store into shared mem
-                // note, this name is essentially identical to the one used 
-                // below. 
-                shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
-                shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
-            } else {
-                shrTimeMax[threadIdx.x] = 0;
-                shrTimeWgt[threadIdx.x] = 0;
-            }
-
-            // continue with ratios
-            auto const stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
-            auto const offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
-            auto const rmin = std::max(ratio_value - ratio_error, 0.001);
-            auto const rmax = std::min(ratio_value + ratio_error, 
-                fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta)
-                - 0.001);
-            auto const time1 = 
-                offset - 
-                ratio_step / 
-                    (fast_expf((stepOverBeta - fast_logf(rmin)) / 
-                                       alpha) - 1.0);
-            auto const time2 = 
-                offset - 
-                ratio_step /
-                    (fast_expf((stepOverBeta - fast_logf(rmax)) / 
-                                       alpha) - 1.0);
-
-            // set these guys
-            tmax = 0.5 * (time1 + time2);
-            tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
+            for (int k = timeFitParameters_size - 2; k >= 0; k--)
+              u = u * ratio_value + timeFitParameters[k];
+
+            auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
+            for (int k = timeFitParameters_size - 2; k >= 1; k--)
+              du = du * ratio_value + k * timeFitParameters[k];
+
+            const auto error2 = ratio_error * ratio_error * du * du;
+            const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
+            const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast<ScalarType>(0);
+
+            // store into shared mem
+            // note, this name is essentially identical to the one used
+            // below.
+            shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
+            shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
+          } else {
+            shrTimeMax[threadIdx.x] = 0;
+            shrTimeWgt[threadIdx.x] = 0;
+          }
+
+          // continue with ratios
+          const auto stepOverBeta = static_cast<SampleVector::Scalar>(ratio_step) / beta;
+          const auto offset = static_cast<SampleVector::Scalar>(ratio_index) + alphabeta;
+          const auto rmin = std::max(ratio_value - ratio_error, 0.001);
+          const auto rmax = std::min(ratio_value + ratio_error,
+                                     fast_expf(static_cast<SampleVector::Scalar>(ratio_step) / beta) - 0.001);
+          const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0);
+          const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0);
+
+          // set these guys
+          tmax = 0.5 * (time1 + time2);
+          tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
 #ifdef DEBUG_TC_MAKERATIO
-            if (ch == 1 || ch == 0)
-                printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
-                    ch, ltx, tmax, tmaxerr, time1, time2, offset, rmin, rmax);
+          if (ch == 1 || ch == 0)
+            printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
+                   ch,
+                   ltx,
+                   tmax,
+                   tmaxerr,
+                   time1,
+                   time2,
+                   offset,
+                   rmin,
+                   rmax);
 #endif
 
-            SampleVector::Scalar sumAf = 0;
-            SampleVector::Scalar sumff = 0;
-            int const itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
-            auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
-            // TODO: data dependence 
-            for (int it = itmin+1; it<nsamples; it++) {
-                loffset += invalphabeta;
-                if (useless_sample_values[ch_start + it])
-                    continue;
-                auto const inverr2 = 1.0 / 
-                    (sample_value_errors[ch_start + it]*sample_value_errors[ch_start + it]);
-                auto const term1 = 1.0 + loffset;
-                auto const f = (term1 > 1e-6)
-                    ? fast_expf(alpha * (fast_logf(term1) - loffset))
-                    : 0;
-                sumAf += sample_values[ch_start+it] * (f * inverr2);
-                sumff += f*(f*inverr2);
-            }
-
-            auto const sumAA = sumAAsNullHypot[ch];
-            auto const sum0 = sum0sNullHypot[ch];
-            chi2 = sumAA;
-            ScalarType amp = 0;
-            // TODO: sum0 can not be 0 below, need to introduce the check upfront
-            if (sumff > 0) {
-                chi2 = sumAA - sumAf * (sumAf / sumff);
-                amp = sumAf / sumff;
-            }
-            chi2 /= sum0;
+          SampleVector::Scalar sumAf = 0;
+          SampleVector::Scalar sumff = 0;
+          const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
+          auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
+          // TODO: data dependence
+          for (int it = itmin + 1; it < nsamples; it++) {
+            loffset += invalphabeta;
+            if (useless_sample_values[ch_start + it])
+              continue;
+            const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
+            const auto term1 = 1.0 + loffset;
+            const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
+            sumAf += sample_values[ch_start + it] * (f * inverr2);
+            sumff += f * (f * inverr2);
+          }
+
+          const auto sumAA = sumAAsNullHypot[ch];
+          const auto sum0 = sum0sNullHypot[ch];
+          chi2 = sumAA;
+          // TODO: sum0 must not be 0 for the division below; need to introduce the check upfront
+          if (sumff > 0) {
+            chi2 = sumAA - sumAf * (sumAf / sumff);
+          }
+          chi2 /= sum0;
 
 #ifdef DEBUG_TC_MAKERATIO
-            if (ch == 1 || ch == 0)
-                printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
-                    ch, ltx, sumAf, sumff, sumAA, static_cast<int>(sum0), tmax, tmaxerr, chi2);
+          if (ch == 1 || ch == 0)
+            printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
+                   ch,
+                   ltx,
+                   sumAf,
+                   sumff,
+                   sumAA,
+                   static_cast<int>(sum0),
+                   tmax,
+                   tmaxerr,
+                   chi2);
 #endif
 
-            if (chi2>0 && tmax>0 && tmaxerr>0)
-                internalCondForSkipping2 = false;
-            else
-                chi2 = std::numeric_limits<ScalarType>::max();
+          if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
+            internalCondForSkipping2 = false;
+          else
+            chi2 = std::numeric_limits<ScalarType>::max();
         }
-    }
+      }
 
-    // store into smem
-    shr_chi2s[threadIdx.x] = chi2;
-    __syncthreads();
+      // store into smem
+      shr_chi2s[threadIdx.x] = chi2;
+      __syncthreads();
 
-    // find min chi2 - quite crude for now
-    // TODO validate/check
-    char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
-    bool oddElements = nthreads_per_channel % 2;
+      // find min chi2 - quite crude for now
+      // TODO validate/check
+      char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+      bool oddElements = nthreads_per_channel % 2;
 #pragma unroll
-    while (iter>=1) {
+      while (iter >= 1) {
         if (ltx < iter)
-            // for odd ns, the last guy will just store itself
-            // exception is for ltx == 0 and iter==1
-            shr_chi2s[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
-                ? shr_chi2s[threadIdx.x] 
-                : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x+iter]);
+          // for odd ns, the last guy will just store itself
+          // exception is for ltx == 0 and iter==1
+          shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                       ? shr_chi2s[threadIdx.x]
+                                       : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]);
         __syncthreads();
         oddElements = iter % 2;
-        iter = iter==1 ? iter/2 : iter/2 + iter%2;
-    }
+        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+      }
 
-    // filter out inactive or useless samples threads
-    if (!condForUselessSamples && !internalCondForSkipping1 
-            && !internalCondForSkipping2) {
+      // filter out inactive or useless samples threads
+      if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
         // min chi2, now compute weighted average of tmax measurements
         // see cpu version for more explanation
-        auto const chi2min = shr_chi2s[threadIdx.x - ltx];
-        auto const chi2Limit = chi2min + 1.0;
-        auto const inverseSigmaSquared = 
-            chi2 < chi2Limit
-                ? 1.0 / (tmaxerr * tmaxerr)
-                : 0.0;
+        const auto chi2min = shr_chi2s[threadIdx.x - ltx];
+        const auto chi2Limit = chi2min + 1.0;
+        const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0;
 
 #ifdef DEBUG_TC_MAKERATIO
         if (ch == 1 || ch == 0)
-            printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
-                ch, ltx, chi2min, chi2Limit, inverseSigmaSquared);
+          printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
+                 ch,
+                 ltx,
+                 chi2min,
+                 chi2Limit,
+                 inverseSigmaSquared);
 #endif
 
         // store into shared mem and run reduction
@@ -449,53 +414,53 @@ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
         // TODO: check if shuffling intrinsics are better
         shr_time_wgt[threadIdx.x] = inverseSigmaSquared;
         shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared;
-    } else {
+      } else {
         shr_time_wgt[threadIdx.x] = 0;
         shr_time_max[threadIdx.x] = 0;
-    }
-    __syncthreads();
+      }
+      __syncthreads();
 
-    // reduce to compute time_max and time_wgt
-    iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
-    oddElements = nthreads_per_channel % 2;
+      // reduce to compute time_max and time_wgt
+      iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+      oddElements = nthreads_per_channel % 2;
 #pragma unroll
-    while (iter>=1) {
+      while (iter >= 1) {
         if (ltx < iter) {
-            shr_time_wgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
-                ? shr_time_wgt[threadIdx.x]
-                : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x+iter];
-            shr_time_max[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
-                ? shr_time_max[threadIdx.x]
-                : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x+iter];
-            shrTimeMax[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
-                ? shrTimeMax[threadIdx.x]
-                : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x+iter];
-            shrTimeWgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0)
-                ? shrTimeWgt[threadIdx.x]
-                : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x+iter];
+          shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                          ? shr_time_wgt[threadIdx.x]
+                                          : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter];
+          shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                          ? shr_time_max[threadIdx.x]
+                                          : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter];
+          shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                        ? shrTimeMax[threadIdx.x]
+                                        : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter];
+          shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+                                        ? shrTimeWgt[threadIdx.x]
+                                        : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter];
         }
-        
+
         __syncthreads();
         oddElements = iter % 2;
-        iter = iter==1 ? iter/2 : iter/2 + iter%2;
-    }
+        iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+      }
 
-    // load from shared memory the 0th guy (will contain accumulated values)
-    // compute 
-    // store into global mem
-    if (ltx == 0) {
-        auto const tmp_time_max = shr_time_max[threadIdx.x];
-        auto const tmp_time_wgt = shr_time_wgt[threadIdx.x];
+      // load from shared memory the 0th guy (will contain accumulated values)
+      // compute
+      // store into global mem
+      if (ltx == 0) {
+        const auto tmp_time_max = shr_time_max[threadIdx.x];
+        const auto tmp_time_wgt = shr_time_wgt[threadIdx.x];
 
         // we are done if there number of time ratios is 0
-        if (tmp_time_wgt==0 && tmp_time_max==0) {
-            g_state[ch] = TimeComputationState::Finished;
-            return ;
+        if (tmp_time_wgt == 0 && tmp_time_max == 0) {
+          g_state[ch] = TimeComputationState::Finished;
+          return;
         }
 
         // no div by 0
-        auto const tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
-        auto const tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt);
+        const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
+        const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt);
 
         tMaxAlphaBetas[ch] = tMaxAlphaBeta;
         tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
@@ -504,22 +469,22 @@ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
         g_state[ch] = TimeComputationState::NotFinished;
 
 #ifdef DEBUG_TC_MAKERATIO
-            printf("ch = %d time_max = %f time_wgt = %f\n",
-                ch, tmp_time_max, tmp_time_wgt);
-            printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
-                ch, tMaxAlphaBeta, tMaxErrorAlphaBeta, 
-                shrTimeMax[threadIdx.x],
-                shrTimeWgt[threadIdx.x]);
+        printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
+        printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
+               ch,
+               tMaxAlphaBeta,
+               tMaxErrorAlphaBeta,
+               shrTimeMax[threadIdx.x],
+               shrTimeWgt[threadIdx.x]);
 #endif
+      }
     }
-}
-
-/// launch ctx parameters are 
-/// 10 threads per channel, N channels per block, Y blocks
-/// TODO: do we need to keep the state around or can be removed?!
-//#define DEBUG_FINDAMPLCHI2_AND_FINISH
-__global__
-void kernel_time_compute_findamplchi2_and_finish(
+
+    /// launch context parameters are:
+    /// 10 threads per channel, N channels per block, Y blocks
+    /// TODO: do we need to keep the state around, or can it be removed?
+    //#define DEBUG_FINDAMPLCHI2_AND_FINISH
+    __global__ void kernel_time_compute_findamplchi2_and_finish(
         SampleVector::Scalar const* sample_values,
         SampleVector::Scalar const* sample_value_errors,
         uint32_t const* dids_eb,
@@ -539,451 +504,387 @@ void kernel_time_compute_findamplchi2_and_finish(
         SampleVector::Scalar* g_ampMaxError,
         SampleVector::Scalar* g_timeMax,
         SampleVector::Scalar* g_timeError,
-        int const nchannels,
+        const int nchannels,
         uint32_t const offsetForInputs) {
-    using ScalarType = SampleVector::Scalar;
-
-    // constants 
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int const gtx = threadIdx.x + blockIdx.x*blockDim.x;
-    int const ch = gtx / nsamples;
-    int const sample = threadIdx.x % nsamples;
-    int const ch_start = ch * nsamples;
-    auto const* dids = ch >= offsetForInputs
-        ? dids_ee
-        : dids_eb;
-    int const inputCh = ch >= offsetForInputs
-        ? ch - offsetForInputs
-        : ch;
-
-    // configure shared mem
-    // per block, we need #threads per block * 2 * sizeof(ScalarType)
-    // we run with N channels per block
-    extern __shared__ char smem[];
-    ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
-    ScalarType* shr_sumff = shr_sumAf + blockDim.x;
-
-    if (ch >= nchannels) return;
-
-    auto state = g_state[ch];
-    auto const did = DetId{dids[inputCh]};
-    auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel
-        ? amplitudeFitParametersEB
-        : amplitudeFitParametersEE;
-
-
-    // TODO is that better than storing into global and launching another kernel
-    // for the first 10 threads
-    if (state == TimeComputationState::NotFinished) {
-        auto const alpha = amplitudeFitParameters[0];
-        auto const beta = amplitudeFitParameters[1];
-        auto const alphabeta = alpha * beta;
-        auto const invalphabeta = 1.0 / alphabeta;
-        auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
-        auto const sample_value = sample_values[gtx];
-        auto const sample_value_error = sample_value_errors[gtx];
-        auto const inverr2 = useless_samples[gtx]
-            ? static_cast<ScalarType>(0)
-            : 1.0 / (sample_value_error * sample_value_error);
-        auto const offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) 
-            * invalphabeta;
-        auto const term1 = 1.0 + offset;
-        auto const f = term1 > 1e-6 
-            ? fast_expf(alpha * (fast_logf(term1) - offset))
-            : static_cast<ScalarType>(0.0);
-        auto const sumAf = sample_value * (f * inverr2);
-        auto const sumff = f * (f * inverr2);
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+      // configure shared mem
+      // per block, we need #threads per block * 2 * sizeof(ScalarType)
+      // we run with N channels per block
+      extern __shared__ char smem[];
+      ScalarType* shr_sumAf = reinterpret_cast<ScalarType*>(smem);
+      ScalarType* shr_sumff = shr_sumAf + blockDim.x;
+
+      if (ch >= nchannels)
+        return;
+
+      auto state = g_state[ch];
+      const auto did = DetId{dids[inputCh]};
+      const auto* amplitudeFitParameters =
+          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+      // TODO: is this better than storing into global memory and launching another kernel
+      // for the first 10 threads?
+      if (state == TimeComputationState::NotFinished) {
+        const auto alpha = amplitudeFitParameters[0];
+        const auto beta = amplitudeFitParameters[1];
+        const auto alphabeta = alpha * beta;
+        const auto invalphabeta = 1.0 / alphabeta;
+        const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+        const auto sample_value = sample_values[gtx];
+        const auto sample_value_error = sample_value_errors[gtx];
+        const auto inverr2 =
+            useless_samples[gtx] ? static_cast<ScalarType>(0) : 1.0 / (sample_value_error * sample_value_error);
+        const auto offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
+        const auto term1 = 1.0 + offset;
+        const auto f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.0);
+        const auto sumAf = sample_value * (f * inverr2);
+        const auto sumff = f * (f * inverr2);
 
         // store into shared mem
         shr_sumAf[threadIdx.x] = sumAf;
         shr_sumff[threadIdx.x] = sumff;
-    } else {
+      } else {
         shr_sumAf[threadIdx.x] = 0;
         shr_sumff[threadIdx.x] = 0;
-    }
-    __syncthreads();
-
-    // reduce
-    // unroll completely here (but hardcoded)
-    if (sample<5) {
-        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+5];
-        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+5];
-    }
-    __syncthreads();
-
-    if (sample<2) {
+      }
+      __syncthreads();
+
+      // reduce
+      // unroll completely here (but hardcoded)
+      if (sample < 5) {
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5];
+      }
+      __syncthreads();
+
+      if (sample < 2) {
         // will need to subtract for ltx = 3, we double count here
-        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+2] 
-            + shr_sumAf[threadIdx.x+3];
-        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+2] 
-            + shr_sumff[threadIdx.x+3];
-    }
-    __syncthreads();
+        shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3];
+        shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3];
+      }
+      __syncthreads();
 
-    if (sample==0) {
+      if (sample == 0) {
         // exit if the state is done
         // note, we do not exit before all __synchtreads are finished
         if (state == TimeComputationState::Finished) {
-            g_timeMax[ch] = 5;
-            g_timeError[ch] = -999;
-            return;
+          g_timeMax[ch] = 5;
+          g_timeError[ch] = -999;
+          return;
         }
 
         // subtract to avoid double counting
-        auto const sumff = shr_sumff[threadIdx.x] 
-            + shr_sumff[threadIdx.x+1] 
-            - shr_sumff[threadIdx.x+3];
-        auto const sumAf = shr_sumAf[threadIdx.x]
-            + shr_sumAf[threadIdx.x+1]
-            - shr_sumAf[threadIdx.x+3];
-
-        auto const ampMaxAlphaBeta = sumff>0 ? sumAf / sumff : 0;
-        auto const sumAA = sumAAsNullHypot[ch];
-        auto const sum0 = sum0sNullHypot[ch];
-        auto const nullChi2 = chi2sNullHypot[ch];
+        const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3];
+        const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3];
+
+        const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
+        const auto sumAA = sumAAsNullHypot[ch];
+        const auto sum0 = sum0sNullHypot[ch];
+        const auto nullChi2 = chi2sNullHypot[ch];
         if (sumff > 0) {
-            auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
-            if (chi2AlphaBeta > nullChi2) {
-                // null hypothesis is better
-                state = TimeComputationState::Finished;
+          const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
+          if (chi2AlphaBeta > nullChi2) {
+            // null hypothesis is better
+            state = TimeComputationState::Finished;
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-                printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
-                    ch, chi2AlphaBeta, nullChi2, sumAA, sumAf, sumff, sum0);
+            printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
+                   ch,
+                   chi2AlphaBeta,
+                   nullChi2,
+                   sumAA,
+                   sumAf,
+                   sumff,
+                   sum0);
 #endif
-            }
+          }
 
-            // store to global
-            g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
+          // store to global
+          g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
         } else {
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-            printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n",
-                ch, sum0, sumAA, sumff, sumAf);
+          printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
 #endif
-            state = TimeComputationState::Finished;
+          state = TimeComputationState::Finished;
         }
 
         // store the state to global and finish calcs
         g_state[ch] = state;
         if (state == TimeComputationState::Finished) {
-            // store default values into global
-            g_timeMax[ch] = 5;
-            g_timeError[ch] = -999;
+          // store default values into global
+          g_timeMax[ch] = 5;
+          g_timeError[ch] = -999;
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-            printf("ch = %d finished state\n", ch);
+          printf("ch = %d finished state\n", ch);
 #endif
-            return;
+          return;
         }
 
-        auto const ampMaxError = g_ampMaxError[ch];
-        auto const test_ratio = ampMaxAlphaBeta / ampMaxError;
-        auto const accTimeMax = g_accTimeMax[ch];
-        auto const accTimeWgt = g_accTimeWgt[ch];
-        auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
-        auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
+        const auto ampMaxError = g_ampMaxError[ch];
+        const auto test_ratio = ampMaxAlphaBeta / ampMaxError;
+        const auto accTimeMax = g_accTimeMax[ch];
+        const auto accTimeWgt = g_accTimeWgt[ch];
+        const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+        const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
         // branch to separate large vs small pulses
         // see cpu version for more info
-        if (test_ratio > 5.0 && accTimeWgt>0) {
-            auto const tMaxRatio = accTimeWgt>0 
-                ? accTimeMax / accTimeWgt 
-                : static_cast<ScalarType>(0);
-            auto const tMaxErrorRatio = accTimeWgt>0 
-                ? 1.0 / std::sqrt(accTimeWgt) 
-                : static_cast<ScalarType>(0);
-
-            if (test_ratio > 10.0) {
-                g_timeMax[ch] = tMaxRatio;
-                g_timeError[ch] = tMaxErrorRatio;
-                
-#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-                    printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n",
-                        ch, tMaxRatio, tMaxErrorRatio);
-#endif
-            } else {
-                auto const timeMax = 
-                    (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + 
-                     tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0;
-                auto const timeError = 
-                    (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + 
-                     tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0;
-                state = TimeComputationState::Finished;
-                g_state[ch] = state;
-                g_timeMax[ch] = timeMax;
-                g_timeError[ch] = timeError;
+        if (test_ratio > 5.0 && accTimeWgt > 0) {
+          const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
+          const auto tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
+
+          if (test_ratio > 10.0) {
+            g_timeMax[ch] = tMaxRatio;
+            g_timeError[ch] = tMaxErrorRatio;
 
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-                    printf("ch = %d timeMax = %f timeError = %f\n",
-                        ch, timeMax, timeError);
+            printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
 #endif
-            }
-        }
-        else {
+          } else {
+            const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+                                  tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+                                 5.0;
+            const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+                                    tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+                                   5.0;
             state = TimeComputationState::Finished;
             g_state[ch] = state;
-            g_timeMax[ch] = tMaxAlphaBeta;
-            g_timeError[ch] = tMaxErrorAlphaBeta;
+            g_timeMax[ch] = timeMax;
+            g_timeError[ch] = timeError;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+            printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
+#endif
+          }
+        } else {
+          state = TimeComputationState::Finished;
+          g_state[ch] = state;
+          g_timeMax[ch] = tMaxAlphaBeta;
+          g_timeError[ch] = tMaxErrorAlphaBeta;
 
 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
-                printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n",
-                    ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
+          printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
 #endif
         }
+      }
     }
-}
-
-__global__
-void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
-                                     uint16_t const* digis_ee,
-                                     SampleVector::Scalar* sample_values,
-                                     SampleVector::Scalar* sample_value_errors,
-                                     bool* useless_sample_values,
-                                     unsigned int const sample_mask,
-                                     int const nchannels,
-                                     uint32_t const offsetForInputs) {
-    using ScalarType = SampleVector::Scalar;
-
-    // constants
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
-    int const ch = gtx / nsamples;
-    int const sample = threadIdx.x % nsamples;
-    int const inputCh = ch >= offsetForInputs
-        ? ch - offsetForInputs
-        : ch;
-    int const inputGtx = ch >= offsetForInputs
-        ? gtx - offsetForInputs*nsamples
-        : gtx;
-    auto const* digis = ch >= offsetForInputs
-        ? digis_ee
-        : digis_eb;
-
-    // remove thread for sample 0, oversubscribing is easier than ....
-    if (ch >= nchannels || sample==0) return;
-
-    if (!use_sample(sample_mask, sample)) return;
-
-    auto const gainIdPrev = ecal::mgpa::gainId(digis[inputGtx-1]);
-    auto const gainIdNext = ecal::mgpa::gainId(digis[inputGtx]);
-    if (gainIdPrev>=1 && gainIdPrev<=3 &&
-        gainIdNext>=1 && gainIdNext<=3 && gainIdPrev < gainIdNext) {
-        sample_values[gtx-1] = 0;
-        sample_value_errors[gtx-1] = 1e+9;
-        useless_sample_values[gtx-1] = true;
-    }
-}
-
-__global__
-void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
-                              SampleVector::Scalar const* sample_value_errors,
-                              uint32_t const* dids,
-                              bool const* useless_samples,
-                              SampleVector::Scalar const* g_timeMax,
-                              SampleVector::Scalar const* amplitudeFitParametersEB,
-                              SampleVector::Scalar const* amplitudeFitParametersEE,
-                              SampleVector::Scalar *g_amplitudeMax,
-                              int const nchannels) {
-    using ScalarType = SampleVector::Scalar;
-
-    // constants
-    constexpr ScalarType corr4 = 1.;
-    constexpr ScalarType corr6 = 1.;
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
-    int const ch = gtx / nsamples;
-    int const sample = threadIdx.x % nsamples;
-
-    if (ch >= nchannels) return;
-
-    auto const did = DetId{dids[ch]};
-    auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel
-        ? amplitudeFitParametersEB
-        : amplitudeFitParametersEE;
-
-    // configure shared mem
-    extern __shared__ char smem[];
-    ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
-    auto *shr_sumA = shr_sum1 + blockDim.x;
-    auto *shr_sumF = shr_sumA + blockDim.x;
-    auto *shr_sumAF = shr_sumF + blockDim.x;
-    auto *shr_sumFF = shr_sumAF + blockDim.x;
-
-    auto const alpha = amplitudeFitParameters[0];
-    auto const beta = amplitudeFitParameters[1];
-    auto const timeMax = g_timeMax[ch];
-    auto const pedestalLimit = timeMax - (alpha * beta) - 1.0;
-    auto const sample_value = sample_values[gtx];
-    auto const sample_value_error = sample_value_errors[gtx];
-    auto const inverr2 = sample_value_error > 0
-        ? 1. / (sample_value_error * sample_value_error)
-        : static_cast<ScalarType>(0);
-    auto const termOne = 1 + (sample - timeMax) / (alpha * beta);
-    auto const f = termOne > 1.e-5
-        ? fast_expf(alpha * fast_logf(termOne) - 
-            (sample - timeMax) / beta)
-        : static_cast<ScalarType>(0.); 
-
-    bool const cond = ((sample < pedestalLimit) ||
-        (f>0.6*corr6 && sample<=timeMax) ||
-        (f>0.4*corr4 && sample>=timeMax)) && !useless_samples[gtx];
-
-    // store into shared mem
-    shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
-    shr_sumA[threadIdx.x] = cond
-        ? sample_value * inverr2
-        : static_cast<ScalarType>(0);
-    shr_sumF[threadIdx.x] = cond 
-        ? f * inverr2
-        : static_cast<ScalarType>(0);
-    shr_sumAF[threadIdx.x] = cond 
-        ? (f*inverr2)*sample_value
-        : static_cast<ScalarType>(0);
-    shr_sumFF[threadIdx.x] = cond 
-        ? f*(f*inverr2)
-        : static_cast<ScalarType>(0);
-
-    // reduction
-    if (sample <= 4) {
-        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+5];
-        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+5];
-        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+5];
-        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+5];
-        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+5];
+
+    __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+                                                    uint16_t const* digis_ee,
+                                                    SampleVector::Scalar* sample_values,
+                                                    SampleVector::Scalar* sample_value_errors,
+                                                    bool* useless_sample_values,
+                                                    unsigned const int sample_mask,
+                                                    const int nchannels,
+                                                    uint32_t const offsetForInputs) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+      const int inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
+      const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
+
+      // the sample-0 thread exits early (it has no previous sample to compare with); oversubscribing is the simpler option
+      if (ch >= nchannels || sample == 0)
+        return;
+
+      if (!use_sample(sample_mask, sample))
+        return;
+
+      const auto gainIdPrev = ecal::mgpa::gainId(digis[inputGtx - 1]);
+      const auto gainIdNext = ecal::mgpa::gainId(digis[inputGtx]);
+      if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
+        sample_values[gtx - 1] = 0;
+        sample_value_errors[gtx - 1] = 1e+9;
+        useless_sample_values[gtx - 1] = true;
+      }
     }
-    __syncthreads();
 
-    if (sample < 2) {
+    __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+                                             SampleVector::Scalar const* sample_value_errors,
+                                             uint32_t const* dids,
+                                             bool const* useless_samples,
+                                             SampleVector::Scalar const* g_timeMax,
+                                             SampleVector::Scalar const* amplitudeFitParametersEB,
+                                             SampleVector::Scalar const* amplitudeFitParametersEE,
+                                             SampleVector::Scalar* g_amplitudeMax,
+                                             const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr ScalarType corr4 = 1.;
+      constexpr ScalarType corr6 = 1.;
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int ch = gtx / nsamples;
+      const int sample = threadIdx.x % nsamples;
+
+      if (ch >= nchannels)
+        return;
+
+      const auto did = DetId{dids[ch]};
+      const auto* amplitudeFitParameters =
+          did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+      // configure shared mem
+      extern __shared__ char smem[];
+      ScalarType* shr_sum1 = reinterpret_cast<ScalarType*>(smem);
+      auto* shr_sumA = shr_sum1 + blockDim.x;
+      auto* shr_sumF = shr_sumA + blockDim.x;
+      auto* shr_sumAF = shr_sumF + blockDim.x;
+      auto* shr_sumFF = shr_sumAF + blockDim.x;
+
+      const auto alpha = amplitudeFitParameters[0];
+      const auto beta = amplitudeFitParameters[1];
+      const auto timeMax = g_timeMax[ch];
+      const auto pedestalLimit = timeMax - (alpha * beta) - 1.0;
+      const auto sample_value = sample_values[gtx];
+      const auto sample_value_error = sample_value_errors[gtx];
+      const auto inverr2 =
+          sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast<ScalarType>(0);
+      const auto termOne = 1 + (sample - timeMax) / (alpha * beta);
+      const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
+                                     : static_cast<ScalarType>(0.);
+
+      bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) ||
+                         (f > 0.4 * corr4 && sample >= timeMax)) &&
+                        !useless_samples[gtx];
+
+      // store into shared mem
+      shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast<ScalarType>(0);
+      shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast<ScalarType>(0);
+      shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast<ScalarType>(0);
+      shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast<ScalarType>(0);
+      shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast<ScalarType>(0);
+
+      // reduction
+      if (sample <= 4) {
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5];
+      }
+      __syncthreads();
+
+      if (sample < 2) {
         // note: we double count sample 3
-        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+2] + shr_sum1[threadIdx.x+3];
-        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+2] + shr_sumA[threadIdx.x+3];
-        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+2] + shr_sumF[threadIdx.x+3];
-        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+2] 
-            + shr_sumAF[threadIdx.x+3];
-        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+2] 
-            + shr_sumFF[threadIdx.x+3];
-    }
-    __syncthreads();
-
-    if (sample == 0) {
-        auto const sum1 = shr_sum1[threadIdx.x] 
-            + shr_sum1[threadIdx.x+1] - shr_sum1[threadIdx.x+3];
-        auto const sumA = shr_sumA[threadIdx.x] 
-            + shr_sumA[threadIdx.x+1] - shr_sumA[threadIdx.x+3];
-        auto const sumF = shr_sumF[threadIdx.x] 
-            + shr_sumF[threadIdx.x+1] - shr_sumF[threadIdx.x+3];
-        auto const sumAF = shr_sumAF[threadIdx.x] 
-            + shr_sumAF[threadIdx.x+1] - shr_sumAF[threadIdx.x+3];
-        auto const sumFF = shr_sumFF[threadIdx.x] 
-            + shr_sumFF[threadIdx.x+1] - shr_sumFF[threadIdx.x+3];
-
-        auto const denom = sumFF * sum1 - sumF*sumF;
-        auto const condForDenom = sum1 > 0 && ecal::abs(denom)>1.e-20;
-        auto const amplitudeMax = condForDenom
-            ? (sumAF * sum1 - sumA * sumF) / denom
-            : static_cast<ScalarType>(0.);
+        shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3];
+        shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3];
+        shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3];
+        shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3];
+        shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3];
+      }
+      __syncthreads();
+
+      if (sample == 0) {
+        const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3];
+        const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3];
+        const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3];
+        const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3];
+        const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3];
+
+        const auto denom = sumFF * sum1 - sumF * sumF;
+        const auto condForDenom = sum1 > 0 && ecal::abs(denom) > 1.e-20;
+        const auto amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast<ScalarType>(0.);
 
         // store into global mem
         g_amplitudeMax[ch] = amplitudeMax;
+      }
     }
-}
-
-//#define ECAL_RECO_CUDA_TC_INIT_DEBUG
-__global__
-void kernel_time_computation_init(uint16_t const* digis_eb,
-                                  uint32_t const* dids_eb,
-                                  uint16_t const* digis_ee,
-                                  uint32_t const* dids_ee,
-                                  float const* rms_x12,
-                                  float const* rms_x6,
-                                  float const* rms_x1,
-                                  float const* mean_x12,
-                                  float const* mean_x6,
-                                  float const* mean_x1,
-                                  float const* gain12Over6,
-                                  float const* gain6Over1,
-                                  SampleVector::Scalar* sample_values,
-                                  SampleVector::Scalar* sample_value_errors,
-                                  SampleVector::Scalar* ampMaxError,
-                                  bool* useless_sample_values,
-                                  char* pedestal_nums,
-                                  uint32_t const offsetForHashes,
-                                  uint32_t const offsetForInputs,
-                                  unsigned int const sample_maskEB,
-                                  unsigned int const sample_maskEE,
-                                  int nchannels) {
-    using ScalarType = SampleVector::Scalar;
-
-    // constants
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int const tx = threadIdx.x + blockDim.x*blockIdx.x;
-    int const ch = tx/nsamples;
-    int const inputTx = ch >= offsetForInputs
-        ? tx - offsetForInputs*nsamples
-        : tx;
-    int const inputCh = ch >= offsetForInputs
-        ? ch - offsetForInputs
-        : ch;
-    auto const* digis = ch >= offsetForInputs
-        ? digis_ee
-        : digis_eb;
-    auto const* dids = ch >= offsetForInputs
-        ? dids_ee
-        : dids_eb;
-
-    if (ch < nchannels) {
+
+    //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+    __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
+                                                 uint32_t const* dids_eb,
+                                                 uint16_t const* digis_ee,
+                                                 uint32_t const* dids_ee,
+                                                 float const* rms_x12,
+                                                 float const* rms_x6,
+                                                 float const* rms_x1,
+                                                 float const* mean_x12,
+                                                 float const* mean_x6,
+                                                 float const* mean_x1,
+                                                 float const* gain12Over6,
+                                                 float const* gain6Over1,
+                                                 SampleVector::Scalar* sample_values,
+                                                 SampleVector::Scalar* sample_value_errors,
+                                                 SampleVector::Scalar* ampMaxError,
+                                                 bool* useless_sample_values,
+                                                 char* pedestal_nums,
+                                                 uint32_t const offsetForHashes,
+                                                 uint32_t const offsetForInputs,
+                                                 unsigned const int sample_maskEB,
+                                                 unsigned const int sample_maskEE,
+                                                 int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int tx = threadIdx.x + blockDim.x * blockIdx.x;
+      const int ch = tx / nsamples;
+      const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
+      const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+      const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
+      const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+      if (ch < nchannels) {
         // indices/inits
-        int const sample = tx % nsamples;
-        int const ch_start = ch*nsamples;
-        int const input_ch_start = inputCh*nsamples;
+        const int sample = tx % nsamples;
+        const int input_ch_start = inputCh * nsamples;
         SampleVector::Scalar pedestal = 0.;
         int num = 0;
 
         // configure shared mem
         extern __shared__ char smem[];
-        ScalarType* shrSampleValues = 
-            reinterpret_cast<SampleVector::Scalar*>(smem);
+        ScalarType* shrSampleValues = reinterpret_cast<SampleVector::Scalar*>(smem);
         ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x;
 
         // 0 and 1 sample values
-        auto const adc0 = ecal::mgpa::adc(digis[input_ch_start]);
-        auto const gainId0 = ecal::mgpa::gainId(digis[input_ch_start]);
-        auto const adc1 = ecal::mgpa::adc(digis[input_ch_start+1]);
-        auto const gainId1 = ecal::mgpa::gainId(digis[input_ch_start+1]);
-        auto const did = DetId{dids[inputCh]};
-        auto const isBarrel = did.subdetId() == EcalBarrel;
-        auto const sample_mask = did.subdetId() == EcalBarrel
-            ? sample_maskEB
-            : sample_maskEE;
-        auto const hashedId = isBarrel
-            ? hashedIndexEB(did.rawId())
-            : offsetForHashes + hashedIndexEE(did.rawId());
+        const auto adc0 = ecal::mgpa::adc(digis[input_ch_start]);
+        const auto gainId0 = ecal::mgpa::gainId(digis[input_ch_start]);
+        const auto adc1 = ecal::mgpa::adc(digis[input_ch_start + 1]);
+        const auto gainId1 = ecal::mgpa::gainId(digis[input_ch_start + 1]);
+        const auto did = DetId{dids[inputCh]};
+        const auto isBarrel = did.subdetId() == EcalBarrel;
+        const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
+        const auto hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
 
         // set pedestal
         // TODO this branch is non-divergent for a group of 10 threads
         if (gainId0 == 1 && use_sample(sample_mask, 0)) {
-            pedestal = static_cast<SampleVector::Scalar>(adc0);
-            num=1;
-
-            auto const diff = adc1 - adc0;
-            if (gainId1 == 1 && use_sample(sample_mask, 1)
-                && std::abs(diff) < 3*rms_x12[hashedId]) {
-                pedestal = 
-                    (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
-                num=2;
-            }
+          pedestal = static_cast<SampleVector::Scalar>(adc0);
+          num = 1;
+
+          const auto diff = adc1 - adc0;
+          if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) {
+            pedestal = (pedestal + static_cast<SampleVector::Scalar>(adc1)) / 2.0;
+            num = 2;
+          }
         } else {
-            pedestal = mean_x12[ch];
+          pedestal = mean_x12[ch];
         }
 
         // ped subtracted and gain-renormalized samples.
-        auto const gainId = ecal::mgpa::gainId(digis[inputTx]);
-        auto const adc = ecal::mgpa::adc(digis[inputTx]);
+        const auto gainId = ecal::mgpa::gainId(digis[inputTx]);
+        const auto adc = ecal::mgpa::adc(digis[inputTx]);
 
         bool bad = false;
         SampleVector::Scalar sample_value, sample_value_error;
@@ -991,29 +892,27 @@ void kernel_time_computation_init(uint16_t const* digis_eb,
         // TODO: piece below is general both for amplitudes and timing
         // potentially there is a way to reduce the amount of code...
         if (!use_sample(sample_mask, sample)) {
-            bad = true;
-            sample_value = 0;
-            sample_value_error = 0;
+          bad = true;
+          sample_value = 0;
+          sample_value_error = 0;
         } else if (gainId == 1) {
-            sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
-            sample_value_error = rms_x12[hashedId];
+          sample_value = static_cast<SampleVector::Scalar>(adc) - pedestal;
+          sample_value_error = rms_x12[hashedId];
         } else if (gainId == 2) {
-            sample_value =  (static_cast<SampleVector::Scalar>(adc) 
-                - mean_x6[hashedId]) * gain12Over6[hashedId]; 
-            sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
+          sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
+          sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
         } else if (gainId == 3) {
-            sample_value = (static_cast<SampleVector::Scalar>(adc) 
-                - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId];
-            sample_value_error = rms_x1[hashedId] 
-                * gain6Over1[hashedId] * gain12Over6[hashedId];
+          sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] *
+                         gain12Over6[hashedId];
+          sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
         } else {
-            sample_value = 0;
-            sample_value_error = 0;
-            bad = true;
+          sample_value = 0;
+          sample_value_error = 0;
+          bad = true;
         }
 
         // TODO: make sure we save things correctly when sample is useless
-        auto const useless_sample = (sample_value_error <= 0) | bad;
+        const auto useless_sample = (sample_value_error <= 0) | bad;
         useless_sample_values[tx] = useless_sample;
         sample_values[tx] = sample_value;
         sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
@@ -1021,85 +920,73 @@ void kernel_time_computation_init(uint16_t const* digis_eb,
         // DEBUG
 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
         if (ch == 0) {
-            printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
-                sample, sample_value, sample_value_error, 
-                useless_sample ? '1' : '0');           
+          printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
+                 sample,
+                 sample_value,
+                 sample_value_error,
+                 useless_sample ? '1' : '0');
         }
 #endif
 
         // store into the shared mem
-        shrSampleValues[threadIdx.x] = sample_value_error > 0
-            ? sample_value
-            : std::numeric_limits<ScalarType>::min();
+        shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
         shrSampleValueErrors[threadIdx.x] = sample_value_error;
         __syncthreads();
 
         // perform the reduction with min
         if (sample < 5) {
-            // note, if equal -> we keep the value with lower sample as for cpu
-            shrSampleValueErrors[threadIdx.x] = 
-                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+5] 
-                ? shrSampleValueErrors[threadIdx.x+5]
-                : shrSampleValueErrors[threadIdx.x];
-            shrSampleValues[threadIdx.x] = 
-                std::max(shrSampleValues[threadIdx.x], 
-                         shrSampleValues[threadIdx.x+5]);
+          // note, if equal -> we keep the value with lower sample as for cpu
+          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5]
+                                                  ? shrSampleValueErrors[threadIdx.x + 5]
+                                                  : shrSampleValueErrors[threadIdx.x];
+          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]);
         }
         __syncthreads();
 
         // a bit of an overkill, but easier than to compare across 3 values
-        if (sample<3) {
-            shrSampleValueErrors[threadIdx.x] = 
-                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+3]
-                ? shrSampleValueErrors[threadIdx.x+3]
-                : shrSampleValueErrors[threadIdx.x];
-            shrSampleValues[threadIdx.x] = 
-                std::max(shrSampleValues[threadIdx.x], 
-                         shrSampleValues[threadIdx.x+3]);
+        if (sample < 3) {
+          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3]
+                                                  ? shrSampleValueErrors[threadIdx.x + 3]
+                                                  : shrSampleValueErrors[threadIdx.x];
+          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]);
         }
         __syncthreads();
 
         if (sample < 2) {
-            shrSampleValueErrors[threadIdx.x] = 
-                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+2]
-                ? shrSampleValueErrors[threadIdx.x+2]
-                : shrSampleValueErrors[threadIdx.x];
-            shrSampleValues[threadIdx.x] = 
-                std::max(shrSampleValues[threadIdx.x], 
-                         shrSampleValues[threadIdx.x+2]);
+          shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2]
+                                                  ? shrSampleValueErrors[threadIdx.x + 2]
+                                                  : shrSampleValueErrors[threadIdx.x];
+          shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]);
         }
         __syncthreads();
- 
+
         if (sample == 0) {
-            // we only needd the max error
-            auto const maxSampleValueError = 
-                shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+1]
-                ? shrSampleValueErrors[threadIdx.x+1]
-                : shrSampleValueErrors[threadIdx.x];
-
-            // # pedestal samples used
-            pedestal_nums[ch] = num;
-            // this is used downstream
-            ampMaxError[ch] = maxSampleValueError;
-
-            // DEBUG
+          // we only need the max error
+          const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1]
+                                               ? shrSampleValueErrors[threadIdx.x + 1]
+                                               : shrSampleValueErrors[threadIdx.x];
+
+          // # pedestal samples used
+          pedestal_nums[ch] = num;
+          // this is used downstream
+          ampMaxError[ch] = maxSampleValueError;
+
+          // DEBUG
 #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
-            if (ch == 0) {
-                printf("pedestal_nums = %d ampMaxError = %f\n",
-                    num, maxSampleValueError);
-            }
+          if (ch == 0) {
+            printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
+          }
 #endif
         }
+      }
     }
-}
-
-///
-/// launch context parameters: 1 thread per channel
-///
-//#define DEBUG_TIME_CORRECTION
-__global__
-void kernel_time_correction_and_finalize(
-//        SampleVector::Scalar const* g_amplitude,
+
+    ///
+    /// launch context parameters: 1 thread per channel
+    ///
+    //#define DEBUG_TIME_CORRECTION
+    __global__ void kernel_time_correction_and_finalize(
+        //        SampleVector::Scalar const* g_amplitude,
         ::ecal::reco::StorageScalarType const* g_amplitude,
         uint16_t const* digis_eb,
         uint32_t const* dids_eb,
@@ -1113,11 +1000,11 @@ void kernel_time_correction_and_finalize(
         SampleVector::Scalar const* g_timeError,
         float const* g_rms_x12,
         float const* timeCalibConstant,
-        float *g_jitter,
-        float *g_jitterError,
-        uint32_t *flags,
-        int const amplitudeBinsSizeEB,
-        int const amplitudeBinsSizeEE,
+        float* g_jitter,
+        float* g_jitterError,
+        uint32_t* flags,
+        const int amplitudeBinsSizeEB,
+        const int amplitudeBinsSizeEE,
         ConfigurationParameters::type const timeConstantTermEB,
         ConfigurationParameters::type const timeConstantTermEE,
         float const offsetTimeValueEB,
@@ -1136,136 +1023,108 @@ void kernel_time_correction_and_finalize(
         ConfigurationParameters::type const outOfTimeThreshG61mEE,
         uint32_t const offsetForHashes,
         uint32_t const offsetForInputs,
-        int const nchannels) {
-    using ScalarType = SampleVector::Scalar;
-
-    // constants
-    constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
-
-    // indices
-    int const gtx = threadIdx.x + blockIdx.x * blockDim.x;
-    int const inputGtx = gtx >= offsetForInputs
-        ? gtx - offsetForInputs
-        : gtx;
-    auto const* dids = gtx >= offsetForInputs
-        ? dids_ee
-        : dids_eb;
-    auto const& digis = gtx >= offsetForInputs
-        ? digis_ee
-        : digis_eb;
-
-    // filter out outside of range threads
-    if (gtx >= nchannels) return;
-
-    auto const did = DetId{dids[inputGtx]};
-    auto const isBarrel = did.subdetId() == EcalBarrel;
-    auto const hashedId = isBarrel
-        ? hashedIndexEB(did.rawId())
-        : offsetForHashes + hashedIndexEE(did.rawId());
-    auto const* amplitudeBins = isBarrel
-        ? amplitudeBinsEB
-        : amplitudeBinsEE;
-    auto const* shiftBins = isBarrel
-        ? shiftBinsEB
-        : shiftBinsEE;
-    auto const amplitudeBinsSize = isBarrel
-        ? amplitudeBinsSizeEB
-        : amplitudeBinsSizeEE;
-    auto const timeConstantTerm = isBarrel 
-        ? timeConstantTermEB
-        : timeConstantTermEE;
-    auto const timeNconst = isBarrel 
-        ? timeNconstEB
-        : timeNconstEE;
-    auto const offsetTimeValue = isBarrel
-        ? offsetTimeValueEB
-        : offsetTimeValueEE;
-    auto const amplitudeThreshold = isBarrel
-        ? amplitudeThresholdEB
-        : amplitudeThresholdEE;
-    auto const outOfTimeThreshG12p = isBarrel
-        ? outOfTimeThreshG12pEB
-        : outOfTimeThreshG12pEE;
-    auto const outOfTimeThreshG12m = isBarrel
-        ? outOfTimeThreshG12mEB
-        : outOfTimeThreshG12mEE;
-    auto const outOfTimeThreshG61p = isBarrel
-        ? outOfTimeThreshG61pEB
-        : outOfTimeThreshG61pEE;
-    auto const outOfTimeThreshG61m = isBarrel
-        ? outOfTimeThreshG61mEB
-        : outOfTimeThreshG61mEE;
-    
-    // load some
-    auto const amplitude = g_amplitude[gtx];
-    auto const rms_x12 = g_rms_x12[hashedId];
-    auto const timeCalibConst = timeCalibConstant[hashedId];
-
-    int myBin = -1;
-    for (int bin=0; bin<amplitudeBinsSize; bin++) {
-        if (amplitude > amplitudeBins[bin]) 
-            myBin = bin;
-        else 
-            break;
-    }
-
-    ScalarType correction = 0;
-    if (myBin == -1) {
+        const int nchannels) {
+      using ScalarType = SampleVector::Scalar;
+
+      // constants
+      constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+      // indices
+      const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+      const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
+      const auto* dids = gtx >= offsetForInputs ? dids_ee : dids_eb;
+      const auto& digis = gtx >= offsetForInputs ? digis_ee : digis_eb;
+
+      // filter out outside of range threads
+      if (gtx >= nchannels)
+        return;
+
+      const auto did = DetId{dids[inputGtx]};
+      const auto isBarrel = did.subdetId() == EcalBarrel;
+      const auto hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+      const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
+      const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
+      const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
+      const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
+      const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
+      const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE;
+      const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
+      const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
+      const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
+      const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
+      const auto outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
+
+      // load some
+      const auto amplitude = g_amplitude[gtx];
+      const auto rms_x12 = g_rms_x12[hashedId];
+      const auto timeCalibConst = timeCalibConstant[hashedId];
+
+      int myBin = -1;
+      for (int bin = 0; bin < amplitudeBinsSize; bin++) {
+        if (amplitude > amplitudeBins[bin])
+          myBin = bin;
+        else
+          break;
+      }
+
+      ScalarType correction = 0;
+      if (myBin == -1) {
         correction = shiftBins[0];
-    } else if (myBin == amplitudeBinsSize-1) {
+      } else if (myBin == amplitudeBinsSize - 1) {
         correction = shiftBins[myBin];
-    } else {
-        correction = shiftBins[myBin+1] - shiftBins[myBin];
-        correction *= (amplitude - amplitudeBins[myBin]) / 
-            (amplitudeBins[myBin+1] - amplitudeBins[myBin]);
+      } else {
+        correction = shiftBins[myBin + 1] - shiftBins[myBin];
+        correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
         correction += shiftBins[myBin];
-    }
+      }
 
-    // correction * 1./25.
-    correction = correction * 0.04;
-    auto const timeMax = g_timeMax[gtx];
-    auto const timeError = g_timeError[gtx];
-    auto const jitter = timeMax - 5 + correction;
-    auto const jitterError = std::sqrt(timeError*timeError + 
-        timeConstantTerm*timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25.
+      // correction * 1./25.
+      correction = correction * 0.04;
+      const auto timeMax = g_timeMax[gtx];
+      const auto timeError = g_timeError[gtx];
+      const auto jitter = timeMax - 5 + correction;
+      const auto jitterError =
+          std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.
 
 #ifdef DEBUG_TIME_CORRECTION
-//    if (gtx == 0) {
-        printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
-            gtx, timeMax, timeError, jitter, correction);
+      //    if (gtx == 0) {
+      printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
+             gtx,
+             timeMax,
+             timeError,
+             jitter,
+             correction);
 //    }
 #endif
 
-    // store back to  global
-    g_jitter[gtx] = jitter;
-    g_jitterError[gtx] = jitterError;
+      // store back to global
+      g_jitter[gtx] = jitter;
+      g_jitterError[gtx] = jitterError;
 
-    // set the flag
-    // TODO: replace with something more efficient (if required), 
-    // for now just to make it work
-    if (amplitude > amplitudeThreshold * rms_x12) {
+      // set the flag
+      // TODO: replace with something more efficient (if required),
+      // for now just to make it work
+      if (amplitude > amplitudeThreshold * rms_x12) {
         auto threshP = outOfTimeThreshG12p;
         auto threshM = outOfTimeThreshG12m;
         if (amplitude > 3000.) {
-            for (int isample=0; isample<nsamples; isample++) {
-                int gainid = ecal::mgpa::gainId(digis[nsamples*inputGtx + isample]);
-                if (gainid != 1) {
-                    threshP = outOfTimeThreshG61p;
-                    threshM = outOfTimeThreshG61m;
-                    break;
-                }
+          for (int isample = 0; isample < nsamples; isample++) {
+            int gainid = ecal::mgpa::gainId(digis[nsamples * inputGtx + isample]);
+            if (gainid != 1) {
+              threshP = outOfTimeThreshG61p;
+              threshM = outOfTimeThreshG61m;
+              break;
             }
+          }
         }
 
-        auto const correctedTime = (timeMax - 5) * 25 + 
-            timeCalibConst + offsetTimeValue;
-        auto const nterm = timeNconst * rms_x12 / amplitude;
-        auto const sigmat = std::sqrt(nterm * nterm + 
-            timeConstantTerm*timeConstantTerm);
-        if (correctedTime > sigmat*threshP || 
-            correctedTime < -sigmat*threshM)
-            flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
+        const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
+        const auto nterm = timeNconst * rms_x12 / amplitude;
+        const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
+        if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
+          flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
+      }
     }
-}
 
-}}
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu
index f657981b95fa0..327d9b20445fa 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu
@@ -1,120 +1,120 @@
 #include "inplace_fnnls.h"
 
-namespace ecal { namespace multifit {
-
-using matrix_t = SampleMatrix;
-using vector_t = SampleVector;
-
-__device__
-bool inplace_fnnls(matrix_t& AtA,
-                   vector_t& Atb,
-                   vector_t& x,
-                   int& npassive,
-                   BXVectorType& activeBXs,
-                   PulseMatrixType& pulse_matrix,
-                   const double eps,
-                   const unsigned int max_iterations) {
-  vector_t s;
-  vector_t w;
-
-// main loop
-  Eigen::Index w_max_idx_prev = 0;
-  matrix_t::Scalar w_max_prev = 0;
-  double eps_to_use = eps;
-
-  int iter = 0;
-  while (true) {
-    if (iter>0 || npassive==0) {
-        const auto nActive = vector_t::RowsAtCompileTime - npassive;
-        if(!nActive)
-          break;
-
-        w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive);
-
-        // get the index of w that gives the maximum gain
-        Eigen::Index w_max_idx;
-        const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx);
-
-        // check for convergence
-        if (max_w < eps_to_use || (w_max_idx==w_max_idx_prev && max_w==w_max_prev))
-          break;
-    
-        // worst case
-        if (iter >= 500)
+namespace ecal {
+  namespace multifit {
+
+    using matrix_t = SampleMatrix;
+    using vector_t = SampleVector;
+
+    __device__ bool inplace_fnnls(matrix_t& AtA,
+                                  vector_t& Atb,
+                                  vector_t& x,
+                                  int& npassive,
+                                  BXVectorType& activeBXs,
+                                  PulseMatrixType& pulse_matrix,
+                                  const double eps,
+                                  const unsigned int max_iterations) {
+      vector_t s;
+      vector_t w;
+
+      // main loop
+      Eigen::Index w_max_idx_prev = 0;
+      matrix_t::Scalar w_max_prev = 0;
+      double eps_to_use = eps;
+
+      int iter = 0;
+      while (true) {
+        if (iter > 0 || npassive == 0) {
+          const auto nActive = vector_t::RowsAtCompileTime - npassive;
+          if (!nActive)
             break;
 
-        w_max_prev = max_w;
-        w_max_idx_prev = w_max_idx;
+          w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive);
 
-        // need to translate the index into the right part of the vector
-        w_max_idx += npassive;
+          // get the index of w that gives the maximum gain
+          Eigen::Index w_max_idx;
+          const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx);
 
-        // swap AtA to avoid copy
-        AtA.col(npassive).swap(AtA.col(w_max_idx));
-        AtA.row(npassive).swap(AtA.row(w_max_idx));
-        // swap Atb to match with AtA
-        Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx));
-        Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx));
-        Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx));
-        pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx));
+          // check for convergence
+          if (max_w < eps_to_use || (w_max_idx == w_max_idx_prev && max_w == w_max_prev))
+            break;
 
-        ++npassive;
-    }
+          // worst case
+          if (iter >= 500)
+            break;
 
-// inner loop
-    while (true) {
-      if (npassive == 0) break;
+          w_max_prev = max_w;
+          w_max_idx_prev = w_max_idx;
 
-      s.head(npassive) =
-          AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive));
+          // need to translate the index into the right part of the vector
+          w_max_idx += npassive;
 
-      // if all coefficients are positive, done for this iteration
-      if (s.head(npassive).minCoeff() > 0.) {
-        x.head(npassive) = s.head(npassive);
-        break;
-      }
+          // swap AtA to avoid copy
+          AtA.col(npassive).swap(AtA.col(w_max_idx));
+          AtA.row(npassive).swap(AtA.row(w_max_idx));
+          // swap Atb to match with AtA
+          Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx));
+          Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx));
+          Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx));
+          pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx));
 
-      auto alpha = std::numeric_limits<double>::max();
-      Eigen::Index alpha_idx = 0;
+          ++npassive;
+        }
+
+        // inner loop
+        while (true) {
+          if (npassive == 0)
+            break;
+
+          s.head(npassive) = AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive));
+
+          // if all coefficients are positive, done for this iteration
+          if (s.head(npassive).minCoeff() > 0.) {
+            x.head(npassive) = s.head(npassive);
+            break;
+          }
+
+          auto alpha = std::numeric_limits<double>::max();
+          Eigen::Index alpha_idx = 0;
 
 #pragma unroll
-      for (auto i = 0; i < npassive; ++i) {
-        if (s[i] <= 0.) {
-          auto const ratio = x[i] / (x[i] - s[i]);
-          if (ratio < alpha) {
-            alpha = ratio;
-            alpha_idx = i;
+          for (auto i = 0; i < npassive; ++i) {
+            if (s[i] <= 0.) {
+              auto const ratio = x[i] / (x[i] - s[i]);
+              if (ratio < alpha) {
+                alpha = ratio;
+                alpha_idx = i;
+              }
+            }
           }
-        }
-      }
 
-      /*
+          /*
       if (std::numeric_limits<double>::max() == alpha) {
         x.head(npassive) = s.head(npassive);
         break;
       }*/
 
-      x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive));
-      x[alpha_idx] = 0;
-      --npassive;
-
-      AtA.col(npassive).swap(AtA.col(alpha_idx));
-      AtA.row(npassive).swap(AtA.row(alpha_idx));
-      // swap Atb to match with AtA
-      Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx));
-      Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx));
-      Eigen::numext::swap(activeBXs.coeffRef(npassive), 
-                          activeBXs.coeffRef(alpha_idx));
-      pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx));
-    }
+          x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive));
+          x[alpha_idx] = 0;
+          --npassive;
+
+          AtA.col(npassive).swap(AtA.col(alpha_idx));
+          AtA.row(npassive).swap(AtA.row(alpha_idx));
+          // swap Atb to match with AtA
+          Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx));
+          Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx));
+          Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(alpha_idx));
+          pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx));
+        }
 
-    // TODO as in cpu NNLS version
-    iter++;
-    if (iter % 16 == 0)
-        eps_to_use *= 2;
-  }
-  
-  return true;
-}
+        // TODO as in cpu NNLS version
+        iter++;
+        if (iter % 16 == 0)
+          eps_to_use *= 2;
+      }
+
+      return true;
+    }
 
-}}
+  }  // namespace multifit
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
index 9661f98139f7b..95ccee87c726a 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc
@@ -10,7 +10,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
+#include "FWCore/Framework/interface/MakerMacros.h"
 
 // algorithm specific
 
@@ -18,129 +18,106 @@
 
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 
-class EcalCPUUncalibRecHitProducer
-    : public edm::stream::EDProducer<edm::ExternalWork>
-{
+class EcalCPUUncalibRecHitProducer : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
-    explicit EcalCPUUncalibRecHitProducer(edm::ParameterSet const& ps);
-    ~EcalCPUUncalibRecHitProducer() override;
-    static void fillDescriptions(edm::ConfigurationDescriptions&);
+  explicit EcalCPUUncalibRecHitProducer(edm::ParameterSet const& ps);
+  ~EcalCPUUncalibRecHitProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
-    void acquire(edm::Event const&, 
-                 edm::EventSetup const&,
-                 edm::WaitingTaskWithArenaHolder) override;
-    void produce(edm::Event&, edm::EventSetup const&) override;
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-    edm::EDGetTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>> 
-        recHitsInEBToken_, recHitsInEEToken_;
-    edm::EDPutTokenT<ecal::UncalibratedRecHit<ecal::Tag::soa>>
-        recHitsOutEBToken_, recHitsOutEEToken_;
-
-    ecal::UncalibratedRecHit<ecal::Tag::soa>
-        recHitsEB_, recHitsEE_;
-    bool containsTimingInformation_;
+  edm::EDGetTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>> recHitsInEBToken_, recHitsInEEToken_;
+  edm::EDPutTokenT<ecal::UncalibratedRecHit<ecal::Tag::soa>> recHitsOutEBToken_, recHitsOutEEToken_;
+
+  ecal::UncalibratedRecHit<ecal::Tag::soa> recHitsEB_, recHitsEE_;
+  bool containsTimingInformation_;
 };
 
-void EcalCPUUncalibRecHitProducer::fillDescriptions(
-        edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
+void EcalCPUUncalibRecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
 
-    desc.add<edm::InputTag>("recHitsInLabelEB", 
-        edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"});
-    desc.add<edm::InputTag>("recHitsInLabelEE", 
-        edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"});
-    desc.add<std::string>("recHitsOutLabelEB", "EcalUncalibRecHitsEB");
-    desc.add<std::string>("recHitsOutLabelEE", "EcalUncalibRecHitsEE");
-    desc.add<bool>("containsTimingInformation", false);
+  desc.add<edm::InputTag>("recHitsInLabelEB", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"});
+  desc.add<edm::InputTag>("recHitsInLabelEE", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"});
+  desc.add<std::string>("recHitsOutLabelEB", "EcalUncalibRecHitsEB");
+  desc.add<std::string>("recHitsOutLabelEE", "EcalUncalibRecHitsEE");
+  desc.add<bool>("containsTimingInformation", false);
 
-    std::string label = "ecalCPUUncalibRecHitProducer";
-    confDesc.add(label, desc);
+  std::string label = "ecalCPUUncalibRecHitProducer";
+  confDesc.add(label, desc);
 }
 
-EcalCPUUncalibRecHitProducer::EcalCPUUncalibRecHitProducer(
-        const edm::ParameterSet& ps) 
+EcalCPUUncalibRecHitProducer::EcalCPUUncalibRecHitProducer(const edm::ParameterSet& ps)
     : recHitsInEBToken_{consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
-        ps.getParameter<edm::InputTag>("recHitsInLabelEB"))}
-    , recHitsInEEToken_{consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
-        ps.getParameter<edm::InputTag>("recHitsInLabelEE"))}
-    , recHitsOutEBToken_{produces<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
-        ps.getParameter<std::string>("recHitsOutLabelEB"))}
-    , recHitsOutEEToken_{produces<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
-        ps.getParameter<std::string>("recHitsOutLabelEE"))}
-    , containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")}
-{}
+          ps.getParameter<edm::InputTag>("recHitsInLabelEB"))},
+      recHitsInEEToken_{consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+          ps.getParameter<edm::InputTag>("recHitsInLabelEE"))},
+      recHitsOutEBToken_{
+          produces<ecal::UncalibratedRecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEB"))},
+      recHitsOutEEToken_{
+          produces<ecal::UncalibratedRecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEE"))},
+      containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")} {}
 
 EcalCPUUncalibRecHitProducer::~EcalCPUUncalibRecHitProducer() {}
 
-void EcalCPUUncalibRecHitProducer::acquire(
-        edm::Event const& event,
-        edm::EventSetup const& setup,
-        edm::WaitingTaskWithArenaHolder taskHolder) 
-{
-    // retrieve data/ctx
-    auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
-    auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
-    cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
-    auto const& ebRecHits = ctx.get(ebRecHitsProduct);
-    auto const& eeRecHits = ctx.get(eeRecHitsProduct);
-
-    // resize the output buffers
-    recHitsEB_.resize(ebRecHits.size);
-    recHitsEE_.resize(eeRecHits.size);
-
-    auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
-        using vector_type = typename std::remove_reference<decltype(dest)>::type;
-        using type = typename vector_type::value_type;
-        cudaCheck(cudaMemcpyAsync(dest.data(),
-                                  src,
-                                  dest.size() * sizeof(type),
-                                  cudaMemcpyDeviceToHost,
-                                  ctx.stream()));
-    };
-
-    // enqeue transfers
-    lambdaToTransfer(recHitsEB_.did, ebRecHits.did);
-    lambdaToTransfer(recHitsEE_.did, eeRecHits.did);
-    
-    lambdaToTransfer(recHitsEB_.amplitudesAll, ebRecHits.amplitudesAll);
-    lambdaToTransfer(recHitsEE_.amplitudesAll, eeRecHits.amplitudesAll);
-    
-    lambdaToTransfer(recHitsEB_.amplitude, ebRecHits.amplitude);
-    lambdaToTransfer(recHitsEE_.amplitude, eeRecHits.amplitude);
-
-    lambdaToTransfer(recHitsEB_.chi2, ebRecHits.chi2);
-    lambdaToTransfer(recHitsEE_.chi2, eeRecHits.chi2);
-    
-    lambdaToTransfer(recHitsEB_.pedestal, ebRecHits.pedestal);
-    lambdaToTransfer(recHitsEE_.pedestal, eeRecHits.pedestal);
-
-    lambdaToTransfer(recHitsEB_.flags, ebRecHits.flags);
-    lambdaToTransfer(recHitsEE_.flags, eeRecHits.flags);
-
-    if (containsTimingInformation_) {
-        lambdaToTransfer(recHitsEB_.jitter, ebRecHits.jitter);
-        lambdaToTransfer(recHitsEE_.jitter, eeRecHits.jitter);
-    
-        lambdaToTransfer(recHitsEB_.jitterError, ebRecHits.jitterError);
-        lambdaToTransfer(recHitsEE_.jitterError, eeRecHits.jitterError);
-    }
+void EcalCPUUncalibRecHitProducer::acquire(edm::Event const& event,
+                                           edm::EventSetup const& setup,
+                                           edm::WaitingTaskWithArenaHolder taskHolder) {
+  // retrieve the EB/EE input products; the CUDA context is built from the EB product
+  auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
+  auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
+  cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
+  auto const& ebRecHits = ctx.get(ebRecHitsProduct);
+  auto const& eeRecHits = ctx.get(eeRecHitsProduct);
+
+  // resize the member output buffers to the sizes reported by the inputs
+  recHitsEB_.resize(ebRecHits.size);
+  recHitsEE_.resize(eeRecHits.size);
+
+  auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {  // async copy of dest.size() elements from src into dest
+    using vector_type = typename std::remove_reference<decltype(dest)>::type;
+    using type = typename vector_type::value_type;
+    cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
+  };
+
+  // enqueue the device-to-host transfers on the context's stream
+  lambdaToTransfer(recHitsEB_.did, ebRecHits.did);
+  lambdaToTransfer(recHitsEE_.did, eeRecHits.did);
+
+  lambdaToTransfer(recHitsEB_.amplitudesAll, ebRecHits.amplitudesAll);
+  lambdaToTransfer(recHitsEE_.amplitudesAll, eeRecHits.amplitudesAll);
+
+  lambdaToTransfer(recHitsEB_.amplitude, ebRecHits.amplitude);
+  lambdaToTransfer(recHitsEE_.amplitude, eeRecHits.amplitude);
+
+  lambdaToTransfer(recHitsEB_.chi2, ebRecHits.chi2);
+  lambdaToTransfer(recHitsEE_.chi2, eeRecHits.chi2);
+
+  lambdaToTransfer(recHitsEB_.pedestal, ebRecHits.pedestal);
+  lambdaToTransfer(recHitsEE_.pedestal, eeRecHits.pedestal);
+
+  lambdaToTransfer(recHitsEB_.flags, ebRecHits.flags);
+  lambdaToTransfer(recHitsEE_.flags, eeRecHits.flags);
+
+  if (containsTimingInformation_) {  // jitter and jitterError are copied only when this flag is set
+    lambdaToTransfer(recHitsEB_.jitter, ebRecHits.jitter);
+    lambdaToTransfer(recHitsEE_.jitter, eeRecHits.jitter);
+
+    lambdaToTransfer(recHitsEB_.jitterError, ebRecHits.jitterError);
+    lambdaToTransfer(recHitsEE_.jitterError, eeRecHits.jitterError);
+  }
 }
 
-void EcalCPUUncalibRecHitProducer::produce(
-        edm::Event& event, 
-        edm::EventSetup const& setup) 
-{
-    // tmp vectors
-    auto recHitsOutEB = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
-        std::move(recHitsEB_));
-    auto recHitsOutEE = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>(
-        std::move(recHitsEE_));
-
-    // put into event
-    event.put(recHitsOutEBToken_, std::move(recHitsOutEB));
-    event.put(recHitsOutEEToken_, std::move(recHitsOutEE));
+void EcalCPUUncalibRecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // move the member buffers recHitsEB_/recHitsEE_ into newly allocated output products
+  auto recHitsOutEB = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>(std::move(recHitsEB_));
+  auto recHitsOutEE = std::make_unique<ecal::UncalibratedRecHit<ecal::Tag::soa>>(std::move(recHitsEE_));
+
+  // put the EB and EE products into the event under their output tokens
+  event.put(recHitsOutEBToken_, std::move(recHitsOutEB));
+  event.put(recHitsOutEEToken_, std::move(recHitsOutEE));
 }
 
 DEFINE_FWK_MODULE(EcalCPUUncalibRecHitProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
index 24b782b7b434d..c851bf24c0e40 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
@@ -18,32 +18,19 @@
 
 #include <iostream>
 
-using EcalPedestalsGPUESProducer = EcalESProducerGPU<EcalPedestalsGPU,
-                                                     EcalPedestals,
-                                                     EcalPedestalsRcd>;
-using EcalGainRatiosGPUESProducer = EcalESProducerGPU<EcalGainRatiosGPU,
-                                                      EcalGainRatios,
-                                                      EcalGainRatiosRcd>;
-using EcalPulseShapesGPUESProducer = EcalESProducerGPU<EcalPulseShapesGPU,
-                                                       EcalPulseShapes,
-                                                       EcalPulseShapesRcd>;
-using EcalPulseCovariancesGPUESProducer = EcalESProducerGPU<EcalPulseCovariancesGPU,
-                                                            EcalPulseCovariances,
-                                                            EcalPulseCovariancesRcd>;
-using EcalSamplesCorrelationGPUESProducer = EcalESProducerGPU<
-    EcalSamplesCorrelationGPU,
-    EcalSamplesCorrelation,
-    EcalSamplesCorrelationRcd>;
+using EcalPedestalsGPUESProducer = EcalESProducerGPU<EcalPedestalsGPU, EcalPedestals, EcalPedestalsRcd>;
+using EcalGainRatiosGPUESProducer = EcalESProducerGPU<EcalGainRatiosGPU, EcalGainRatios, EcalGainRatiosRcd>;
+using EcalPulseShapesGPUESProducer = EcalESProducerGPU<EcalPulseShapesGPU, EcalPulseShapes, EcalPulseShapesRcd>;
+using EcalPulseCovariancesGPUESProducer =
+    EcalESProducerGPU<EcalPulseCovariancesGPU, EcalPulseCovariances, EcalPulseCovariancesRcd>;
+using EcalSamplesCorrelationGPUESProducer =
+    EcalESProducerGPU<EcalSamplesCorrelationGPU, EcalSamplesCorrelation, EcalSamplesCorrelationRcd>;
 
-using EcalTimeBiasCorrectionsGPUESProducer = EcalESProducerGPU<
-    EcalTimeBiasCorrectionsGPU,
-    EcalTimeBiasCorrections,
-    EcalTimeBiasCorrectionsRcd>;
+using EcalTimeBiasCorrectionsGPUESProducer =
+    EcalESProducerGPU<EcalTimeBiasCorrectionsGPU, EcalTimeBiasCorrections, EcalTimeBiasCorrectionsRcd>;
 
-using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU<
-    EcalTimeCalibConstantsGPU,
-    EcalTimeCalibConstants,
-    EcalTimeCalibConstantsRcd>;
+using EcalTimeCalibConstantsGPUESProducer =
+    EcalESProducerGPU<EcalTimeCalibConstantsGPU, EcalTimeCalibConstants, EcalTimeCalibConstantsRcd>;
 
 DEFINE_FWK_EVENTSETUP_MODULE(EcalPedestalsGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalGainRatiosGPUESProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
index 916230516f070..20f51ea5245df 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
@@ -3,7 +3,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
+#include "FWCore/Framework/interface/MakerMacros.h"
 
 // algorithm specific
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
@@ -14,103 +14,87 @@
 
 #include <iostream>
 
-class EcalUncalibRecHitConvertGPU2CPUFormat
-    : public edm::stream::EDProducer<>
-{
+class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
 public:
-    explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
-    ~EcalUncalibRecHitConvertGPU2CPUFormat() override;
-    static void fillDescriptions(edm::ConfigurationDescriptions&);
+  explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
+  ~EcalUncalibRecHitConvertGPU2CPUFormat() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
-    using GPURecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
-    void produce(edm::Event&, edm::EventSetup const&) override;
+  using GPURecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
+  void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-    const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEB_;
-    const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEE_;
+  const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEB_;
+  const edm::EDGetTokenT<ecal::SoAUncalibratedRecHitCollection> recHitsGPUEE_;
 
-    const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_;
+  const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_;
 };
 
-void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions(
-        edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
+void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
 
-    desc.add<edm::InputTag>("recHitsLabelGPUEB", 
-        edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"));
-    desc.add<edm::InputTag>("recHitsLabelGPUEE", 
-        edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"));
+  desc.add<edm::InputTag>("recHitsLabelGPUEB", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"));
+  desc.add<edm::InputTag>("recHitsLabelGPUEE", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"));
 
-    desc.add<std::string>("recHitsLabelCPUEB", "EcalUncalibRecHitsEB");
-    desc.add<std::string>("recHitsLabelCPUEE", "EcalUncalibRecHitsEE");
+  desc.add<std::string>("recHitsLabelCPUEB", "EcalUncalibRecHitsEB");
+  desc.add<std::string>("recHitsLabelCPUEE", "EcalUncalibRecHitsEE");
 
-    std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat";
-    confDesc.add(label, desc);
+  std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat";
+  confDesc.add(label, desc);
 }
 
-EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat(
-        const edm::ParameterSet& ps) 
+EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps)
     : recHitsGPUEB_{consumes<ecal::SoAUncalibratedRecHitCollection>(
-        ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))}
-    , recHitsGPUEE_{consumes<ecal::SoAUncalibratedRecHitCollection>(
-        ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))}
-    , recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")}
-    , recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")}
-{
-    produces<EBUncalibratedRecHitCollection>(recHitsLabelCPUEB_);
-    produces<EEUncalibratedRecHitCollection>(recHitsLabelCPUEE_);
+          ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))},
+      recHitsGPUEE_{
+          consumes<ecal::SoAUncalibratedRecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))},
+      recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")},
+      recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")} {
+  produces<EBUncalibratedRecHitCollection>(recHitsLabelCPUEB_);
+  produces<EEUncalibratedRecHitCollection>(recHitsLabelCPUEE_);
 }
 
 EcalUncalibRecHitConvertGPU2CPUFormat::~EcalUncalibRecHitConvertGPU2CPUFormat() {}
 
-void EcalUncalibRecHitConvertGPU2CPUFormat::produce(
-        edm::Event& event, 
-        edm::EventSetup const& setup) 
-{
-    edm::Handle<ecal::SoAUncalibratedRecHitCollection> hRecHitsGPUEB, hRecHitsGPUEE;
-    event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
-    event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
-
-    auto recHitsCPUEB = std::make_unique<EBUncalibratedRecHitCollection>();
-    auto recHitsCPUEE = std::make_unique<EEUncalibratedRecHitCollection>();
-    recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size());
-    recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size());
-
-    for (uint32_t i=0; i<hRecHitsGPUEB->amplitude.size(); ++i) {
-        recHitsCPUEB->emplace_back(
-            DetId{hRecHitsGPUEB->did[i]},
-            hRecHitsGPUEB->amplitude[i],
-            hRecHitsGPUEB->pedestal[i],
-            hRecHitsGPUEB->jitter[i],
-            hRecHitsGPUEB->chi2[i],
-            hRecHitsGPUEB->flags[i]
-        );
-        (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]);
-        auto const offset = i * EcalDataFrame::MAXSAMPLES;
-        for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample) 
-            (*recHitsCPUEB)[i].setOutOfTimeAmplitude(
-                sample, hRecHitsGPUEB->amplitudesAll[offset + sample]);
-    }
-
-    for (uint32_t i=0; i<hRecHitsGPUEE->amplitude.size(); ++i) {
-        recHitsCPUEE->emplace_back(
-            DetId{hRecHitsGPUEE->did[i]},
-            hRecHitsGPUEE->amplitude[i],
-            hRecHitsGPUEE->pedestal[i],
-            hRecHitsGPUEE->jitter[i],
-            hRecHitsGPUEE->chi2[i],
-            hRecHitsGPUEE->flags[i]
-        );
-        (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]);
-        auto const offset = i * EcalDataFrame::MAXSAMPLES;
-        for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample) 
-            (*recHitsCPUEE)[i].setOutOfTimeAmplitude(
-                sample, hRecHitsGPUEE->amplitudesAll[offset + sample]);
-    }
-
-    event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
-    event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
+void EcalUncalibRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
+  edm::Handle<ecal::SoAUncalibratedRecHitCollection> hRecHitsGPUEB, hRecHitsGPUEE;
+  event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
+  event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
+
+  auto recHitsCPUEB = std::make_unique<EBUncalibratedRecHitCollection>();
+  auto recHitsCPUEE = std::make_unique<EEUncalibratedRecHitCollection>();
+  recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size());  // one output rechit per input amplitude entry
+  recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size());
+
+  for (uint32_t i = 0; i < hRecHitsGPUEB->amplitude.size(); ++i) {  // EB: build rechit i from the i-th SoA entries
+    recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]},
+                               hRecHitsGPUEB->amplitude[i],
+                               hRecHitsGPUEB->pedestal[i],
+                               hRecHitsGPUEB->jitter[i],
+                               hRecHitsGPUEB->chi2[i],
+                               hRecHitsGPUEB->flags[i]);
+    (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]);
+    auto const offset = i * EcalDataFrame::MAXSAMPLES;  // amplitudesAll is read as MAXSAMPLES values per rechit
+    for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample)
+      (*recHitsCPUEB)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEB->amplitudesAll[offset + sample]);
+  }
+
+  for (uint32_t i = 0; i < hRecHitsGPUEE->amplitude.size(); ++i) {  // EE: same conversion as for EB above
+    recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]},
+                               hRecHitsGPUEE->amplitude[i],
+                               hRecHitsGPUEE->pedestal[i],
+                               hRecHitsGPUEE->jitter[i],
+                               hRecHitsGPUEE->chi2[i],
+                               hRecHitsGPUEE->flags[i]);
+    (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]);
+    auto const offset = i * EcalDataFrame::MAXSAMPLES;  // amplitudesAll is read as MAXSAMPLES values per rechit
+    for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample)
+      (*recHitsCPUEE)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEE->amplitudesAll[offset + sample]);
+  }
+
+  event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
+  event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
 }
 
 DEFINE_FWK_MODULE(EcalUncalibRecHitConvertGPU2CPUFormat);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
index a96b729223d01..d043d0f8e6e50 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
@@ -8,7 +8,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
+#include "FWCore/Framework/interface/MakerMacros.h"
 
 // algorithm specific
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
@@ -40,355 +40,323 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
 
-class EcalUncalibRecHitProducerGPU
-    : public edm::stream::EDProducer<edm::ExternalWork>
-{
+class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
-    explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps);
-    ~EcalUncalibRecHitProducerGPU() override;
-    static void fillDescriptions(edm::ConfigurationDescriptions&);
+  explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps);
+  ~EcalUncalibRecHitProducerGPU() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
 
 private:
-    using RecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
-    void acquire(edm::Event const&, 
-                 edm::EventSetup const&,
-                 edm::WaitingTaskWithArenaHolder) override;
-    void produce(edm::Event&, edm::EventSetup const&) override;
+  using RecHitType = ecal::UncalibratedRecHit<ecal::Tag::soa>;
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
 
 private:
-    edm::EDGetTokenT<cms::cuda::Product<ecal::DigisCollection>> digisTokenEB_, digisTokenEE_;
-    edm::EDPutTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>
-        recHitsTokenEB_, recHitsTokenEE_;
-    
-    // conditions handles
-    edm::ESHandle<EcalPedestalsGPU> pedestalsHandle_;
-    edm::ESHandle<EcalGainRatiosGPU> gainRatiosHandle_;
-    edm::ESHandle<EcalPulseShapesGPU> pulseShapesHandle_;
-    edm::ESHandle<EcalPulseCovariancesGPU> pulseCovariancesHandle_;
-    edm::ESHandle<EcalSamplesCorrelationGPU> samplesCorrelationHandle_;
-    edm::ESHandle<EcalTimeBiasCorrectionsGPU> timeBiasCorrectionsHandle_;
-    edm::ESHandle<EcalTimeCalibConstantsGPU> timeCalibConstantsHandle_;
-    edm::ESHandle<EcalSampleMask> sampleMaskHandle_;
-    edm::ESHandle<EcalTimeOffsetConstant> timeOffsetConstantHandle_;
-
-    // configuration parameters
-    ecal::multifit::ConfigurationParameters configParameters_;
-
-    // event data
-    ecal::multifit::EventOutputDataGPU eventOutputDataGPU_;
-    ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_;
-    bool shouldTransferToHost_{true};
-
-    cms::cuda::ContextState cudaState_;
-
-    uint32_t maxNumberHits_;
-    uint32_t neb_, nee_;
+  edm::EDGetTokenT<cms::cuda::Product<ecal::DigisCollection>> digisTokenEB_, digisTokenEE_;
+  edm::EDPutTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>> recHitsTokenEB_, recHitsTokenEE_;
+
+  // conditions handles
+  edm::ESHandle<EcalPedestalsGPU> pedestalsHandle_;
+  edm::ESHandle<EcalGainRatiosGPU> gainRatiosHandle_;
+  edm::ESHandle<EcalPulseShapesGPU> pulseShapesHandle_;
+  edm::ESHandle<EcalPulseCovariancesGPU> pulseCovariancesHandle_;
+  edm::ESHandle<EcalSamplesCorrelationGPU> samplesCorrelationHandle_;
+  edm::ESHandle<EcalTimeBiasCorrectionsGPU> timeBiasCorrectionsHandle_;
+  edm::ESHandle<EcalTimeCalibConstantsGPU> timeCalibConstantsHandle_;
+  edm::ESHandle<EcalSampleMask> sampleMaskHandle_;
+  edm::ESHandle<EcalTimeOffsetConstant> timeOffsetConstantHandle_;
+
+  // configuration parameters
+  ecal::multifit::ConfigurationParameters configParameters_;
+
+  // event data
+  ecal::multifit::EventOutputDataGPU eventOutputDataGPU_;
+  ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_;
+  bool shouldTransferToHost_{true};
+
+  cms::cuda::ContextState cudaState_;
+
+  uint32_t maxNumberHits_;
+  uint32_t neb_, nee_;
 };
 
-void EcalUncalibRecHitProducerGPU::fillDescriptions(
-        edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
-
-    desc.add<edm::InputTag>("digisLabelEB", 
-        edm::InputTag("ecalRawToDigiGPU", "ebDigisGPU"));
-    desc.add<edm::InputTag>("digisLabelEE", 
-        edm::InputTag("ecalRawToDigiGPU", "eeDigisGPU"));
-
-    desc.add<std::string>("recHitsLabelEB", "EcalUncalibRecHitsEB");
-    desc.add<std::string>("recHitsLabelEE", "EcalUncalibRecHitsEE");
-
-    desc.add<std::vector<double>>("EBtimeFitParameters", 
-        {-2.015452e+00, 3.130702e+00, -1.234730e+01, 4.188921e+01, -8.283944e+01, 
-         9.101147e+01, -5.035761e+01, 1.105621e+01});
-    desc.add<std::vector<double>>("EEtimeFitParameters", 
-        {-2.390548e+00, 3.553628e+00, -1.762341e+01, 6.767538e+01, -1.332130e+02, 
-         1.407432e+02, -7.541106e+01, 1.620277e+01});
-    desc.add<std::vector<double>>("EBamplitudeFitParameters", {1.138,1.652});
-    desc.add<std::vector<double>>("EEamplitudeFitParameters", {1.890,1.400});
-    desc.add<double>("EBtimeFitLimits_Lower", 0.2);
-    desc.add<double>("EBtimeFitLimits_Upper", 1.4);
-    desc.add<double>("EEtimeFitLimits_Lower", 0.2);
-    desc.add<double>("EEtimeFitLimits_Upper", 1.4);
-    desc.add<double>("EBtimeConstantTerm", .6);
-    desc.add<double>("EEtimeConstantTerm", 1.0);
-    desc.add<double>("EBtimeNconst", 28.5);
-    desc.add<double>("EEtimeNconst", 31.8);
-    desc.add<double>("outOfTimeThresholdGain12pEB", 5);
-    desc.add<double>("outOfTimeThresholdGain12mEB", 5);
-    desc.add<double>("outOfTimeThresholdGain61pEB", 5);
-    desc.add<double>("outOfTimeThresholdGain61mEB", 5);
-    desc.add<double>("outOfTimeThresholdGain12pEE", 1000);
-    desc.add<double>("outOfTimeThresholdGain12mEE", 1000);
-    desc.add<double>("outOfTimeThresholdGain61pEE", 1000);
-    desc.add<double>("outOfTimeThresholdGain61mEE", 1000);
-    desc.add<double>("amplitudeThresholdEB", 10);
-    desc.add<double>("amplitudeThresholdEE", 10);
-    desc.add<uint32_t>("maxNumberHits", 20000);   //---- AM TEST
-    desc.add<bool>("shouldTransferToHost", true);
-    desc.add<std::vector<uint32_t>>("kernelMinimizeThreads", {32, 1, 1});
-    // ---- default false or true? It was set to true, but at HLT it is false
-    desc.add<bool>("shouldRunTimingComputation", false);
-    std::string label = "ecalUncalibRecHitProducerGPU";
-    confDesc.add(label, desc);
+void EcalUncalibRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;  // collects every parameter together with its default value
+
+  desc.add<edm::InputTag>("digisLabelEB", edm::InputTag("ecalRawToDigiGPU", "ebDigisGPU"));
+  desc.add<edm::InputTag>("digisLabelEE", edm::InputTag("ecalRawToDigiGPU", "eeDigisGPU"));
+
+  desc.add<std::string>("recHitsLabelEB", "EcalUncalibRecHitsEB");
+  desc.add<std::string>("recHitsLabelEE", "EcalUncalibRecHitsEE");
+
+  desc.add<std::vector<double>>("EBtimeFitParameters",
+                                {-2.015452e+00,
+                                 3.130702e+00,
+                                 -1.234730e+01,
+                                 4.188921e+01,
+                                 -8.283944e+01,
+                                 9.101147e+01,
+                                 -5.035761e+01,
+                                 1.105621e+01});
+  desc.add<std::vector<double>>("EEtimeFitParameters",
+                                {-2.390548e+00,
+                                 3.553628e+00,
+                                 -1.762341e+01,
+                                 6.767538e+01,
+                                 -1.332130e+02,
+                                 1.407432e+02,
+                                 -7.541106e+01,
+                                 1.620277e+01});
+  desc.add<std::vector<double>>("EBamplitudeFitParameters", {1.138, 1.652});
+  desc.add<std::vector<double>>("EEamplitudeFitParameters", {1.890, 1.400});
+  desc.add<double>("EBtimeFitLimits_Lower", 0.2);
+  desc.add<double>("EBtimeFitLimits_Upper", 1.4);
+  desc.add<double>("EEtimeFitLimits_Lower", 0.2);
+  desc.add<double>("EEtimeFitLimits_Upper", 1.4);
+  desc.add<double>("EBtimeConstantTerm", .6);
+  desc.add<double>("EEtimeConstantTerm", 1.0);
+  desc.add<double>("EBtimeNconst", 28.5);
+  desc.add<double>("EEtimeNconst", 31.8);
+  desc.add<double>("outOfTimeThresholdGain12pEB", 5);
+  desc.add<double>("outOfTimeThresholdGain12mEB", 5);
+  desc.add<double>("outOfTimeThresholdGain61pEB", 5);
+  desc.add<double>("outOfTimeThresholdGain61mEB", 5);
+  desc.add<double>("outOfTimeThresholdGain12pEE", 1000);
+  desc.add<double>("outOfTimeThresholdGain12mEE", 1000);
+  desc.add<double>("outOfTimeThresholdGain61pEE", 1000);
+  desc.add<double>("outOfTimeThresholdGain61mEE", 1000);
+  desc.add<double>("amplitudeThresholdEB", 10);
+  desc.add<double>("amplitudeThresholdEE", 10);
+  desc.add<uint32_t>("maxNumberHits", 20000);  //---- AM TEST
+  desc.add<bool>("shouldTransferToHost", true);
+  desc.add<std::vector<uint32_t>>("kernelMinimizeThreads", {32, 1, 1});
+  // NOTE: the right default is an open question: it used to be true, but it is false at HLT
+  desc.add<bool>("shouldRunTimingComputation", false);
+  std::string label = "ecalUncalibRecHitProducerGPU";  // label under which the description is registered
+  confDesc.add(label, desc);
 }
 
-EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(
-        const edm::ParameterSet& ps) 
+EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(const edm::ParameterSet& ps)
     : digisTokenEB_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
-        ps.getParameter<edm::InputTag>("digisLabelEB"))}
-    , digisTokenEE_{consumes<cms::cuda::Product<ecal::DigisCollection>>(
-        ps.getParameter<edm::InputTag>("digisLabelEE"))}
-    , recHitsTokenEB_{produces<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
-        ps.getParameter<std::string>("recHitsLabelEB"))}
-    , recHitsTokenEE_{produces<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
-        ps.getParameter<std::string>("recHitsLabelEE"))}
-{
-    auto EBamplitudeFitParameters = ps.getParameter<std::vector<double>>(
-        "EBamplitudeFitParameters");
-    auto EEamplitudeFitParameters = ps.getParameter<std::vector<double>>(
-        "EEamplitudeFitParameters");
-    auto EBtimeFitParameters = ps.getParameter<std::vector<double>>(
-        "EBtimeFitParameters");
-    auto EEtimeFitParameters = ps.getParameter<std::vector<double>>(
-        "EEtimeFitParameters");
-    std::pair<double, double> EBtimeFitLimits, EEtimeFitLimits;
-    EBtimeFitLimits.first  = ps.getParameter<double>("EBtimeFitLimits_Lower");
-    EBtimeFitLimits.second = ps.getParameter<double>("EBtimeFitLimits_Upper");
-    EEtimeFitLimits.first  = ps.getParameter<double>("EEtimeFitLimits_Lower");
-    EEtimeFitLimits.second = ps.getParameter<double>("EEtimeFitLimits_Upper");
-
-    auto EBtimeConstantTerm = ps.getParameter<double>("EBtimeConstantTerm");
-    auto EEtimeConstantTerm = ps.getParameter<double>("EEtimeConstantTerm");
-    auto EBtimeNconst = ps.getParameter<double>("EBtimeNconst");
-    auto EEtimeNconst = ps.getParameter<double>("EEtimeNconst");
-
-    auto outOfTimeThreshG12pEB = ps.getParameter<double>(
-        "outOfTimeThresholdGain12pEB");
-    auto outOfTimeThreshG12mEB = ps.getParameter<double>(
-        "outOfTimeThresholdGain12mEB");
-    auto outOfTimeThreshG61pEB = ps.getParameter<double>(
-        "outOfTimeThresholdGain61pEB");
-    auto outOfTimeThreshG61mEB = ps.getParameter<double>(
-        "outOfTimeThresholdGain61mEB");
-    auto outOfTimeThreshG12pEE = ps.getParameter<double>(
-        "outOfTimeThresholdGain12pEE");
-    auto outOfTimeThreshG12mEE = ps.getParameter<double>(
-        "outOfTimeThresholdGain12mEE");
-    auto outOfTimeThreshG61pEE = ps.getParameter<double>(
-        "outOfTimeThresholdGain61pEE");
-    auto outOfTimeThreshG61mEE = ps.getParameter<double>(
-        "outOfTimeThresholdGain61mEE");
-    auto amplitudeThreshEB = ps.getParameter<double>("amplitudeThresholdEB");
-    auto amplitudeThreshEE = ps.getParameter<double>("amplitudeThresholdEE");
-
-    // max number of digis to allocate for
-    maxNumberHits_ = ps.getParameter<uint32_t>("maxNumberHits");
-
-    // transfer to host switch
-    shouldTransferToHost_ = ps.getParameter<bool>("shouldTransferToHost");
-
-    // switch to run timing computation kernels
-    configParameters_.shouldRunTimingComputation = 
-        ps.getParameter<bool>("shouldRunTimingComputation");
-
-    // minimize kernel launch conf
-    auto threadsMinimize = ps.getParameter<std::vector<uint32_t>>("kernelMinimizeThreads");
-    configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0];
-    configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
-    configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
-
-    //
-    // configuration and physics parameters: done once
-    // assume there is a single device
-    // use sync copying
-    //
-
-    // amplitude fit parameters copying
-    cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB,
-        sizeof(ecal::multifit::ConfigurationParameters::type) 
-        * EBamplitudeFitParameters.size()) );
-    cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEB,
-        EBamplitudeFitParameters.data(),
-        EBamplitudeFitParameters.size() * 
-        sizeof(ecal::multifit::ConfigurationParameters::type),
-        cudaMemcpyHostToDevice) );
-    cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE,
-        sizeof(ecal::multifit::ConfigurationParameters::type) * 
-        EEamplitudeFitParameters.size()) );
-    cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEE,
-        EEamplitudeFitParameters.data(),
-        EEamplitudeFitParameters.size() * 
-        sizeof(ecal::multifit::ConfigurationParameters::type),
-        cudaMemcpyHostToDevice) );
-
-    // time fit parameters and limits
-    configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size();
-    configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size();
-    configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first;
-    configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second;
-    configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first;
-    configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second;
-    cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEB,
-        sizeof(ecal::multifit::ConfigurationParameters::type) 
-        * EBtimeFitParameters.size()) );
-    cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEB,
-        EBtimeFitParameters.data(),
-        EBtimeFitParameters.size() * 
-        sizeof(ecal::multifit::ConfigurationParameters::type),
-        cudaMemcpyHostToDevice) );
-    cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEE,
-        sizeof(ecal::multifit::ConfigurationParameters::type) 
-        * EEtimeFitParameters.size()) );
-    cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEE,
-        EEtimeFitParameters.data(),
-        EEtimeFitParameters.size() 
-        * sizeof(ecal::multifit::ConfigurationParameters::type),
-        cudaMemcpyHostToDevice) );
-
-    // time constant terms
-    configParameters_.timeConstantTermEB = EBtimeConstantTerm;
-    configParameters_.timeConstantTermEE = EEtimeConstantTerm;
-
-    // time N const 
-    configParameters_.timeNconstEB = EBtimeNconst;
-    configParameters_.timeNconstEE = EEtimeNconst;
-
-    // amplitude threshold for time flags
-    configParameters_.amplitudeThreshEB = amplitudeThreshEB;
-    configParameters_.amplitudeThreshEE = amplitudeThreshEE;
-
-    // out of time thresholds gain-dependent
-    configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB;
-    configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE;
-    configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB;
-    configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE;
-    configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB;
-    configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE;
-    configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB;
-    configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE;
-
-    // allocate event output data
-    eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_);
-
-    // allocate scratch data for gpu
-    eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_);
+          ps.getParameter<edm::InputTag>("digisLabelEB"))},
+      digisTokenEE_{
+          consumes<cms::cuda::Product<ecal::DigisCollection>>(ps.getParameter<edm::InputTag>("digisLabelEE"))},
+      recHitsTokenEB_{produces<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+          ps.getParameter<std::string>("recHitsLabelEB"))},
+      recHitsTokenEE_{produces<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+          ps.getParameter<std::string>("recHitsLabelEE"))} {
+  auto EBamplitudeFitParameters = ps.getParameter<std::vector<double>>("EBamplitudeFitParameters");
+  auto EEamplitudeFitParameters = ps.getParameter<std::vector<double>>("EEamplitudeFitParameters");
+  auto EBtimeFitParameters = ps.getParameter<std::vector<double>>("EBtimeFitParameters");
+  auto EEtimeFitParameters = ps.getParameter<std::vector<double>>("EEtimeFitParameters");
+  std::pair<double, double> EBtimeFitLimits, EEtimeFitLimits;
+  EBtimeFitLimits.first = ps.getParameter<double>("EBtimeFitLimits_Lower");
+  EBtimeFitLimits.second = ps.getParameter<double>("EBtimeFitLimits_Upper");
+  EEtimeFitLimits.first = ps.getParameter<double>("EEtimeFitLimits_Lower");
+  EEtimeFitLimits.second = ps.getParameter<double>("EEtimeFitLimits_Upper");
+
+  auto EBtimeConstantTerm = ps.getParameter<double>("EBtimeConstantTerm");
+  auto EEtimeConstantTerm = ps.getParameter<double>("EEtimeConstantTerm");
+  auto EBtimeNconst = ps.getParameter<double>("EBtimeNconst");
+  auto EEtimeNconst = ps.getParameter<double>("EEtimeNconst");
+
+  auto outOfTimeThreshG12pEB = ps.getParameter<double>("outOfTimeThresholdGain12pEB");
+  auto outOfTimeThreshG12mEB = ps.getParameter<double>("outOfTimeThresholdGain12mEB");
+  auto outOfTimeThreshG61pEB = ps.getParameter<double>("outOfTimeThresholdGain61pEB");
+  auto outOfTimeThreshG61mEB = ps.getParameter<double>("outOfTimeThresholdGain61mEB");
+  auto outOfTimeThreshG12pEE = ps.getParameter<double>("outOfTimeThresholdGain12pEE");
+  auto outOfTimeThreshG12mEE = ps.getParameter<double>("outOfTimeThresholdGain12mEE");
+  auto outOfTimeThreshG61pEE = ps.getParameter<double>("outOfTimeThresholdGain61pEE");
+  auto outOfTimeThreshG61mEE = ps.getParameter<double>("outOfTimeThresholdGain61mEE");
+  auto amplitudeThreshEB = ps.getParameter<double>("amplitudeThresholdEB");
+  auto amplitudeThreshEE = ps.getParameter<double>("amplitudeThresholdEE");
+
+  // max number of digis to allocate for
+  maxNumberHits_ = ps.getParameter<uint32_t>("maxNumberHits");
+
+  // transfer to host switch
+  shouldTransferToHost_ = ps.getParameter<bool>("shouldTransferToHost");
+
+  // switch to run timing computation kernels
+  configParameters_.shouldRunTimingComputation = ps.getParameter<bool>("shouldRunTimingComputation");
+
+  // minimize kernel launch conf
+  auto threadsMinimize = ps.getParameter<std::vector<uint32_t>>("kernelMinimizeThreads");
+  configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0];
+  configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1];
+  configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2];
+
+  //
+  // configuration and physics parameters: done once
+  // assume there is a single device
+  // use sync copying
+  //
+
+  // amplitude fit parameters copying
+  cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB,
+                       sizeof(ecal::multifit::ConfigurationParameters::type) * EBamplitudeFitParameters.size()));
+  cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEB,
+                       EBamplitudeFitParameters.data(),
+                       EBamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
+                       cudaMemcpyHostToDevice));
+  cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE,
+                       sizeof(ecal::multifit::ConfigurationParameters::type) * EEamplitudeFitParameters.size()));
+  cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEE,
+                       EEamplitudeFitParameters.data(),
+                       EEamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
+                       cudaMemcpyHostToDevice));
+
+  // time fit parameters and limits
+  configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size();
+  configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size();
+  configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first;
+  configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second;
+  configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first;
+  configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second;
+  cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEB,
+                       sizeof(ecal::multifit::ConfigurationParameters::type) * EBtimeFitParameters.size()));
+  cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEB,
+                       EBtimeFitParameters.data(),
+                       EBtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
+                       cudaMemcpyHostToDevice));
+  cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEE,
+                       sizeof(ecal::multifit::ConfigurationParameters::type) * EEtimeFitParameters.size()));
+  cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEE,
+                       EEtimeFitParameters.data(),
+                       EEtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type),
+                       cudaMemcpyHostToDevice));
+
+  // time constant terms
+  configParameters_.timeConstantTermEB = EBtimeConstantTerm;
+  configParameters_.timeConstantTermEE = EEtimeConstantTerm;
+
+  // time N const
+  configParameters_.timeNconstEB = EBtimeNconst;
+  configParameters_.timeNconstEE = EEtimeNconst;
+
+  // amplitude threshold for time flags
+  configParameters_.amplitudeThreshEB = amplitudeThreshEB;
+  configParameters_.amplitudeThreshEE = amplitudeThreshEE;
+
+  // out of time thresholds gain-dependent
+  configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB;
+  configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE;
+  configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB;
+  configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE;
+  configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB;
+  configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE;
+  configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB;
+  configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE;
+
+  // allocate event output data
+  eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_);
+
+  // allocate scratch data for gpu
+  eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_);
 }
 
 EcalUncalibRecHitProducerGPU::~EcalUncalibRecHitProducerGPU() {
-    //
-    // assume single device for now
-    //
-
-    if (configParameters_.amplitudeFitParametersEB) {
-        // configuration parameters
-        cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEB) );
-        cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEE) );
-        cudaCheck( cudaFree(configParameters_.timeFitParametersEB) );
-        cudaCheck( cudaFree(configParameters_.timeFitParametersEE) );
-
-        // free event ouput data 
-        eventOutputDataGPU_.deallocate(configParameters_);
-
-        // free event scratch data
-        eventDataForScratchGPU_.deallocate(configParameters_);
-    }
+  //
+  // assume single device for now
+  //
+
+  if (configParameters_.amplitudeFitParametersEB) {
+    // configuration parameters
+    cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEB));
+    cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEE));
+    cudaCheck(cudaFree(configParameters_.timeFitParametersEB));
+    cudaCheck(cudaFree(configParameters_.timeFitParametersEE));
+
+    // free event output data
+    eventOutputDataGPU_.deallocate(configParameters_);
+
+    // free event scratch data
+    eventDataForScratchGPU_.deallocate(configParameters_);
+  }
 }
 
-void EcalUncalibRecHitProducerGPU::acquire(
-        edm::Event const& event,
-        edm::EventSetup const& setup,
-        edm::WaitingTaskWithArenaHolder holder) 
-{
-    // cuda products
-    auto const& ebDigisProduct = event.get(digisTokenEB_);
-    auto const& eeDigisProduct = event.get(digisTokenEE_);
-    
-    // raii
-    cms::cuda::ScopedContextAcquire ctx{ebDigisProduct, std::move(holder), cudaState_};
-
-    // get actual obj
-    auto const& ebDigis = ctx.get(ebDigisProduct);
-    auto const& eeDigis = ctx.get(eeDigisProduct);
-    ecal::multifit::EventInputDataGPU inputDataGPU{ebDigis, eeDigis};
-    neb_ = ebDigis.ndigis;
-    nee_ = eeDigis.ndigis;
-
-    // conditions
-    setup.get<EcalPedestalsRcd>().get(pedestalsHandle_);
-    setup.get<EcalGainRatiosRcd>().get(gainRatiosHandle_);
-    setup.get<EcalPulseShapesRcd>().get(pulseShapesHandle_);
-    setup.get<EcalPulseCovariancesRcd>().get(pulseCovariancesHandle_);
-    setup.get<EcalSamplesCorrelationRcd>().get(samplesCorrelationHandle_);
-    setup.get<EcalTimeBiasCorrectionsRcd>().get(timeBiasCorrectionsHandle_);
-    setup.get<EcalTimeCalibConstantsRcd>().get(timeCalibConstantsHandle_);
-    setup.get<EcalSampleMaskRcd>().get(sampleMaskHandle_);
-    setup.get<EcalTimeOffsetConstantRcd>().get(timeOffsetConstantHandle_);
-
-    auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream());
-    auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream());
-    auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream());
-    auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream());
-    auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream());
-    auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream());
-    auto const& timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream());
-
-    // bundle up conditions
-    ecal::multifit::ConditionsProducts conditions {
-        pedProduct, gainsProduct, pulseShapesProduct,
-        pulseCovariancesProduct, 
-        samplesCorrelationProduct,
-        timeBiasCorrectionsProduct,
-        timeCalibConstantsProduct,
-        *sampleMaskHandle_,
-        *timeOffsetConstantHandle_,
-        timeCalibConstantsHandle_->getOffset()
-    };
-    
-    //
-    // schedule algorithms
-    //
-    ecal::multifit::entryPoint(
-        inputDataGPU,
-        eventOutputDataGPU_,
-        eventDataForScratchGPU_,
-        conditions,
-        configParameters_,
-        ctx.stream()
-    );
+void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event,
+                                           edm::EventSetup const& setup,
+                                           edm::WaitingTaskWithArenaHolder holder) {
+  // cuda products
+  auto const& ebDigisProduct = event.get(digisTokenEB_);
+  auto const& eeDigisProduct = event.get(digisTokenEE_);
+
+  // raii
+  cms::cuda::ScopedContextAcquire ctx{ebDigisProduct, std::move(holder), cudaState_};
+
+  // get actual obj
+  auto const& ebDigis = ctx.get(ebDigisProduct);
+  auto const& eeDigis = ctx.get(eeDigisProduct);
+  ecal::multifit::EventInputDataGPU inputDataGPU{ebDigis, eeDigis};
+  neb_ = ebDigis.ndigis;
+  nee_ = eeDigis.ndigis;
+
+  // conditions
+  setup.get<EcalPedestalsRcd>().get(pedestalsHandle_);
+  setup.get<EcalGainRatiosRcd>().get(gainRatiosHandle_);
+  setup.get<EcalPulseShapesRcd>().get(pulseShapesHandle_);
+  setup.get<EcalPulseCovariancesRcd>().get(pulseCovariancesHandle_);
+  setup.get<EcalSamplesCorrelationRcd>().get(samplesCorrelationHandle_);
+  setup.get<EcalTimeBiasCorrectionsRcd>().get(timeBiasCorrectionsHandle_);
+  setup.get<EcalTimeCalibConstantsRcd>().get(timeCalibConstantsHandle_);
+  setup.get<EcalSampleMaskRcd>().get(sampleMaskHandle_);
+  setup.get<EcalTimeOffsetConstantRcd>().get(timeOffsetConstantHandle_);
+
+  auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream());
+  auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream());
+  auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream());
+  auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream());
+  auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream());
+  auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream());
+  auto const& timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream());
+
+  // bundle up conditions
+  ecal::multifit::ConditionsProducts conditions{pedProduct,
+                                                gainsProduct,
+                                                pulseShapesProduct,
+                                                pulseCovariancesProduct,
+                                                samplesCorrelationProduct,
+                                                timeBiasCorrectionsProduct,
+                                                timeCalibConstantsProduct,
+                                                *sampleMaskHandle_,
+                                                *timeOffsetConstantHandle_,
+                                                timeCalibConstantsHandle_->getOffset()};
+
+  //
+  // schedule algorithms
+  //
+  ecal::multifit::entryPoint(
+      inputDataGPU, eventOutputDataGPU_, eventDataForScratchGPU_, conditions, configParameters_, ctx.stream());
 }
 
-void EcalUncalibRecHitProducerGPU::produce(
-        edm::Event& event, 
-        edm::EventSetup const& setup) 
-{
-    //DurationMeasurer<std::chrono::milliseconds> timer{std::string{"produce duration"}};
-    cms::cuda::ScopedContextProduce ctx{cudaState_};
-
-    // copy construct output collections
-    // note, output collections do not own device memory!
-    ecal::UncalibratedRecHit<ecal::Tag::ptr> 
-        ebRecHits{eventOutputDataGPU_},
-        eeRecHits{eventOutputDataGPU_};
-
-    // set the size of eb and ee
-    ebRecHits.size = neb_;
-    eeRecHits.size = nee_;
-
-    // shift ptrs for ee
-    eeRecHits.amplitudesAll += neb_ * EcalDataFrame::MAXSAMPLES;
-    eeRecHits.amplitude += neb_;
-    eeRecHits.chi2 += neb_;
-    eeRecHits.pedestal += neb_;
-    eeRecHits.did += neb_;
-    eeRecHits.flags += neb_;
-    if (configParameters_.shouldRunTimingComputation) {
-        eeRecHits.jitter += neb_;
-        eeRecHits.jitterError += neb_;
-    }
-
-    // put into the event
-    ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits));
-    ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits));
+void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+  //DurationMeasurer<std::chrono::milliseconds> timer{std::string{"produce duration"}};
+  cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+  // copy construct output collections
+  // note, output collections do not own device memory!
+  ecal::UncalibratedRecHit<ecal::Tag::ptr> ebRecHits{eventOutputDataGPU_}, eeRecHits{eventOutputDataGPU_};
+
+  // set the size of eb and ee
+  ebRecHits.size = neb_;
+  eeRecHits.size = nee_;
+
+  // shift ptrs for ee
+  eeRecHits.amplitudesAll += neb_ * EcalDataFrame::MAXSAMPLES;
+  eeRecHits.amplitude += neb_;
+  eeRecHits.chi2 += neb_;
+  eeRecHits.pedestal += neb_;
+  eeRecHits.did += neb_;
+  eeRecHits.flags += neb_;
+  if (configParameters_.shouldRunTimingComputation) {
+    eeRecHits.jitter += neb_;
+    eeRecHits.jitterError += neb_;
+  }
+
+  // put into the event
+  ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits));
+  ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits));
 }
 
 DEFINE_FWK_MODULE(EcalUncalibRecHitProducerGPU);

From d5f111724fab3e72bc8be43b205690ac1a85d84b Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Mon, 6 Apr 2020 08:56:45 +0200
Subject: [PATCH 09/30] first commit

---
 .../EcalRecHitSoA/interface/EcalRecHit_soa.h  |  52 ++
 .../interface/EcalUncalibratedRecHit_soa.h    |  16 +
 CUDADataFormats/EcalRecHitSoA/src/classes.h   |   1 +
 .../EcalRecHitSoA/src/classes_def.xml         |   2 +
 .../EcalRecAlgos/interface/DeclsForKernels.h  | 133 ++++
 .../interface/EcalADCToGeVConstantGPU.h       |  43 +
 .../interface/EcalChannelStatusGPU.h          |  43 +
 .../interface/EcalIntercalibConstantsGPU.h    |  44 ++
 .../interface/EcalLaserAPDPNRatiosGPU.h       |  58 ++
 .../interface/EcalLaserAPDPNRatiosRefGPU.h    |  44 ++
 .../interface/EcalLaserAlphasGPU.h            |  44 ++
 .../interface/EcalLinearCorrectionsGPU.h      |  57 ++
 .../src/AmplitudeComputationCommonKernels.cu  |   4 +-
 .../src/EcalADCToGeVConstantGPU.cc            |  37 +
 .../EcalRecAlgos/src/EcalChannelStatusGPU.cc  |  47 ++
 .../src/EcalIntercalibConstantsGPU.cc         |  44 ++
 .../src/EcalLaserAPDPNRatiosGPU.cc            | 109 +++
 .../src/EcalLaserAPDPNRatiosRefGPU.cc         |  44 ++
 .../EcalRecAlgos/src/EcalLaserAlphasGPU.cc    |  44 ++
 .../src/EcalLinearCorrectionsGPU.cc           | 102 +++
 .../src/EcalRecHitBuilderKernels.cu           | 734 ++++++++++++++++++
 .../src/EcalRecHitBuilderKernels.h            |  97 +++
 .../EcalRecAlgos/src/KernelHelpers.cu         | 308 +++++++-
 .../EcalRecAlgos/src/KernelHelpers.h          |  21 +-
 .../src/TimeComputationKernels.cu             |   4 +-
 .../plugins/EcalCPURecHitProducer.cc          | 190 +++++
 .../plugins/EcalRecHitConvertGPU2CPUFormat.cc | 137 ++++
 .../python/ecalRecHitGPU_cfi.py               | 132 ++++
 .../test/sourceFromRawCmggpu_cff.py           | 151 ++++
 .../test/testEcalUncalibRechitProducer_cfg.py | 231 ++++++
 30 files changed, 2960 insertions(+), 13 deletions(-)
 create mode 100644 CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
 create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
 create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
 create mode 100644 RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
 create mode 100644 RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py
 create mode 100644 RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py

diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
new file mode 100644
index 0000000000000..20d342d1b7073
--- /dev/null
+++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
@@ -0,0 +1,52 @@
+#ifndef CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_soa_h
+#define CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_soa_h
+
+#include <vector>
+#include <array>
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+
+// needed for "soa" definition
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+
+namespace ecal {
+  
+  template<typename L = Tag::soa>
+  struct RecHit  : public Detail::Base<L> {
+    
+    RecHit() = default;
+    RecHit(const RecHit&) = default;
+    RecHit& operator=(const RecHit&) = default;
+    
+    RecHit(RecHit&&) = default;
+    RecHit& operator=(RecHit&&) = default;
+    
+    typename type_wrapper<reco::StorageScalarType, L>::type energy;
+    typename type_wrapper<reco::StorageScalarType, L>::type time;
+    typename type_wrapper<reco::StorageScalarType, L>::type chi2; // should we remove this, since already included in "extra" ?
+    typename type_wrapper<uint32_t, L>::type extra;    // packed uint32_t for timeError, chi2, energyError
+    typename type_wrapper<uint32_t, L>::type flagBits; // store rechit condition (see Flags enum) in a bit-wise way
+    
+    typename type_wrapper<uint32_t, L>::type did;
+    
+    
+    template<typename U = L>
+    typename std::enable_if<std::is_same<U, Tag::soa>::value, void>::type 
+    resize(size_t size) {
+      energy.resize(size);
+      time.resize(size);
+      chi2.resize(size);
+      extra.resize(size);
+      flagBits.resize(size);
+      did.resize(size);
+    }
+  };
+  
+  using SoARecHitCollection = RecHit<Tag::soa>;
+  
+}
+
+#endif 
+// CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_soa_h
+
diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h
index e11c13ebdf4c2..fe11fc64dae8f 100644
--- a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h
+++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h
@@ -18,6 +18,22 @@ namespace ecal {
 
   }  // namespace Tag
 
+  
+  namespace Detail {
+    
+    // empty base 
+    template<typename T>
+    struct Base {};
+    
+    // add number of values for ptr case
+    template<>
+    struct Base<::ecal::Tag::ptr> {
+      uint32_t size;
+    };
+    
+  }
+  
+  
   template <typename T, typename L = Tag::soa>
   struct type_wrapper {
     //#ifndef ECAL_MULTIFIT_DONOT_USE_PINNED_MEM
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes.h b/CUDADataFormats/EcalRecHitSoA/src/classes.h
index 8ad6b8d684b9a..5c47ccc6c10e9 100644
--- a/CUDADataFormats/EcalRecHitSoA/src/classes.h
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes.h
@@ -1,2 +1,3 @@
 #include "DataFormats/Common/interface/Wrapper.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
\ No newline at end of file
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
index 461460835a723..7217782abac05 100644
--- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
@@ -2,4 +2,6 @@
     <class name="ecal::Tag::soa"/>
     <class name="ecal::UncalibratedRecHit<ecal::Tag::soa>"/>
     <class name="edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>>"/>
+    <class name="ecal::RecHit<ecal::Tag::soa>"/>
+    <class name="edm::Wrapper<ecal::RecHit<ecal::Tag::soa> >"/>          
 </lcgdict>
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
index b997906006a22..1a117a63288ef 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
@@ -252,4 +252,137 @@ namespace ecal {
   }  // namespace multifit
 }  // namespace ecal
 
+
+
+
+
+
+
+
+
+// 
+// ECAL Rechit producer
+// 
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h"
+
+
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+
+
+
+
+namespace ecal { 
+  namespace rechit {
+    
+    // parameters that are read in the configuration file for rechit producer
+    struct ConfigurationParameters {
+      // device ptrs
+      int *ChannelStatusToBeExcluded=nullptr; 
+      uint32_t ChannelStatusToBeExcludedSize;
+      
+      bool killDeadChannels;
+      
+      bool recoverEBIsolatedChannels ;
+      bool recoverEEIsolatedChannels ;
+      bool recoverEBVFE              ;
+      bool recoverEEVFE              ;
+      bool recoverEBFE               ;
+      bool recoverEEFE               ;
+      
+      float EBLaserMIN;
+      float EELaserMIN;
+      float EBLaserMAX;
+      float EELaserMAX;
+      
+      //       std::vector<std::vector<uint32_t> > v_DB_reco_flags;
+      int* expanded_v_DB_reco_flags;
+      uint32_t* expanded_Sizes_v_DB_reco_flags;
+      uint32_t* expanded_flagbit_v_DB_reco_flags;
+      uint32_t expanded_v_DB_reco_flagsSize;
+      
+      uint32_t flagmask;
+      
+      
+      //       
+      //       bool shouldRunTimingComputation;
+    };
+    
+    
+    
+    
+    
+    
+    struct EventOutputDataGPU final : public ::ecal::RecHit<::ecal::Tag::ptr> {
+      
+      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        //      void allocate(uint32_t size) {
+        //---- configParameters -> only needed to decide whether to save the timing information (not used in this body)
+        
+        cudaCheck( cudaMalloc((void**)&energy,
+                              size * sizeof(::ecal::reco::StorageScalarType)) );
+        cudaCheck( cudaMalloc((void**)&time,
+                              size * sizeof(::ecal::reco::StorageScalarType)) );
+        cudaCheck( cudaMalloc((void**)&chi2,
+                              size * sizeof(::ecal::reco::StorageScalarType)) );
+        cudaCheck( cudaMalloc((void**)&flagBits,
+                              size * sizeof(uint32_t)) );
+        cudaCheck( cudaMalloc((void**)&extra,
+                              size * sizeof(uint32_t)) );      
+        cudaCheck( cudaMalloc((void**)&did,
+                              size * sizeof(uint32_t)) );
+      }
+      
+      
+      void deallocate(ConfigurationParameters const& configParameters) {
+        //     void deallocate() {
+        //---- configParameters -> only needed to decide whether to save the timing information (not used in this body)
+        
+        cudaCheck( cudaFree(energy) );
+        cudaCheck( cudaFree(time) );
+        cudaCheck( cudaFree(chi2) );
+        cudaCheck( cudaFree(flagBits) );
+        cudaCheck( cudaFree(extra) );
+        cudaCheck( cudaFree(did) );
+      }
+    };
+    
+    
+    
+    struct EventInputDataGPU {
+      ecal::UncalibratedRecHit<ecal::Tag::ptr> const& ebUncalibRecHits;
+      ecal::UncalibratedRecHit<ecal::Tag::ptr> const& eeUncalibRecHits;
+    };
+    
+    // const refs products to conditions
+    struct ConditionsProducts {
+      EcalADCToGeVConstantGPU::Product    const& ADCToGeV;
+      EcalIntercalibConstantsGPU::Product const& Intercalib;
+      EcalChannelStatusGPU::Product       const& ChannelStatus;
+      //     
+      EcalLaserAPDPNRatiosGPU::Product     const& LaserAPDPNRatios   ;
+      EcalLaserAPDPNRatiosRefGPU::Product  const& LaserAPDPNRatiosRef;
+      EcalLaserAlphasGPU::Product          const& LaserAlphas        ;
+      EcalLinearCorrectionsGPU::Product    const& LinearCorrections  ;
+      //     
+      //     
+      uint32_t offsetForHashes;    
+    };
+    
+    
+    
+  }
+}
+
+
+
+
+
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h
new file mode 100644
index 0000000000000..4f6cb43eddee0
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+class EcalADCToGeVConstantGPU {
+public:
+  struct Product {
+    ~Product();
+    float *adc2gev = nullptr;
+  };
+  
+  #ifndef __CUDACC__
+  
+  // 
+  EcalADCToGeVConstantGPU(EcalADCToGeVConstant const&);
+  
+  // will call deallocation for Product through ~Product
+  ~EcalADCToGeVConstantGPU() = default;
+  
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+  
+  // 
+  static std::string name() { return std::string{"ecalADCToGeVConstantGPU"}; }
+  
+private:
+  // in the future, we should arrange things so as to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, CUDAHostAllocator<float>> adc2gev_;
+  
+  cms::cuda::ESProduct<Product> product_;
+  
+  #endif
+};
+
+
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h
new file mode 100644
index 0000000000000..0932e7f0641d9
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+class EcalChannelStatusGPU {
+public:
+  struct Product {
+    ~Product();
+    uint16_t *status = nullptr;
+  };
+  
+  #ifndef __CUDACC__
+  
+  // 
+  EcalChannelStatusGPU(EcalChannelStatus const&);
+  
+  // will call deallocation for Product through ~Product
+  ~EcalChannelStatusGPU() = default;
+  
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+  
+  // 
+  static std::string name() { return std::string{"ecalChannelStatusGPU"}; }
+  
+private:
+  // in the future, we should arrange things so as to avoid this copy on the host
+  // store eb first then ee
+  std::vector<uint16_t, CUDAHostAllocator<uint16_t>> status_;
+  
+  cms::cuda::ESProduct<Product> product_;
+  
+  #endif
+};
+
+
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
new file mode 100644
index 0000000000000..ae36aa78c9e45
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
@@ -0,0 +1,44 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalIntercalibConstantsGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalIntercalibConstantsGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalIntercalibConstants.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+// Refers to the ECAL intercalibration constants on the host and hands out their device copy (Product).
+class EcalIntercalibConstantsGPU {
+public:
+  struct Product {
+    ~Product();
+    float *values = nullptr;  // device buffer: barrel values first, endcap values after them
+  };
+  
+  #ifndef __CUDACC__
+  // binds valuesEB_ / valuesEE_ to the barrel and endcap items of the condition object
+  EcalIntercalibConstantsGPU(EcalIntercalibConstants const&);
+  
+  // will call deallocation for Product through ~Product
+  ~EcalIntercalibConstantsGPU() = default;
+  
+  // get device pointers; the allocation and the async uploads are issued with the given stream
+  Product const& getProduct(cudaStream_t) const;
+  
+  // TODO: do this centrally
+  // get offset for hashes; equals the number of barrel items
+  uint32_t getOffset() const { return valuesEB_.size(); }
+  
+  // constant label for this type (where it is consumed is not visible from this header)
+  static std::string name() { return std::string{"ecalIntercalibConstantsGPU"}; }
+  
+private:
+  std::vector<float> const& valuesEB_;  // NOTE(review): a reference, not a copy - the source must outlive this object; confirm
+  std::vector<float> const& valuesEE_;  // NOTE(review): same lifetime requirement as valuesEB_
+  
+  cms::cuda::ESProduct<Product> product_;
+  #endif
+};
+
+
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
new file mode 100644
index 0000000000000..53c8ea6ba67b7
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
@@ -0,0 +1,58 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatios.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+// Keeps the laser APD/PN ratios and their time stamps in host vectors and hands out their device copy (Product).
+class EcalLaserAPDPNRatiosGPU {
+public:
+  struct Product {
+    ~Product();
+    float *p1=nullptr;  // device copies of p1_, p2_, p3_
+    float *p2=nullptr;
+    float *p3=nullptr;
+    edm::TimeValue_t *t1=nullptr;  // device copies of t1_, t2_, t3_
+    edm::TimeValue_t *t2=nullptr;
+    edm::TimeValue_t *t3=nullptr;
+  };
+  
+  #ifndef __CUDACC__
+  
+  // copies p1/p2/p3 from the laser map and t1/t2/t3 from the time map into the host vectors
+  EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const&);
+  
+  // will call deallocation for Product through ~Product
+  ~EcalLaserAPDPNRatiosGPU() = default;
+  
+  // get device pointers; the allocations and the async uploads are issued with the given stream
+  Product const& getProduct(cudaStream_t) const;
+  
+  // constant label for this type (where it is consumed is not visible from this header)
+  static std::string name() { return std::string{"ecalLaserAPDPNRatiosGPU"}; }
+  
+private:
+  // in the future, we need to arrange things so as to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, CUDAHostAllocator<float> > p1_;
+  std::vector<float, CUDAHostAllocator<float> > p2_;
+  std::vector<float, CUDAHostAllocator<float> > p3_;
+  
+  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t1_;
+  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t2_;
+  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t3_;
+  
+  cms::cuda::ESProduct<Product> product_;
+  
+  #endif
+};
+
+
+#endif
+
+
+
+
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
new file mode 100644
index 0000000000000..191c78a7c4617
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
@@ -0,0 +1,44 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosRefGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosRefGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatiosRef.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+// Refers to the laser APD/PN reference ratios on the host and hands out their device copy (Product).
+class EcalLaserAPDPNRatiosRefGPU {
+public:
+  struct Product {
+    ~Product();
+    float *values = nullptr;  // device buffer: barrel values first, endcap values after them
+  };
+  
+  #ifndef __CUDACC__
+  // binds valuesEB_ / valuesEE_ to the barrel and endcap items of the condition object
+  EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const&);
+  
+  // will call deallocation for Product through ~Product
+  ~EcalLaserAPDPNRatiosRefGPU() = default;
+  
+  // get device pointers; the allocation and the async uploads are issued with the given stream
+  Product const& getProduct(cudaStream_t) const;
+  
+  // TODO: do this centrally
+  // get offset for hashes; equals the number of barrel items
+  uint32_t getOffset() const { return valuesEB_.size(); }
+  
+  // constant label for this type (where it is consumed is not visible from this header)
+  static std::string name() { return std::string{"ecalLaserAPDPNRatiosRefGPU"}; }
+  
+private:
+  std::vector<float> const& valuesEB_;  // NOTE(review): a reference, not a copy - the source must outlive this object; confirm
+  std::vector<float> const& valuesEE_;  // NOTE(review): same lifetime requirement as valuesEB_
+  
+  cms::cuda::ESProduct<Product> product_;
+  #endif
+};
+
+
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
new file mode 100644
index 0000000000000..ac97e6c514bac
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
@@ -0,0 +1,44 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLaserAlphasGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalLaserAlphasGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLaserAlphas.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+// Refers to the laser alpha values on the host and hands out their device copy (Product).
+class EcalLaserAlphasGPU {
+public:
+  struct Product {
+    ~Product();
+    float *values = nullptr;  // device buffer: barrel values first, endcap values after them
+  };
+  
+  #ifndef __CUDACC__
+  // binds valuesEB_ / valuesEE_ to the barrel and endcap items of the condition object
+  EcalLaserAlphasGPU(EcalLaserAlphas const&);
+  
+  // will call deallocation for Product through ~Product
+  ~EcalLaserAlphasGPU() = default;
+  
+  // get device pointers; the allocation and the async uploads are issued with the given stream
+  Product const& getProduct(cudaStream_t) const;
+  
+  // TODO: do this centrally
+  // get offset for hashes; equals the number of barrel items
+  uint32_t getOffset() const { return valuesEB_.size(); }
+  
+  // constant label for this type (where it is consumed is not visible from this header)
+  static std::string name() { return std::string{"ecalLaserAlphasGPU"}; }
+  
+private:
+  std::vector<float> const& valuesEB_;  // NOTE(review): a reference, not a copy - the source must outlive this object; confirm
+  std::vector<float> const& valuesEE_;  // NOTE(review): same lifetime requirement as valuesEB_
+  
+  cms::cuda::ESProduct<Product> product_;
+  #endif
+};
+
+
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
new file mode 100644
index 0000000000000..41469bcf16c82
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
@@ -0,0 +1,57 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLinearCorrectionsGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalLinearCorrectionsGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalLinearCorrections.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+// Keeps the linear corrections and their time stamps in host vectors and hands out their device copy (Product).
+class EcalLinearCorrectionsGPU {
+public:
+  struct Product {
+    ~Product();
+    float *p1=nullptr;  // device pointers matching the host vectors p1_, p2_, p3_
+    float *p2=nullptr;
+    float *p3=nullptr;
+    edm::TimeValue_t *t1=nullptr;  // device pointers matching the host vectors t1_, t2_, t3_
+    edm::TimeValue_t *t2=nullptr;
+    edm::TimeValue_t *t3=nullptr;
+  };
+  
+  #ifndef __CUDACC__
+  
+  // copies p1/p2/p3 from the value map and t1/t2/t3 from the time map into the host vectors
+  EcalLinearCorrectionsGPU(EcalLinearCorrections const&);
+  
+  // will call deallocation for Product through ~Product
+  ~EcalLinearCorrectionsGPU() = default;
+  
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+  
+  // constant label for this type (where it is consumed is not visible from this header)
+  static std::string name() { return std::string{"ecalLinearCorrectionsGPU"}; }
+  
+private:
+  // in the future, we need to arrange things so as to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, CUDAHostAllocator<float>> p1_;
+  std::vector<float, CUDAHostAllocator<float>> p2_;
+  std::vector<float, CUDAHostAllocator<float>> p3_;
+  
+  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t1_;
+  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t2_;
+  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t3_;
+  
+  cms::cuda::ESProduct<Product> product_;
+  
+  #endif
+};
+
+
+#endif
+
+
+
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
index bc2b1300123dd..cf59775811486 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
@@ -77,7 +77,7 @@ namespace ecal {
         auto const did = DetId{dids[ch]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
         // TODO offset for ee, 0 for eb
-        auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
 
         //
         // pulse shape template
@@ -334,7 +334,7 @@ namespace ecal {
       bool tmp1 = hasSwitchToGain1[ch];
       auto const did = DetId{dids[ch]};
       auto const isBarrel = did.subdetId() == EcalBarrel;
-      auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+      auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
       auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
       auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
       auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc
new file mode 100644
index 0000000000000..25ec93faad1e7
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc
@@ -0,0 +1,37 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalADCToGeVConstantGPU::EcalADCToGeVConstantGPU(EcalADCToGeVConstant const& values)
+: adc2gev_(2)  // exactly two slots: the first for EB, the second for EE
+{
+  adc2gev_.front() = values.getEBValue();
+  adc2gev_.back() = values.getEEValue();
+}
+
+EcalADCToGeVConstantGPU::Product::~Product() {
+  // deallocation of the device buffer that getProduct obtains with cudaMalloc
+  cudaCheck( cudaFree(adc2gev) );
+}
+// Returns the Product from product_.dataForCurrentDeviceAsync; the lambda below allocates and uploads it.
+EcalADCToGeVConstantGPU::Product const& EcalADCToGeVConstantGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+    cudaStream,
+    [this](EcalADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) {
+      // malloc: device room for every element of adc2gev_
+      cudaCheck( cudaMalloc((void**)&product.adc2gev,
+                            this->adc2gev_.size() * sizeof(float)) );
+      // transfer: host-to-device copy queued on cudaStream, so it may still be pending on return
+      cudaCheck( cudaMemcpyAsync(product.adc2gev,
+                                 this->adc2gev_.data(),
+                                 this->adc2gev_.size() * sizeof(float),
+                                 cudaMemcpyHostToDevice,
+                                 cudaStream) );
+    }
+  );
+  
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalADCToGeVConstantGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
new file mode 100644
index 0000000000000..c1cdc6631878b
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
@@ -0,0 +1,47 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalChannelStatusGPU::EcalChannelStatusGPU(EcalChannelStatus const& values)
+: status_(values.size())
+{
+  // one write cursor walks status_: barrel codes land first, endcap codes directly after them
+  auto out = status_.begin();
+
+  // barrel (eb) section
+  for (auto const& item : values.barrelItems()) {
+    *out++ = item.getEncodedStatusCode();
+  }
+
+  // endcap (ee) section, continuing at index barrelItems().size()
+  for (auto const& item : values.endcapItems()) {
+    *out++ = item.getEncodedStatusCode();
+  }
+}
+
+EcalChannelStatusGPU::Product::~Product() {
+  // deallocation; status starts out as nullptr, and cudaFree of a null pointer does nothing
+  cudaCheck( cudaFree(status) );
+}
+// Returns the Product from product_.dataForCurrentDeviceAsync; the lambda below allocates and uploads it.
+EcalChannelStatusGPU::Product const& EcalChannelStatusGPU::getProduct(cudaStream_t cudaStream) const { 
+  auto const& product = product_.dataForCurrentDeviceAsync(
+    cudaStream,
+    [this](EcalChannelStatusGPU::Product& product, cudaStream_t cudaStream) {
+      // malloc: device room for every status code held in status_
+      cudaCheck( cudaMalloc((void**)&product.status,
+                            this->status_.size() * sizeof(uint16_t)) );
+      // transfer: host-to-device copy queued on cudaStream, so it may still be pending on return
+      cudaCheck( cudaMemcpyAsync(product.status,
+                                 this->status_.data(),
+                                 this->status_.size() * sizeof(uint16_t),
+                                 cudaMemcpyHostToDevice,
+                                 cudaStream) );
+    }
+  );
+  
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalChannelStatusGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
new file mode 100644
index 0000000000000..844a28d27fd8e
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
@@ -0,0 +1,44 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalIntercalibConstantsGPU::EcalIntercalibConstantsGPU(EcalIntercalibConstants const& values)
+: valuesEB_(values.barrelItems())  // reference member: aliases what barrelItems() returns
+, valuesEE_(values.endcapItems())  // reference member: aliases what endcapItems() returns
+{}
+
+EcalIntercalibConstantsGPU::Product::~Product() {
+  // deallocation; values starts out as nullptr, and cudaFree of a null pointer does nothing
+  cudaCheck( cudaFree(values) );
+}
+// Returns the Product from product_.dataForCurrentDeviceAsync; the lambda below allocates and uploads it.
+EcalIntercalibConstantsGPU::Product const& EcalIntercalibConstantsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+                                                           [this](EcalIntercalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
+                                                             // malloc: a single device buffer sized for barrel + endcap values
+                                                             cudaCheck( cudaMalloc((void**)&product.values,
+                                                                                   (this->valuesEB_.size() + this->valuesEE_.size()) * 
+                                                                                   sizeof(float)) );
+                                                             
+                                                             // offset in floats, not bytes: endcap values start right after the barrel ones
+                                                             auto const offset = this->valuesEB_.size();
+                                                             // NOTE(review): the copy sources are vectors held by reference; they must stay valid until the copies finish - confirm
+                                                             // transfer: barrel to the front, endcap from 'offset'; both copies are queued on cudaStream
+                                                             cudaCheck( cudaMemcpyAsync(product.values,
+                                                                                        this->valuesEB_.data(),
+                                                                                        this->valuesEB_.size() * sizeof(float),
+                                                                                        cudaMemcpyHostToDevice,
+                                                                                        cudaStream) );
+                                                             cudaCheck( cudaMemcpyAsync(product.values + offset,
+                                                                                        this->valuesEE_.data(),
+                                                                                        this->valuesEE_.size() * sizeof(float),
+                                                                                        cudaMemcpyHostToDevice,
+                                                                                        cudaStream) );
+                                                           }
+  );
+  
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalIntercalibConstantsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
new file mode 100644
index 0000000000000..f54f7bd47c022
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
@@ -0,0 +1,109 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalLaserAPDPNRatiosGPU::EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const& values)
+: p1_(values.getLaserMap().size())
+, p2_(values.getLaserMap().size())
+, p3_(values.getLaserMap().size())
+, t1_(values.getTimeMap().size())
+, t2_(values.getTimeMap().size())
+, t3_(values.getTimeMap().size())
+{
+  auto const& laserMap = values.getLaserMap();
+  auto const& timeMap = values.getTimeMap();
+
+  // p1/p2/p3: barrel entries fill the leading slots, endcap entries continue right behind them
+  unsigned int slot = 0;
+  for (auto const& entry : laserMap.barrelItems()) {
+    p1_[slot] = entry.p1;
+    p2_[slot] = entry.p2;
+    p3_[slot] = entry.p3;
+    ++slot;
+  }
+  for (auto const& entry : laserMap.endcapItems()) {
+    p1_[slot] = entry.p1;
+    p2_[slot] = entry.p2;
+    p3_[slot] = entry.p3;
+    ++slot;
+  }
+
+  // t1/t2/t3: the time map is indexed directly, one time stamp triple per position
+  for (unsigned int idx = 0; idx < timeMap.size(); ++idx) {
+    auto const& stamp = timeMap[idx];
+    t1_[idx] = stamp.t1.value();
+    t2_[idx] = stamp.t2.value();
+    t3_[idx] = stamp.t3.value();
+  }
+}
+
+
+
+EcalLaserAPDPNRatiosGPU::Product::~Product() {
+  // deallocation of all six device buffers; members still at nullptr are a no-op for cudaFree
+  cudaCheck( cudaFree(p1) );
+  cudaCheck( cudaFree(p2) );
+  cudaCheck( cudaFree(p3) );
+  cudaCheck( cudaFree(t1) );
+  cudaCheck( cudaFree(t2) );
+  cudaCheck( cudaFree(t3) );
+}
+// Returns the Product from product_.dataForCurrentDeviceAsync; the lambda below allocates and uploads it.
+EcalLaserAPDPNRatiosGPU::Product const& EcalLaserAPDPNRatiosGPU::getProduct(
+  cudaStream_t cudaStream) const
+  {
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+                                                             [this](EcalLaserAPDPNRatiosGPU::Product& product, cudaStream_t cudaStream) {
+                                                               // malloc: one device buffer per host vector (p1..p3 as float, t1..t3 as edm::TimeValue_t)
+                                                               cudaCheck( cudaMalloc((void**)&product.p1,
+                                                                                     this->p1_.size() * sizeof(float)) );
+                                                               cudaCheck( cudaMalloc((void**)&product.p2,
+                                                                                     this->p2_.size() * sizeof(float)) );
+                                                               cudaCheck( cudaMalloc((void**)&product.p3,
+                                                                                     this->p3_.size() * sizeof(float)) );
+                                                               cudaCheck( cudaMalloc((void**)&product.t1,
+                                                                                     this->t1_.size() * sizeof(edm::TimeValue_t)) );
+                                                               cudaCheck( cudaMalloc((void**)&product.t2,
+                                                                                     this->t2_.size() * sizeof(edm::TimeValue_t)) );
+                                                               cudaCheck( cudaMalloc((void**)&product.t3,
+                                                                                     this->t3_.size() * sizeof(edm::TimeValue_t)) );
+                                                               // transfer: six host-to-device copies queued on cudaStream, so they may still be pending on return
+                                                               cudaCheck( cudaMemcpyAsync(product.p1,
+                                                                                          this->p1_.data(),
+                                                                                          this->p1_.size() * sizeof(float),
+                                                                                          cudaMemcpyHostToDevice,
+                                                                                          cudaStream) );
+                                                               cudaCheck( cudaMemcpyAsync(product.p2,
+                                                                                          this->p2_.data(),
+                                                                                          this->p2_.size() * sizeof(float),
+                                                                                          cudaMemcpyHostToDevice,
+                                                                                          cudaStream) );
+                                                               cudaCheck( cudaMemcpyAsync(product.p3,
+                                                                                          this->p3_.data(),
+                                                                                          this->p3_.size() * sizeof(float),
+                                                                                          cudaMemcpyHostToDevice,
+                                                                                          cudaStream) );
+                                                               cudaCheck( cudaMemcpyAsync(product.t1,
+                                                                                          this->t1_.data(),
+                                                                                          this->t1_.size() * sizeof(edm::TimeValue_t),
+                                                                                          cudaMemcpyHostToDevice,
+                                                                                          cudaStream) );
+                                                               cudaCheck( cudaMemcpyAsync(product.t2,
+                                                                                          this->t2_.data(),
+                                                                                          this->t2_.size() * sizeof(edm::TimeValue_t),
+                                                                                          cudaMemcpyHostToDevice,
+                                                                                          cudaStream) );
+                                                               cudaCheck( cudaMemcpyAsync(product.t3,
+                                                                                          this->t3_.data(),
+                                                                                          this->t3_.size() * sizeof(edm::TimeValue_t),
+                                                                                          cudaMemcpyHostToDevice,
+                                                                                          cudaStream) );
+                                                             }
+    );
+    
+    return product;
+  }
+  
+  TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosGPU);
+  
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
new file mode 100644
index 0000000000000..c4c07361a8535
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
@@ -0,0 +1,44 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalLaserAPDPNRatiosRefGPU::EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const& values)
+: valuesEB_(values.barrelItems())  // reference member: aliases what barrelItems() returns
+, valuesEE_(values.endcapItems())  // reference member: aliases what endcapItems() returns
+{}
+
+EcalLaserAPDPNRatiosRefGPU::Product::~Product() {
+  // deallocation; values starts out as nullptr, and cudaFree of a null pointer does nothing
+  cudaCheck( cudaFree(values) );
+}
+// Returns the Product from product_.dataForCurrentDeviceAsync; the lambda below allocates and uploads it.
+EcalLaserAPDPNRatiosRefGPU::Product const& EcalLaserAPDPNRatiosRefGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+                                                           [this](EcalLaserAPDPNRatiosRefGPU::Product& product, cudaStream_t cudaStream) {
+                                                             // malloc: a single device buffer sized for barrel + endcap values
+                                                             cudaCheck( cudaMalloc((void**)&product.values,
+                                                                                   (this->valuesEB_.size() + this->valuesEE_.size()) * 
+                                                                                   sizeof(float)) );
+                                                             
+                                                             // offset in floats, not bytes: endcap values start right after the barrel ones
+                                                             auto const offset = this->valuesEB_.size();
+                                                             // NOTE(review): the copy sources are vectors held by reference; they must stay valid until the copies finish - confirm
+                                                             // transfer: barrel to the front, endcap from 'offset'; both copies are queued on cudaStream
+                                                             cudaCheck( cudaMemcpyAsync(product.values,
+                                                                                        this->valuesEB_.data(),
+                                                                                        this->valuesEB_.size() * sizeof(float),
+                                                                                        cudaMemcpyHostToDevice,
+                                                                                        cudaStream) );
+                                                             cudaCheck( cudaMemcpyAsync(product.values + offset,
+                                                                                        this->valuesEE_.data(),
+                                                                                        this->valuesEE_.size() * sizeof(float),
+                                                                                        cudaMemcpyHostToDevice,
+                                                                                        cudaStream) );
+                                                           }
+  );
+  
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosRefGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
new file mode 100644
index 0000000000000..24257fd8b547a
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
@@ -0,0 +1,44 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalLaserAlphasGPU::EcalLaserAlphasGPU(EcalLaserAlphas const& values)
+: valuesEB_(values.barrelItems())  // reference member: aliases what barrelItems() returns
+, valuesEE_(values.endcapItems())  // reference member: aliases what endcapItems() returns
+{}
+
+EcalLaserAlphasGPU::Product::~Product() {
+  // deallocation; values starts out as nullptr, and cudaFree of a null pointer does nothing
+  cudaCheck( cudaFree(values) );
+}
+// Returns the Product from product_.dataForCurrentDeviceAsync; the lambda below allocates and uploads it.
+EcalLaserAlphasGPU::Product const& EcalLaserAlphasGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+                                                           [this](EcalLaserAlphasGPU::Product& product, cudaStream_t cudaStream) {
+                                                             // malloc: a single device buffer sized for barrel + endcap values
+                                                             cudaCheck( cudaMalloc((void**)&product.values,
+                                                                                   (this->valuesEB_.size() + this->valuesEE_.size()) * 
+                                                                                   sizeof(float)) );
+                                                             
+                                                             // offset in floats, not bytes: endcap values start right after the barrel ones
+                                                             auto const offset = this->valuesEB_.size();
+                                                             // NOTE(review): the copy sources are vectors held by reference; they must stay valid until the copies finish - confirm
+                                                             // transfer: barrel to the front, endcap from 'offset'; both copies are queued on cudaStream
+                                                             cudaCheck( cudaMemcpyAsync(product.values,
+                                                                                        this->valuesEB_.data(),
+                                                                                        this->valuesEB_.size() * sizeof(float),
+                                                                                        cudaMemcpyHostToDevice,
+                                                                                        cudaStream) );
+                                                             cudaCheck( cudaMemcpyAsync(product.values + offset,
+                                                                                        this->valuesEE_.data(),
+                                                                                        this->valuesEE_.size() * sizeof(float),
+                                                                                        cudaMemcpyHostToDevice,
+                                                                                        cudaStream) );
+                                                           }
+  );
+  
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalLaserAlphasGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
new file mode 100644
index 0000000000000..2dedb1074bee7
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
@@ -0,0 +1,102 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalLinearCorrectionsGPU::EcalLinearCorrectionsGPU(EcalLinearCorrections const& values)
+: p1_(values.getValueMap().size())
+, p2_(values.getValueMap().size())
+, p3_(values.getValueMap().size())
+, t1_(values.getTimeMap().size())
+, t2_(values.getTimeMap().size())
+, t3_(values.getTimeMap().size())
+{
+  // Flatten the conditions into host vectors: barrel items first, endcap items behind them.
+  auto const& valueMap = values.getValueMap();
+  auto const& barrel = valueMap.barrelItems();
+  auto const& endcap = valueMap.endcapItems();
+  auto const& timeMap = values.getTimeMap();
+  auto const eeOffset = barrel.size();
+  // barrel entries fill slots [0, eeOffset)
+  for (unsigned int ieb = 0; ieb < barrel.size(); ++ieb) {
+    p1_[ieb] = barrel[ieb].p1;
+    p2_[ieb] = barrel[ieb].p2;
+    p3_[ieb] = barrel[ieb].p3;
+  }
+  // endcap entries are appended starting at eeOffset
+  for (unsigned int iee = 0; iee < endcap.size(); ++iee) {
+    p1_[eeOffset + iee] = endcap[iee].p1;
+    p2_[eeOffset + iee] = endcap[iee].p2;
+    p3_[eeOffset + iee] = endcap[iee].p3;
+  }
+  // time stamps: one (t1, t2, t3) triplet per entry of the time map
+  for (unsigned int it = 0; it < timeMap.size(); ++it) {
+    t1_[it] = timeMap[it].t1.value();
+    t2_[it] = timeMap[it].t2.value();
+    t3_[it] = timeMap[it].t3.value();
+  }
+}
+
+EcalLinearCorrectionsGPU::Product::~Product() {
+  // release every device buffer that getProduct() allocates with cudaMalloc (p1..p3 and t1..t3)
+  cudaCheck( cudaFree(p1) ); cudaCheck( cudaFree(p2) ); cudaCheck( cudaFree(p3) );
+  cudaCheck( cudaFree(t1) ); cudaCheck( cudaFree(t2) ); cudaCheck( cudaFree(t3) );
+}
+
+EcalLinearCorrectionsGPU::Product const& EcalLinearCorrectionsGPU::getProduct(
+  cudaStream_t cudaStream) const
+{
+  // Hand back the product held by product_.
+  // The callback given to dataForCurrentDeviceAsync allocates the six device
+  // arrays (p1..p3 as float, t1..t3 as edm::TimeValue_t) and then enqueues
+  // the host-to-device copies of the corresponding host vectors on the
+  // stream it receives.
+  auto const& product = product_.dataForCurrentDeviceAsync(
+    cudaStream,
+    [this](EcalLinearCorrectionsGPU::Product& prod, cudaStream_t stream) {
+      // byte counts of the coefficient arrays
+      auto const p1Bytes = this->p1_.size() * sizeof(float);
+      auto const p2Bytes = this->p2_.size() * sizeof(float);
+      auto const p3Bytes = this->p3_.size() * sizeof(float);
+      // byte counts of the time arrays
+      auto const t1Bytes = this->t1_.size() * sizeof(edm::TimeValue_t);
+      auto const t2Bytes = this->t2_.size() * sizeof(edm::TimeValue_t);
+      auto const t3Bytes = this->t3_.size() * sizeof(edm::TimeValue_t);
+
+      // malloc
+      cudaCheck( cudaMalloc((void**)&prod.p1, p1Bytes) );
+      cudaCheck( cudaMalloc((void**)&prod.p2, p2Bytes) );
+      cudaCheck( cudaMalloc((void**)&prod.p3, p3Bytes) );
+
+      cudaCheck( cudaMalloc((void**)&prod.t1, t1Bytes) );
+      cudaCheck( cudaMalloc((void**)&prod.t2, t2Bytes) );
+      cudaCheck( cudaMalloc((void**)&prod.t3, t3Bytes) );
+
+      // transfer
+      cudaCheck( cudaMemcpyAsync(prod.p1, this->p1_.data(), p1Bytes,
+                                 cudaMemcpyHostToDevice,
+                                 stream) );
+      cudaCheck( cudaMemcpyAsync(prod.p2, this->p2_.data(), p2Bytes,
+                                 cudaMemcpyHostToDevice,
+                                 stream) );
+      cudaCheck( cudaMemcpyAsync(prod.p3, this->p3_.data(), p3Bytes,
+                                 cudaMemcpyHostToDevice,
+                                 stream) );
+
+      cudaCheck( cudaMemcpyAsync(prod.t1, this->t1_.data(), t1Bytes,
+                                 cudaMemcpyHostToDevice,
+                                 stream) );
+      cudaCheck( cudaMemcpyAsync(prod.t2, this->t2_.data(), t2Bytes,
+                                 cudaMemcpyHostToDevice,
+                                 stream) );
+      cudaCheck( cudaMemcpyAsync(prod.t3, this->t3_.data(), t3Bytes,
+                                 cudaMemcpyHostToDevice,
+                                 stream) );
+    }
+  );
+
+  return product;
+}
+  
+  TYPELOOKUP_DATA_REG(EcalLinearCorrectionsGPU);  // macro presumably from FWCore/Utilities/interface/typelookup.h, included at the top of this file
+  
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
new file mode 100644
index 0000000000000..ab67ceb46fc0f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -0,0 +1,734 @@
+#include "cuda.h"
+
+#include "KernelHelpers.h"
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+
+//
+//
+#include "EcalRecHitBuilderKernels.h"
+
+
+#include "KernelHelpers.h"
+
+
+
+
+namespace ecal {
+  namespace rechit {
+    
+    
+    // uncalibrecHit flags
+    enum UncalibRecHitFlags {
+      kGood                 = -1,  // channel is good (mutually exclusive with other states)  setFlagBit(kGood) reset flags_ to zero
+      kPoorReco             =  0,  // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.)
+      kSaturated            =  1,  // saturated channel
+      kOutOfTime            =  2,  // channel out of time
+      kLeadingEdgeRecovered =  3,  // saturated channel: energy estimated from the leading edge before saturation
+      kHasSwitchToGain6     =  4,  // at least one data frame is in G6
+      kHasSwitchToGain1     =  5   // at least one data frame is in G1
+    };
+    
+    
+    // recHit flags
+    enum RecHitFlags {
+      RecHitFlags_kGood                 =  0,  // channel ok, the energy and time measurement are reliable
+      RecHitFlags_kPoorReco             =  1,  // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2)
+      RecHitFlags_kOutOfTime            =  2,  // the energy is available from the UncalibRecHit (sync reco), but the event is out of time
+      RecHitFlags_kFaultyHardware       =  3,  // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy)
+      RecHitFlags_kNoisy                =  4,  // the channel is very noisy
+      RecHitFlags_kPoorCalib            =  5,  // the energy is available from the UncalibRecHit, but the calibration of the channel is poor
+      RecHitFlags_kSaturated            =  6,  // saturated channel (recovery not tried)
+      RecHitFlags_kLeadingEdgeRecovered =  7,  // saturated channel: energy estimated from the leading edge before saturation
+      RecHitFlags_kNeighboursRecovered  =  8,  // saturated/isolated dead: energy estimated from neighbours
+      RecHitFlags_kTowerRecovered       =  9,  // channel in TT with no data link, info retrieved from Trigger Primitive
+      RecHitFlags_kDead                 = 10,  // channel is dead and any recovery fails
+      RecHitFlags_kKilled               = 11,  // MC only flag: the channel is killed in the real detector
+      RecHitFlags_kTPSaturated          = 12,  // the channel is in a region with saturated TP
+      RecHitFlags_kL1SpikeFlag          = 13,  // the channel is in a region with TP with sFGVB = 0
+      RecHitFlags_kWeird                = 14,  // the signal is believed to originate from an anomalous deposit (spike)
+      RecHitFlags_kDiWeird              = 15,  // the signal is anomalous, and neighbors another anomalous signal
+      RecHitFlags_kHasSwitchToGain6     = 16,  // at least one data frame is in G6
+      RecHitFlags_kHasSwitchToGain1     = 17,  // at least one data frame is in G1
+      //
+      RecHitFlags_kUnknown              = 18   // to ease the interface with functions returning flags.
+    };
+    
+    
+    // status code
+    enum EcalChannelStatusCode_Code {
+      kOk                    =  0,
+      kDAC                   =  1,
+      kNoLaser               =  2,
+      kNoisy                 =  3,
+      kNNoisy                =  4,
+      kNNNoisy               =  5,
+      kNNNNoisy              =  6,
+      kNNNNNoisy             =  7,
+      kFixedG6               =  8,
+      kFixedG1               =  9,
+      kFixedG0               = 10,
+      kNonRespondingIsolated = 11,
+      kDeadVFE               = 12,
+      kDeadFE                = 13,
+      kNoDataNoTP            = 14
+    };
+    
+    
+    
+    
+    
+    __global__
+    void kernel_create_ecal_rehit(
+      // configuration
+      int const* ChannelStatusToBeExcluded,
+      uint32_t ChannelStatusToBeExcludedSize,
+      bool const killDeadChannels,
+      bool const recoverEBIsolatedChannels,
+      bool const recoverEEIsolatedChannels,
+      bool const recoverEBVFE,
+      bool const recoverEEVFE,
+      bool const recoverEBFE,
+      bool const recoverEEFE,
+      float const EBLaserMIN,
+      float const EELaserMIN,
+      float const EBLaserMAX,
+      float const EELaserMAX,
+      // for flags setting
+      int const* expanded_v_DB_reco_flags,    // FIXME AM: to be checked
+      uint32_t const* expanded_Sizes_v_DB_reco_flags,
+      uint32_t const* expanded_flagbit_v_DB_reco_flags,
+      uint32_t expanded_v_DB_reco_flagsSize,
+      uint32_t flagmask,
+      // conditions
+      float const* adc2gev,
+      float const* intercalib,
+      uint16_t const* status,
+      float const* apdpnrefs,
+      float const* alphas,
+      // input for transparency corrections
+      float const* p1,
+      float const* p2,
+      float const* p3,
+      edm::TimeValue_t const* t1,
+      edm::TimeValue_t const* t2,
+      edm::TimeValue_t const* t3,
+      // input for linear corrections
+      float const* lp1,
+      float const* lp2,
+      float const* lp3,
+      edm::TimeValue_t const* lt1,
+      edm::TimeValue_t const* lt2,
+      edm::TimeValue_t const* lt3,
+      // time, used for time dependent corrections
+      edm::TimeValue_t const event_time,
+      // input
+      uint32_t const* did_eb,
+      uint32_t const* did_ee,
+      ::ecal::reco::StorageScalarType const* amplitude_eb,   // in adc counts
+      ::ecal::reco::StorageScalarType const* amplitude_ee,   // in adc counts
+      ::ecal::reco::StorageScalarType const* time_eb,
+      ::ecal::reco::StorageScalarType const* time_ee,
+      ::ecal::reco::StorageScalarType const* chi2_eb,
+      ::ecal::reco::StorageScalarType const* chi2_ee,
+      uint32_t const* flags_eb,
+      uint32_t const* flags_ee,
+      // output
+      uint32_t *did,
+      ::ecal::reco::StorageScalarType* energy,   // in energy [GeV]
+      ::ecal::reco::StorageScalarType* time,
+      ::ecal::reco::StorageScalarType* chi2,
+      uint32_t* flagBits,
+      uint32_t* extra,
+      // other
+      int const nchannels,
+      uint32_t const offsetForInput,
+      uint32_t const offsetForHashes
+    ) {
+      // One thread per output channel: ch = threadIdx.x + blockDim.x*blockIdx.x, guarded by ch < nchannels.
+      // Channels with ch < offsetForInput read the *_eb inputs at index ch, the others read the
+      // *_ee inputs at index ch - offsetForInput. Per-crystal conditions are addressed through
+      // hashedId (the EE hash is shifted by offsetForHashes). Excluded/killed channels return early.
+      //    NB: energy   "type_wrapper<reco::StorageScalarType, L>::type" most likely std::vector<float>
+      //
+      int ch = threadIdx.x + blockDim.x*blockIdx.x;
+
+      if (ch < nchannels) {
+
+        int const inputCh = ch >= offsetForInput
+        ? ch - offsetForInput
+        : ch;
+
+        uint32_t const * didCh = ch >= offsetForInput
+        ? did_ee
+        : did_eb;
+
+        // only two values, EB or EE
+        // AM : FIXME : why not using "isBarrel" ?    isBarrel ? adc2gev[0] : adc2gev[1]
+        float adc2gev_to_use = ch >= offsetForInput
+        ? adc2gev[1]  // ee
+        : adc2gev[0]; // eb
+
+
+        // first EB and then EE
+
+        ::ecal::reco::StorageScalarType const* amplitude = ch >= offsetForInput
+        ? amplitude_ee
+        : amplitude_eb;
+
+        ::ecal::reco::StorageScalarType const* time_in = ch >= offsetForInput
+        ? time_ee
+        : time_eb;
+
+        ::ecal::reco::StorageScalarType const* chi2_in = ch >= offsetForInput
+        ? chi2_ee
+        : chi2_eb;
+
+        uint32_t const* flags_in = ch >= offsetForInput
+        ? flags_ee
+        : flags_eb;
+
+        // simple copy
+        did[ch] = didCh[inputCh];
+
+        auto const did_to_use = DetId{didCh[inputCh]};
+
+        auto const isBarrel = did_to_use.subdetId() == EcalBarrel;
+        auto const hashedId = isBarrel
+        ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId())
+        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId());
+
+        float const intercalib_to_use = intercalib[hashedId];
+
+
+        // get laser coefficient
+        float lasercalib = 1.;
+
+        //
+        // AM: ideas
+        //
+        //    One possibility is to create the map of laser corrections once on CPU
+        //    for all crystals and push them on GPU.
+        //    Then only if the LS is different, update the laser correction
+        //    The variation within a LS is not worth pursuing (<< 0.1% !!)
+        //    and below the precision we can claim on the laser corrections (right?).
+        //    This will save quite some time (also for the CPU version?)
+        //
+
+        int iLM = 1;
+
+        if (isBarrel) {
+          iLM = ecal::reconstruction::laser_monitoring_region_EB (did_to_use.rawId());
+        }
+        else {
+          iLM = ecal::reconstruction::laser_monitoring_region_EE (did_to_use.rawId());
+        }
+
+
+        long long t_i = 0, t_f = 0;
+        float p_i = 0, p_f = 0;
+        long long lt_i = 0, lt_f = 0;
+        float lp_i = 0, lp_f = 0;
+
+        // laser: pick the (t, p) interval around event_time; outside [t1, t3] the nearest interval is reused
+        if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) {
+          t_i = t1[iLM - 1];
+          t_f = t2[iLM - 1];
+          p_i = p1[hashedId];
+          p_f = p2[hashedId];
+        } else if (event_time >= t2[iLM - 1] && event_time <= t3[iLM - 1]) {
+          t_i = t2[iLM - 1];
+          t_f = t3[iLM - 1];
+          p_i = p2[hashedId];
+          p_f = p3[hashedId];
+        } else if (event_time < t1[iLM - 1]) {
+          t_i = t1[iLM - 1];
+          t_f = t2[iLM - 1];
+          p_i = p1[hashedId];
+          p_f = p2[hashedId];
+
+        } else if (event_time > t3[iLM - 1]) {
+          t_i = t2[iLM - 1];
+          t_f = t3[iLM - 1];
+          p_i = p2[hashedId];
+          p_f = p3[hashedId];
+        }
+
+
+        // linear corrections: same interval selection, on the (lt, lp) arrays
+        if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) {
+          lt_i = lt1[iLM - 1];
+          lt_f = lt2[iLM - 1];
+          lp_i = lp1[hashedId];
+          lp_f = lp2[hashedId];
+        } else if (event_time >= lt2[iLM - 1] && event_time <= lt3[iLM - 1]) {
+          lt_i = lt2[iLM - 1];
+          lt_f = lt3[iLM - 1];
+          lp_i = lp2[hashedId];
+          lp_f = lp3[hashedId];
+        } else if (event_time < lt1[iLM - 1]) {
+          lt_i = lt1[iLM - 1];
+          lt_f = lt2[iLM - 1];
+          lp_i = lp1[hashedId];
+          lp_f = lp2[hashedId];
+
+        } else if (event_time > lt3[iLM - 1]) {
+          lt_i = lt2[iLM - 1];
+          lt_f = lt3[iLM - 1];
+          lp_i = lp2[hashedId];
+          lp_f = lp3[hashedId];
+        }
+
+
+        // apdpnref and alpha
+        float apdpnref = apdpnrefs[hashedId];
+        float alpha = alphas[hashedId];
+
+        // now calculate transparency correction
+        if (apdpnref != 0 && (t_i - t_f) != 0 && (lt_i - lt_f) != 0) {
+          long long tt = event_time;  // never subtract two unsigned!
+          float interpolatedLaserResponse =   p_i / apdpnref + float(tt - t_i)  * (p_f - p_i)   / (apdpnref * float(t_f - t_i));
+
+          float interpolatedLinearResponse = lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i));  // FIXED BY FC
+
+          if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) {
+            interpolatedLinearResponse = 1.f;
+          }
+          if (interpolatedLaserResponse <= 0.) {
+            // AM :  how the heck is it possible?
+            //             interpolatedLaserResponse = 0.0001;
+            lasercalib = 1.;
+
+          }
+          else {
+
+            float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse;
+
+            // ... and now this:
+            lasercalib = 1.f / ( std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse);
+
+          }
+        }
+
+        //
+        // Check for channels to be excluded from reconstruction
+        //
+        //
+        // Default energy? Not to be updated if "ChannelStatusToBeExcluded"
+        // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat"
+        //
+        energy[ch] = -1; //---- AM: default, un-physical, ok
+
+        //
+        static const int chStatusMask      = 0x1F;
+        // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same
+        int dbstatus = EcalChannelStatusCode_Code( (status[hashedId]) & chStatusMask );
+        if (ChannelStatusToBeExcludedSize != 0) {
+          for (uint32_t ich_to_check = 0; ich_to_check<ChannelStatusToBeExcludedSize; ich_to_check++) {
+            if ( ChannelStatusToBeExcluded[ich_to_check] == dbstatus ) {
+              return;
+            }
+          }
+        }
+
+        // Take our association map of dbstatuses-> recHit flagbits and return the appropriate flagbit word
+
+        //
+        // AM: get the smaller "flagbit_counter" with match
+        //
+
+        uint32_t temporary_flagBits = 0;
+        uint32_t const nFlagBitsMax = 32;  // 32-bit flag word: give up once every bit position was tried (no match -> flags stay 0)
+        int iterator_flags = 0;
+        bool need_to_exit = false;
+        uint32_t flagbit_counter = 0;
+        while (!need_to_exit && flagbit_counter < nFlagBitsMax) {
+          iterator_flags = 0;
+          for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) {
+            // check the correct "flagbit"
+            if (expanded_flagbit_v_DB_reco_flags[i] == flagbit_counter) {
+
+              for (unsigned int j = 0; j < expanded_Sizes_v_DB_reco_flags[i]; j++) {
+
+                if ( expanded_v_DB_reco_flags[iterator_flags] == dbstatus ) {
+                  temporary_flagBits =  0x1 << expanded_flagbit_v_DB_reco_flags[i];
+                  need_to_exit = true;
+                  break; // also from the big loop!!!
+
+                }
+                iterator_flags++;
+              }
+            }
+            else {
+              // if not, got to the next bunch directly
+              iterator_flags += expanded_Sizes_v_DB_reco_flags[i];
+            }
+
+            if (need_to_exit) {
+              break;
+            }
+
+          }
+          flagbit_counter+=1;
+        }
+
+
+        if ( (flagmask & temporary_flagBits) && killDeadChannels ) {
+          return;
+        }
+
+
+        //
+        flagBits[ch] = temporary_flagBits;
+
+        //
+        // multiply the adc counts with factors to get GeV
+        //
+
+        //         energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ;
+        energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib;
+
+        // Time is not saved so far, FIXME
+        //         time[ch] = time_in[inputCh];
+
+
+        if (chi2_in[inputCh] > 64) chi2[ch] = 64;
+        else chi2[ch] = chi2_in[inputCh];
+
+
+        // FIXME: calculate the "flagBits extra"  --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ...
+        extra[ch] = 0;
+
+        //
+        // extra packing ...
+        //
+
+        uint32_t offset;
+        uint32_t width;
+        uint32_t value;
+
+        float chi2_temp = chi2[ch];
+        if (chi2_temp > 64) chi2_temp = 64;
+        // use 7 bits
+        uint32_t rawChi2 = lround(chi2_temp / 64. * ((1<<7)-1));
+
+        offset = 0;
+        width = 7;
+        value = 0;
+
+        uint32_t mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (rawChi2 & ((1U << width) - 1)) << offset;
+
+        //         extra[ch] = value;
+        //
+
+        // rawEnergy is actually "error" !!!
+        uint32_t rawEnergy = 0;
+
+
+        // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA
+        //            if you want to store this in "extra", we need first to add it to the uncalibrecHit results
+        //            then it will be something like the following
+        //         amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib
+        //
+        //
+
+        float amplitudeError_ch = 0. ; // amplitudeError[ch];
+
+        if (amplitudeError_ch > 0.001) {
+          //           uint16_t exponent = getPower10(amplitudeError_ch);
+
+          static constexpr float p10[] = {1.e-2f,1.e-1f,1.f,1.e1f,1.e2f,1.e3f,1.e4f,1.e5f,1.e6f};
+          int b = amplitudeError_ch<p10[4] ? 0 : 5;
+          for (;b<9;++b) if (amplitudeError_ch<p10[b]) break;
+
+          uint16_t exponent = b;
+
+          static constexpr float ip10[] = {1.e5f,1.e4f,1.e3f,1.e2f,1.e1f,1.e0f,1.e-1f,1.e-2f,1.e-3f,1.e-4f};
+          uint16_t significand = lround( amplitudeError_ch * ip10[exponent]);
+          // use 13 bits (3 exponent, 10 significand)
+          rawEnergy = exponent << 10 | significand;
+        }
+
+
+        offset = 8;
+        width = 13;
+        // value from last change, ok
+
+        mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (rawEnergy & ((1U << width) - 1)) << offset;
+
+        uint32_t jitterErrorBits = 0;
+        jitterErrorBits = jitterErrorBits & 0xFF;
+
+
+        offset = 24;
+        width = 8;
+        // value from last change, ok
+
+        mask = ((1 << width) - 1) << offset;
+        value &= ~mask;
+        value |= (jitterErrorBits & ((1U << width) - 1)) << offset;
+
+        //
+        // now finally set "extra[ch]"
+        //
+        extra[ch] = value;
+
+
+        //
+        // additional flags setting
+        //
+        // using correctly the flags as calculated at the UncalibRecHit stage
+        //
+        // Now fill flags
+
+        bool good = true;
+
+        if ( flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kLeadingEdgeRecovered) ) ) {
+          flagBits[ch]  |=  (0x1 <<  (RecHitFlags::RecHitFlags_kLeadingEdgeRecovered));
+          good = false;
+        }
+
+        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kSaturated) ) ) {
+          // leading edge recovery failed - still keep the information
+          // about the saturation and do not flag as dead
+          flagBits[ch]  |=  (0x1 <<  (RecHitFlags::RecHitFlags_kSaturated));
+          good = false;
+        }
+
+        //
+        // AM: why do we have two tests one after the other checking almost the same thing???
+        // Please clean up the code, ... also the original one!
+        //
+        // uncalibRH.isSaturated() --->
+        //
+        //                                   bool EcalUncalibratedRecHit::isSaturated() const {
+        //                                     return EcalUncalibratedRecHit::checkFlag(kSaturated);
+        //                                   }
+        //
+        //
+
+        if ( flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kSaturated) ) ) {
+          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kSaturated));
+          good = false;
+        }
+
+        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kOutOfTime) ) ) {
+          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kOutOfTime));
+          good = false;
+        }
+        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kPoorReco) ) ) {
+          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kPoorReco));
+          good = false;
+        }
+        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain6) ) ) {
+          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kHasSwitchToGain6));
+        }
+        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain1) ) ) {
+          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kHasSwitchToGain1));
+        }
+
+
+        if (good) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kGood));
+        }
+
+        if (isBarrel  && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) {
+          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kPoorCalib));
+
+        }
+        if (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX)) {
+          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kPoorCalib));
+        }
+
+
+
+        // recover, killing, and other stuff
+
+    //
+    // Structure:
+    //  EB
+    //  EE
+    //
+    //
+    //  - single MVA
+    //  - democratic sharing
+    //  - kill all the other cases
+    //
+
+        bool is_Single = false;
+        bool is_FE     = false;
+        bool is_VFE    = false;
+
+        bool is_recoverable = false; // DetIdToBeRecovered
+        // 10 = kFixedG0, 11 = kNonRespondingIsolated, 12 = kDeadVFE (see EcalChannelStatusCode_Code)
+        if ( dbstatus == 10 ||  dbstatus == 11 ||  dbstatus == 12 ) {
+          is_recoverable = true;
+        }
+        // NOTE(review): kDeadFE (13) is not in the recoverable set above, so is_FE stays false
+        // with the current gate - confirm whether status 13 should be recoverable as well.
+        if (is_recoverable) {
+          if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
+            is_VFE = true;
+          }
+          else if (dbstatus == EcalChannelStatusCode_Code::kDeadFE) {
+            is_FE = true;
+          }
+          else {
+            is_Single = true;
+          }
+
+
+          // EB
+          if (isBarrel) {
+            if (is_Single || is_FE || is_VFE) {
+              // single MVA
+              if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels) ) {
+
+
+              }
+              // democratic sharing
+              else if (is_FE && (recoverEBFE || !killDeadChannels) ) {
+
+
+              }
+              // kill all the other cases
+              else {
+                energy[ch] = 0.;  // Need to set also the flags ...
+              }
+            }
+          }
+          // EE
+          else {
+            if (is_Single || is_FE || is_VFE) {
+              // single MVA (the EE switches are used here, not the EB ones)
+              if (is_Single && (recoverEEIsolatedChannels || !killDeadChannels) ) {
+
+
+              }
+              // democratic sharing
+              else if (is_FE && (recoverEEFE || !killDeadChannels) ) {
+
+                //
+                //  Code is definitely too long ...
+                //
+
+              }
+              // kill all the other cases
+              else {
+                energy[ch] = 0.;  // Need to set also the flags ...
+              }
+            }
+          }
+
+        }
+
+
+      } // end channel
+
+    }
+    
+    
+    
+    // host version, to be called by the plugin
+    void create_ecal_rehit(
+      EventInputDataGPU const& eventInputGPU,
+      EventOutputDataGPU&      eventOutputGPU,
+      //     eventDataForScratchGPU_,
+      ConditionsProducts const& conditions,
+      ConfigurationParameters const& configParameters,
+      uint32_t const  offsetForInput,
+      edm::TimeValue_t const event_time,
+      cudaStream_t cudaStream
+    ){
+      // Enqueues kernel_create_ecal_rehit on cudaStream, one thread per channel, 32 threads per block.
+//       int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size ;   //---- AM FIXME Once the PR by Viktor is integrated. The following is bad!
+      int nchannels = 100 ;
+
+      unsigned int nchannels_per_block = 32;
+      unsigned int threads_1d = nchannels_per_block;
+      unsigned int blocks_1d = (nchannels + threads_1d) / threads_1d; // TEST : to be optimized (AM)
+      // The formula above always yields at least one block (also for nchannels == 0);
+      // surplus threads are masked by the "ch < nchannels" guard inside the kernel.
+      //
+      // kernel create rechit
+      //
+      kernel_create_ecal_rehit <<< blocks_1d, threads_1d, 0, cudaStream >>> (
+        // configuration
+        configParameters.ChannelStatusToBeExcluded,
+        configParameters.ChannelStatusToBeExcludedSize,
+        configParameters.killDeadChannels,
+        configParameters.recoverEBIsolatedChannels,
+        configParameters.recoverEEIsolatedChannels,
+        configParameters.recoverEBVFE,
+        configParameters.recoverEEVFE,
+        configParameters.recoverEBFE,
+        configParameters.recoverEEFE,
+        configParameters.EBLaserMIN,
+        configParameters.EELaserMIN,
+        configParameters.EBLaserMAX,
+        configParameters.EELaserMAX,
+        // for flags setting
+        configParameters.expanded_v_DB_reco_flags,
+        configParameters.expanded_Sizes_v_DB_reco_flags,
+        configParameters.expanded_flagbit_v_DB_reco_flags,
+        configParameters.expanded_v_DB_reco_flagsSize,
+        configParameters.flagmask,
+        // conditions
+        conditions.ADCToGeV.adc2gev,
+        conditions.Intercalib.values,
+        conditions.ChannelStatus.status,
+        conditions.LaserAPDPNRatiosRef.values,
+        conditions.LaserAlphas.values,
+        // input for transparency corrections
+        conditions.LaserAPDPNRatios.p1,
+        conditions.LaserAPDPNRatios.p2,
+        conditions.LaserAPDPNRatios.p3,
+        conditions.LaserAPDPNRatios.t1,
+        conditions.LaserAPDPNRatios.t2,
+        conditions.LaserAPDPNRatios.t3,
+        // input for linear corrections
+        conditions.LinearCorrections.p1,
+        conditions.LinearCorrections.p2,
+        conditions.LinearCorrections.p3,
+        conditions.LinearCorrections.t1,
+        conditions.LinearCorrections.t2,
+        conditions.LinearCorrections.t3,
+        // time, used for time dependent corrections
+        event_time,
+        // input
+        eventInputGPU.ebUncalibRecHits.did,
+        eventInputGPU.eeUncalibRecHits.did,
+        eventInputGPU.ebUncalibRecHits.amplitude,
+        eventInputGPU.eeUncalibRecHits.amplitude,
+        eventInputGPU.ebUncalibRecHits.jitter,
+        eventInputGPU.eeUncalibRecHits.jitter,
+        eventInputGPU.ebUncalibRecHits.chi2,
+        eventInputGPU.eeUncalibRecHits.chi2,
+        eventInputGPU.ebUncalibRecHits.flags,
+        eventInputGPU.eeUncalibRecHits.flags,
+        // output
+        eventOutputGPU.did,
+        eventOutputGPU.energy,
+        eventOutputGPU.time,
+        eventOutputGPU.chi2,
+        eventOutputGPU.flagBits,
+        eventOutputGPU.extra,
+        // other
+        nchannels,
+        offsetForInput,
+        conditions.offsetForHashes
+      );
+
+      // NOTE(review): the launch status is not checked here (no cudaGetLastError) - consider adding a check.
+
+    }
+    
+    
+  }
+  
+}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
new file mode 100644
index 0000000000000..587abe0575883
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
@@ -0,0 +1,97 @@
+#ifndef RecoLocalCalo_EcalRecAlgos_src_EcalRecHitBuilderKernels_h
+#define RecoLocalCalo_EcalRecAlgos_src_EcalRecHitBuilderKernels_h
+
+//
+// Builder of ECAL RecHits on GPU
+//
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+
+#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+
+#include "DataFormats/Provenance/interface/Timestamp.h"
+
+namespace ecal {
+  namespace rechit {
+    // Device kernel building the output RecHit arrays from the EB and EE
+    // uncalibrated RecHit arrays; the arguments are grouped by the comments below.
+    __global__
+    void kernel_create_ecal_rehit(
+      // configuration
+      int const* ChannelStatusToBeExcluded,
+      uint32_t ChannelStatusToBeExcludedSize,
+      bool killDeadChannels,
+      bool const recoverEBIsolatedChannels,
+      bool const recoverEEIsolatedChannels,
+      bool const recoverEBVFE,
+      bool const recoverEEVFE,
+      bool const recoverEBFE,
+      bool const recoverEEFE,
+      // for flags setting
+      int const* expanded_v_DB_reco_flags,
+      uint32_t const* expanded_Sizes_v_DB_reco_flags,
+      uint32_t const* expanded_flagbit_v_DB_reco_flags,
+      uint32_t expanded_v_DB_reco_flagsSize,
+      uint32_t flagmask,
+      // conditions
+      float const* adc2gev,
+      float const* intercalib,
+      uint16_t const* status,
+      float const* apdpnrefs,
+      float const* alphas,
+      // input for transparency corrections
+      float const* p1,
+      float const* p2,
+      float const* p3,
+      edm::TimeValue_t const* t1,
+      edm::TimeValue_t const* t2,
+      edm::TimeValue_t const* t3,
+      // input for linear corrections
+      float const* lp1,
+      float const* lp2,
+      float const* lp3,
+      edm::TimeValue_t const* lt1,
+      edm::TimeValue_t const* lt2,
+      edm::TimeValue_t const* lt3,
+      // time, used for time dependent corrections
+      edm::TimeValue_t const event_time,
+      // input
+      uint32_t const* did_eb,
+      uint32_t const* did_ee,
+      ::ecal::reco::StorageScalarType const* amplitude_eb,   // in adc counts
+      ::ecal::reco::StorageScalarType const* amplitude_ee,   // in adc counts
+      ::ecal::reco::StorageScalarType const* time_eb,
+      ::ecal::reco::StorageScalarType const* time_ee,
+      ::ecal::reco::StorageScalarType const* chi2_eb,
+      ::ecal::reco::StorageScalarType const* chi2_ee,
+      uint32_t const* flags_eb,
+      uint32_t const* flags_ee,
+      // output
+      uint32_t *did,
+      ::ecal::reco::StorageScalarType* energy,   // in energy [GeV]
+      ::ecal::reco::StorageScalarType* time,
+      ::ecal::reco::StorageScalarType* chi2,
+      uint32_t* flagBits,
+      uint32_t* extra,
+      int const nchannels,
+      uint32_t const offsetForInput,
+      uint32_t const offsetForHashes
+    );
+
+    // host version, to be called by the plugin;
+    // the CUDA stream to work on is passed as the last argument
+    void create_ecal_rehit(
+      EventInputDataGPU const& eventInputGPU,
+      EventOutputDataGPU&      eventOutputGPU,
+      //     eventDataForScratchGPU_,
+      ConditionsProducts const& conditions,
+      ConfigurationParameters const& configParameters,
+      uint32_t const offsetForInput,
+      edm::TimeValue_t const event_time,
+      cudaStream_t cudaStream
+    );
+  }  // namespace rechit
+}  // namespace ecal
+
+#endif  // RecoLocalCalo_EcalRecAlgos_src_EcalRecHitBuilderKernels_h
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
index b85f002464f65..b6aee22e7da6f 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
@@ -4,7 +4,7 @@
 #include "DataFormats/EcalDetId/interface/EEDetId.h"
 
 namespace ecal {
-  namespace multifit {
+  namespace reconstruction {
 
     namespace internal {
 
@@ -16,6 +16,137 @@ namespace ecal {
 
         __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; }
 
+        
+        
+        // Maps a supermodule number onto a DCC number.
+        //   ism > 18  (handled as iz = -1): returns 9 + (ism - 18)
+        //   otherwise (handled as iz = +1): returns 9 + ism + 18
+        __device__ int dccFromSm(int ism) {
+          bool const minusSide = ism > 18;
+          if (minusSide) {
+            // subtract 18 before adding the base offset of 9
+            return 9 + (ism - 18);
+          }
+          return 9 + ism + 18;
+        }
+        
+        // Supermodule number of the crystal at (ieta, iphi).
+        //   - an iphi above 360 is first reduced by 360
+        //   - the base number is (iphi - 1) / 20 + 1
+        //   - 18 is added for negative ieta
+        //   (ieta only contributes through its sign)
+        __device__ int sm(int ieta, int iphi) {
+          int const phi = (iphi > 360) ? iphi - 360 : iphi;
+          int const base = (phi - 1) / 20 + 1;
+          if (ieta < 0) {
+            return base + 18;
+          }
+          return base;
+        }
+        
+        
+        // DCC number of the crystal at (ieta, iphi): the result of sm() mapped by dccFromSm().
+        __device__ int dcc(int ieta, int iphi) {
+          return dccFromSm(sm(ieta, iphi));
+        }
+        
+        
+        
+        
+        //        
+        // ---- helpers used to derive the EB laser monitoring region (see laser_monitoring_region_EB)
+        //        
+        
+        
+        // Returns the entry of the 4-row x 17-column lookup table for cell (iX, iY);
+        // iY selects the row (iY = 3 is the first row, iY = 0 the last) and iX the column.
+        // Returns -1 when the flattened index falls outside the table.
+        __device__ int lm_channel (int iX, int iY) {
+          static const int idx_[] = {
+            // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+            1, 2, 2, 2, 2, 4, 4, 4, 4,
+            6, 6, 6, 6, 8, 8, 8, 8,  // 3
+            1, 2, 2, 2, 2, 4, 4, 4, 4,
+            6, 6, 6, 6, 8, 8, 8, 8,  // 2
+            1, 3, 3, 3, 3, 5, 5, 5, 5,
+            7, 7, 7, 7, 9, 9, 9, 9,  // 1
+            1, 3, 3, 3, 3, 5, 5, 5, 5,
+            7, 7, 7, 7, 9, 9, 9, 9  // 0
+            // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+          };
+
+          int const nEntries = (int)(sizeof(idx_) / sizeof(int));
+          int const iym = 4;   // number of rows
+          int const ixm = 17;  // number of columns
+          int const il = iym - (iY + 1);  // row index in the table
+          int const ic = iX;              // column index in the table
+          int const ii = il * ixm + ic;
+          // valid indices are 0 .. nEntries - 1, so ii == nEntries must be rejected as well
+          if (ii < 0 || ii >= nEntries) {
+            return -1;
+          }
+          return idx_[ii];
+        }
+        
+        
+        
+        //
+        // Local x coordinate of the crystal at (ieta, iphi).
+        //
+        // The result is |ieta| - 1: the sign of ieta is dropped and the value is
+        // shifted so that ieta = +-1 maps to 0.
+        //
+        // iphi does not enter the computation; the parameter is kept so that the
+        // signature matches localCoord_y.
+        //
+        __device__ int localCoord_x (int ieta, int iphi) {
+          int absEta = ieta;
+          if (ieta < 0) {
+            absEta = -ieta;
+          }
+
+          // shift to a zero-based coordinate
+          return absEta - 1;
+        }
+        
+        
+        //
+        // Local y coordinate of the crystal at (ieta, iphi).
+        //
+        // An iphi above 360 is first reduced by 360; the coordinate is then
+        // (iphi - 1) % 20, mirrored to 19 - value when ieta is negative.
+        //
+        __device__ int localCoord_y (int ieta, int iphi) {
+          int phi = iphi;
+          if (phi > 360) {
+            phi -= 360;
+          }
+
+          int const withinModule = (phi - 1) % 20;
+          if (ieta < 0) {
+            return 19 - withinModule;
+          }
+          return withinModule;
+        }
+        
+        
+        // Lookup-table value (lm_channel) for the crystal at (ieta, iphi):
+        // both local coordinates are divided by 5 before the lookup.
+        __device__ int lmmod (int ieta, int iphi) {
+          int const blockX = localCoord_x(ieta, iphi) / 5;
+          int const blockY = localCoord_y(ieta, iphi) / 5;
+          return lm_channel(blockX, blockY);
+        }
+        
+        
+        
+        // 1 when lmmod(ieta, iphi) is even, 0 otherwise.
+        __device__ int side (int ieta, int iphi) {
+          return (lmmod(ieta, iphi) % 2 != 0) ? 0 : 1;
+        }
+        
+        
+        
       }  // namespace barrel
 
     }  // namespace internal
@@ -25,6 +156,41 @@ namespace ecal {
       return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
     }
 
+    
+    
+    // 
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc
+    //  function: "lmr"
+    
+    //
+    // Laser monitoring region (LMR) of an EB channel.
+    //
+    //   id : raw id of the channel; decoded with positiveZ(), ietaAbs() and
+    //        iphi() from internal::barrel
+    //
+    // The result is  1 + 2 * (ism - 1) + iside  with
+    //   ism   = dcc(ieta, iphi) - 9
+    //   iside = side(ieta, iphi)
+    // where ieta is signed: positive when positiveZ(id) holds, negative otherwise.
+    //
+    __device__ 
+    int laser_monitoring_region_EB(uint32_t id) {
+      using namespace internal::barrel;
+
+      // signed ieta and iphi of the channel
+      int const ieta = positiveZ(id) ? ietaAbs(id) : -ietaAbs(id);
+      int const phi = (int) (iphi(id));
+
+      int const ism = dcc(ieta, phi) - 9;
+      int const iside = side(ieta, phi);
+
+      // two regions per value of ism, selected by iside
+      return ( 1 + 2 * (ism - 1) + iside );
+    }
+    
+    
+    
+    
     namespace internal {
 
       namespace endcap {
@@ -60,6 +226,96 @@ namespace ecal {
             6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104,
             7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
 
+            
+            //
+            // Quadrant of the cell (iX, iY):
+            //
+            //   iX >= 11 and iY >= 11 -> 1
+            //   iX <  11 and iY >= 11 -> 2
+            //   iX <  11 and iY <  11 -> 3
+            //   iX >= 11 and iY <  11 -> 4
+            //
+            __device__ int quadrant(int iX, int iY) {
+              bool const top = iY >= 11;
+
+              if (iX >= 11) {
+                // near side
+                return top ? 1 : 4;
+              }
+              // far side
+              return top ? 2 : 3;
+            }
+            
+            // Returns the entry of the 20 x 20 lookup table for cell (iX, iY), both 1-based;
+            // returns -1 when the index is outside the table or the entry is 0.
+            __device__ int sector(int iX, int iY) {
+              //  Y (towards the surface)
+              //  T
+              //  |
+              //  |
+              //  |
+              //  o---------| X  (towards center of LHC)
+              //
+              static const int idx_[] = {
+                // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+                0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9,
+                9, 9, 0, 0, 0, 0, 0, 0, 0,  // 20
+                0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9,
+                9, 9, 9, 9, 9, 0, 0, 0, 0,  // 19
+                0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9,
+                9, 9, 9, 9, 9, 8, 0, 0, 0,  // 18
+                0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9,
+                9, 9, 9, 9, 8, 8, 8, 0, 0,  // 17
+                0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9,
+                9, 9, 9, 9, 8, 8, 8, 8, 0,  // 16
+                0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9,
+                9, 9, 9, 8, 8, 8, 8, 8, 0,  // 15
+                0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9,
+                9, 9, 8, 8, 8, 8, 8, 8, 0,  // 14
+                2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9,
+                9, 8, 8, 8, 8, 8, 8, 8, 8,  // 13
+                3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0,
+                8, 8, 8, 8, 8, 8, 8, 7, 7,  // 12
+                3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0,
+                0, 8, 7, 7, 7, 7, 7, 7, 7,  // 11
+                3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0,
+                0, 7, 7, 7, 7, 7, 7, 7, 7,  // 10
+                3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0,
+                6, 6, 7, 7, 7, 7, 7, 7, 7,  // 9
+                3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5,
+                6, 6, 6, 7, 7, 7, 7, 7, 7,  // 8
+                0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5,
+                6, 6, 6, 6, 6, 7, 7, 7, 0,  // 7
+                0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+                5, 6, 6, 6, 6, 6, 6, 7, 0,  // 6
+                0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+                5, 6, 6, 6, 6, 6, 6, 6, 0,  // 5
+                0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+                5, 6, 6, 6, 6, 6, 6, 0, 0,  // 4
+                0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5,
+                5, 5, 6, 6, 6, 6, 0, 0, 0,  // 3
+                0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5,
+                5, 5, 6, 6, 6, 0, 0, 0, 0,  // 2
+                0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5,
+                5, 5, 0, 0, 0, 0, 0, 0, 0  // 1
+                // 1  2  3  4  5  6  7  8  9 10   11 12 13 14 15 16 17 18 19 20
+              };
+
+              int const nEntries = (int)(sizeof(idx_) / sizeof(int));
+              int const iym = 20;
+              int const ixm = 20;
+              int const il = iym - iY;  // row index: iY = 20 is the first row of the table
+              int const ic = iX - 1;    // column index
+              int const ii = il * ixm + ic;
+              // valid indices are 0 .. nEntries - 1; the bounds are tested before idx_[ii] is read
+              if (ii < 0 || ii >= nEntries || idx_[ii] == 0) {
+                return -1;
+              }
+              return idx_[ii];
+            }
+            
+            
+            
       }  // namespace endcap
 
     }  // namespace internal
@@ -72,5 +328,53 @@ namespace ecal {
       return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]);
     }
 
-  }  // namespace multifit
+    
+    
+    
+    // 
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc
+    // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc
+    // 
+    
+    //
+    // Laser monitoring region (LMR) of an EE channel; -1 when sector() finds no
+    // sector for the supercrystal of the channel.
+    //
+    //   id : raw id of the channel; decoded with ix(), iy() and positiveZ()
+    //
+    __device__ 
+    int laser_monitoring_region_EE(uint32_t id) {
+      using namespace internal::endcap;
+
+      // SuperCrysCoord: (crystal coordinate - 1) / 5 + 1
+      uint32_t const iX = (ix(id) - 1) / 5 + 1;
+      uint32_t const iY = (iy(id) - 1) / 5 + 1;
+
+      int const isect = sector(iX, iY);
+      if (isect < 0) {
+        return -1;
+      }
+      int const iquad = quadrant(iX, iY);
+
+      int ilmr = isect - 6;
+      if (ilmr <= 0) {
+        ilmr += 9;
+      }
+      if (ilmr == 9) {
+        ilmr++;
+      }
+      if (ilmr == 8 && iquad == 4) {
+        ilmr++;
+      }
+
+      // zside convention: iz = +1 for EE+ (positiveZ), iz = -1 for EE-
+      //   https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71
+      // EE+ regions are offset by 72, EE- regions by 82
+      return ilmr + (positiveZ(id) ? 72 : 82);
+    }
+    
+    
+    
+    
+  }  // namespace reconstruction
 }  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
index b148ab91915d1..3a8125bbe8fb1 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
@@ -2,13 +2,20 @@
 #define RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
 
 namespace ecal {
-  namespace multifit {
-
-    __device__ uint32_t hashedIndexEB(uint32_t id);
+  namespace reconstruction {
+  // hashed index of an EB channel, computed from its raw id
+  __device__ uint32_t hashedIndexEB(uint32_t id);
+  // hashed index of an EE channel, computed from its raw id
+  __device__ uint32_t hashedIndexEE(uint32_t id);
+  
+  // laser monitoring region of an EB channel, computed from its raw id
+  __device__ int laser_monitoring_region_EB(uint32_t id);
+  // laser monitoring region of an EE channel, computed from its raw id; -1 when no sector is found
+  __device__ int laser_monitoring_region_EE(uint32_t id);
+  
+  }  // namespace reconstruction
+}  // namespace ecal
 
-    __device__ uint32_t hashedIndexEE(uint32_t id);
+#endif // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
 
-  }  // namespace multifit
-}  // namespace ecal
 
-#endif  // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
diff --git a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
index 3726ea43d95db..ce4426df03227 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu
@@ -852,7 +852,7 @@ namespace ecal {
         auto const did = DetId{dids[ch]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
         auto const sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
-        auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
 
         // set pedestal
         // TODO this branch is non-divergent for a group of 10 threads
@@ -1022,7 +1022,7 @@ namespace ecal {
 
       auto const did = DetId{dids[gtx]};
       auto const isBarrel = did.subdetId() == EcalBarrel;
-      auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId());
+      auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
       auto const* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
       auto const* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
       auto const amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
new file mode 100644
index 0000000000000..7216c6edb7e73
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
@@ -0,0 +1,190 @@
+#include <iostream>
+
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h" 
+
+
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+
+// Copies the ECAL RecHits from device memory into a host SoA and puts it into the event (EB and EE).
+class EcalCPURecHitProducer
+: public edm::stream::EDProducer<edm::ExternalWork>
+{
+public:
+  explicit EcalCPURecHitProducer(edm::ParameterSet const& ps);
+  ~EcalCPURecHitProducer() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+private:
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  edm::EDGetTokenT<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>> recHitsInEBToken_;
+  edm::EDGetTokenT<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>> recHitsInEEToken_;
+  edm::EDPutTokenT<ecal::RecHit<ecal::Tag::soa>> recHitsOutEBToken_;
+  edm::EDPutTokenT<ecal::RecHit<ecal::Tag::soa>> recHitsOutEEToken_;
+  // host buffers: resized in acquire(), moved into the event in produce()
+  ecal::RecHit<ecal::Tag::soa> recHitsEB_;
+  ecal::RecHit<ecal::Tag::soa> recHitsEE_;
+  bool containsTimingInformation_;
+};
+
+// Declares the configuration parameters of the module together with their defaults.
+void EcalCPURecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+  // inputs: device products
+  desc.add<edm::InputTag>("recHitsInLabelEB", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEB"});
+  desc.add<edm::InputTag>("recHitsInLabelEE", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEE"});
+  // outputs: instance labels of the host products
+  desc.add<std::string>("recHitsOutLabelEB", "EcalRecHitsEB");
+  desc.add<std::string>("recHitsOutLabelEE", "EcalRecHitsEE");
+  desc.add<bool>("containsTimingInformation", false);
+  std::string const label{"ecalCPURecHitProducer"};
+  confDesc.add(label, desc);
+}
+  
+  EcalCPURecHitProducer::EcalCPURecHitProducer(
+    const edm::ParameterSet& ps) 
+  : recHitsInEBToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("recHitsInLabelEB"))}
+  , recHitsInEEToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("recHitsInLabelEE"))}
+  , recHitsOutEBToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEB"))}
+  , recHitsOutEEToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEE"))}
+  , containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")}
+  {}
+  
+  EcalCPURecHitProducer::~EcalCPURecHitProducer() {}
+  
+  //
+  // Enqueues the asynchronous device-to-host copies of the EB and EE RecHit
+  // arrays on the CUDA stream of the context. The host buffers filled here are
+  // handed over to the event in produce().
+  //
+  // NOTE(review): the copies are asynchronous, so the host buffers must not be
+  // read inside this function; presumably the framework calls produce() only
+  // after the work queued on ctx.stream() has completed - confirm.
+  //
+  void EcalCPURecHitProducer::acquire(
+    edm::Event const& event,
+    edm::EventSetup const& setup,
+    edm::WaitingTaskWithArenaHolder taskHolder)
+  {
+    // retrieve data/ctx
+    auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
+    auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
+    cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
+    auto const& ebRecHits = ctx.get(ebRecHitsProduct);
+    auto const& eeRecHits = ctx.get(eeRecHitsProduct);
+
+    // resize the output buffers
+    recHitsEB_.resize(ebRecHits.size);
+    recHitsEE_.resize(eeRecHits.size);
+
+    //
+    // enqueue transfers
+    //
+    // the byte count of every copy uses the element type of the copied array:
+    //   did, extra, flagBits : uint32_t
+    //   energy, time, chi2   : ::ecal::reco::StorageScalarType
+    //     (./CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h: using StorageScalarType = float;)
+    //
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(),
+                               ebRecHits.did,
+                               recHitsEB_.did.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(),
+                               eeRecHits.did,
+                               recHitsEE_.did.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.energy.data(),
+                               ebRecHits.energy,
+                               recHitsEB_.energy.size() * sizeof(::ecal::reco::StorageScalarType),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.energy.data(),
+                               eeRecHits.energy,
+                               recHitsEE_.energy.size() * sizeof(::ecal::reco::StorageScalarType),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    // time is read by the consumers of the host product (e.g. the conversion
+    // to the legacy EcalRecHit format in EcalRecHitConvertGPU2CPUFormat), so
+    // it has to be transferred like the other arrays
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.time.data(),
+                               ebRecHits.time,
+                               recHitsEB_.time.size() * sizeof(::ecal::reco::StorageScalarType),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.time.data(),
+                               eeRecHits.time,
+                               recHitsEE_.time.size() * sizeof(::ecal::reco::StorageScalarType),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(),
+                               ebRecHits.chi2,
+                               recHitsEB_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.chi2.data(),
+                               eeRecHits.chi2,
+                               recHitsEE_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.extra.data(),
+                               ebRecHits.extra,
+                               recHitsEB_.extra.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.extra.data(),
+                               eeRecHits.extra,
+                               recHitsEE_.extra.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.flagBits.data(),
+                               ebRecHits.flagBits,
+                               recHitsEB_.flagBits.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.flagBits.data(),
+                               eeRecHits.flagBits,
+                               recHitsEE_.flagBits.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+  }
+  
+  // Moves the host buffers filled by the transfers enqueued in acquire() into the event.
+  void EcalCPURecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+    using HostRecHits = ecal::RecHit<ecal::Tag::soa>;
+
+    // wrap the member buffers; the members are left in a moved-from state
+    auto ebOut = std::make_unique<HostRecHits>(std::move(recHitsEB_));
+    auto eeOut = std::make_unique<HostRecHits>(std::move(recHitsEE_));
+
+    // put into event
+    event.put(recHitsOutEBToken_, std::move(ebOut));
+    event.put(recHitsOutEEToken_, std::move(eeOut));
+  }
+  
+  DEFINE_FWK_MODULE(EcalCPURecHitProducer);
+  
+  
+  
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
new file mode 100644
index 0000000000000..54d772efa806b
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
@@ -0,0 +1,137 @@
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h" 
+
+// algorithm specific
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+
+#include <iostream>
+
+// Converts the host-side RecHit SoA (EB and EE) into EBRecHitCollection / EERecHitCollection.
+class EcalRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
+public:
+  explicit EcalRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
+  ~EcalRecHitConvertGPU2CPUFormat() override;
+  static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+  using GPURecHitType = ecal::RecHit<ecal::Tag::soa>;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
+  // input SoA collections
+  edm::EDGetTokenT<ecal::SoARecHitCollection> const recHitsGPUEB_;
+  edm::EDGetTokenT<ecal::SoARecHitCollection> const recHitsGPUEE_;
+  // instance labels of the output collections
+  std::string const recHitsLabelCPUEB_;
+  std::string const recHitsLabelCPUEE_;
+};
+
+// Declares the configuration parameters of the module together with their defaults.
+void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
+  // inputs: host-side SoA collections
+  desc.add<edm::InputTag>("recHitsLabelGPUEB", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEB"));
+  desc.add<edm::InputTag>("recHitsLabelGPUEE", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEE"));
+  // outputs: instance labels of the converted collections
+  desc.add<std::string>("recHitsLabelCPUEB", "EcalRecHitsEB");
+  desc.add<std::string>("recHitsLabelCPUEE", "EcalRecHitsEE");
+
+  std::string const label{"ecalRecHitConvertGPU2CPUFormat"};
+  confDesc.add(label, desc);
+}
+  
+  EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps) 
+  : recHitsGPUEB_{consumes<ecal::SoARecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))}
+  , recHitsGPUEE_{consumes<ecal::SoARecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))}
+  , recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")}
+  , recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")}
+  {
+    produces<EBRecHitCollection>(recHitsLabelCPUEB_);
+    produces<EERecHitCollection>(recHitsLabelCPUEE_);
+  }
+  
+  EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {}
+  
+  //
+  // Converts the EB and EE SoA RecHits into EBRecHitCollection / EERecHitCollection.
+  //
+  // Save only if energy is >= 0 !
+  // This is extremely important because the channels that were supposed
+  // to be excluded get "-1" as energy
+  //
+  void EcalRecHitConvertGPU2CPUFormat::produce(
+    edm::Event& event,
+    edm::EventSetup const& setup)
+  {
+    edm::Handle<ecal::SoARecHitCollection> hRecHitsGPUEB;
+    edm::Handle<ecal::SoARecHitCollection> hRecHitsGPUEE;
+    event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
+    event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
+
+    auto recHitsCPUEB = std::make_unique<EBRecHitCollection>();
+    auto recHitsCPUEE = std::make_unique<EERecHitCollection>();
+
+    auto const& soaEB = *hRecHitsGPUEB;
+    auto const& soaEE = *hRecHitsGPUEE;
+    recHitsCPUEB->reserve(soaEB.energy.size());
+    recHitsCPUEE->reserve(soaEE.energy.size());
+
+    //
+    // every output hit is built with
+    //     explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0):
+    //
+    // ---- barrel
+    for (uint32_t i = 0; i < soaEB.energy.size(); ++i) {
+      // excluded channels carry a negative energy: skip them
+      if (!(soaEB.energy[i] >= 0)) {
+        continue;
+      }
+
+      recHitsCPUEB->emplace_back(
+        DetId{soaEB.did[i]},
+        soaEB.energy[i],
+        soaEB.time[i],
+        soaEB.extra[i],
+        soaEB.flagBits[i]
+      );
+
+      // not filled yet:
+      //   setJitterError(timeError[i])
+      //   setOutOfTimeAmplitude(sample, energysAll[i * EcalDataFrame::MAXSAMPLES + sample])
+      //     for every sample below EcalDataFrame::MAXSAMPLES
+    }
+
+    // ---- endcap
+    for (uint32_t i = 0; i < soaEE.energy.size(); ++i) {
+      // excluded channels carry a negative energy: skip them
+      if (!(soaEE.energy[i] >= 0)) {
+        continue;
+      }
+
+      recHitsCPUEE->emplace_back(
+        DetId{soaEE.did[i]},
+        soaEE.energy[i],
+        soaEE.time[i],
+        soaEE.extra[i],
+        soaEE.flagBits[i]
+      );
+
+      // not filled yet:
+      //   setJitterError(timeError[i])
+      //   setOutOfTimeAmplitude(sample, energysAll[i * EcalDataFrame::MAXSAMPLES + sample])
+      //     for every sample below EcalDataFrame::MAXSAMPLES
+    }
+
+    event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
+    event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
+  }
+  
+  DEFINE_FWK_MODULE(EcalRecHitConvertGPU2CPUFormat);
+  
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py b/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
new file mode 100644
index 0000000000000..76299519b51dc
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
@@ -0,0 +1,132 @@
+import FWCore.ParameterSet.Config as cms
+
+from RecoLocalCalo.EcalRecAlgos.ecalCleaningAlgo import cleaningAlgoConfig 
+
+# rechit producer
+ecalRecHitGPU = cms.EDProducer("EcalRecHitProducerGPU",
+                               
+    uncalibrecHitsInLabelEB = cms.InputTag("ecalUncalibRecHitProducerGPU","EcalUncalibRecHitsEB"),
+    uncalibrecHitsInLabelEE = cms.InputTag("ecalUncalibRecHitProducerGPU","EcalUncalibRecHitsEE"),
+          
+    #recHitsLabelEB = cms.string("EcalRecHitsGPUEB"),
+    #recHitsLabelEE = cms.string("EcalRecHitsGPUEE"),
+    recHitsLabelEB = cms.string("EcalRecHitsEB"),
+    recHitsLabelEE = cms.string("EcalRecHitsEE"),
+ 
+    maxNumberHits = cms.uint32(20000),  # FIXME AM
+  
+  
+    #EErechitCollection = cms.string('EcalRecHitsEE'),
+    #EEuncalibRecHitCollection = cms.InputTag("ecalMultiFitUncalibRecHit","EcalUncalibRecHitsEE"),
+    #EBuncalibRecHitCollection = cms.InputTag("ecalMultiFitUncalibRecHit","EcalUncalibRecHitsEB"),
+    #EBrechitCollection = cms.string('EcalRecHitsEB'),
+   
+    ## db statuses to be excluded from reconstruction (some will be recovered)
+    ChannelStatusToBeExcluded = cms.vstring(   'kDAC',
+                                               'kNoisy',
+                                               'kNNoisy',
+                                               'kFixedG6',
+                                               'kFixedG1',
+                                               'kFixedG0',
+                                               'kNonRespondingIsolated',
+                                               'kDeadVFE',
+                                               'kDeadFE',
+                                               'kNoDataNoTP',
+                                               #
+                                               # TODO (AM): should the following be added here?
+                                               # next ones from "flagsMapDBReco"
+                                               # but not defined in "EcalChannelStatusCode.h"
+                                               # but they are defined in "EcalRecHit.h"
+                                               #
+                                               #'kKilled',
+                                               #'kTPSaturated',
+                                               #'kL1SpikeFlag',
+                                               ),
+    
+    ## avoid propagation of dead channels other than after recovery
+    killDeadChannels = cms.bool(True),
+    #algo = cms.string("EcalRecHitWorkerSimple"),
+    
+    ## define maximal and minimal values for the laser corrections
+    
+    EBLaserMIN = cms.double(0.01),                    #    EBLaserMIN = cms.double(0.5),
+    EELaserMIN = cms.double(0.01),                    #    EELaserMIN = cms.double(0.5),
+                                                     
+    EBLaserMAX = cms.double(30.0),                    #    EBLaserMAX = cms.double(3.0),
+    EELaserMAX = cms.double(30.0),                    #    EELaserMAX = cms.double(8.0),
+
+
+    ## useful if time is not calculated, as at HLT                        
+    #skipTimeCalib = cms.bool(False),                         
+
+    ## apply laser corrections
+    #laserCorrection = cms.bool(True),
+                            
+    ## reco flags association to DB flag
+    flagsMapDBReco = cms.PSet(
+        kGood  = cms.vstring('kOk','kDAC','kNoLaser','kNoisy'),
+        kNoisy = cms.vstring('kNNoisy','kFixedG6','kFixedG1'),
+        kNeighboursRecovered = cms.vstring('kFixedG0',
+                                           'kNonRespondingIsolated',
+                                           'kDeadVFE'),
+        kTowerRecovered = cms.vstring('kDeadFE'),
+        kDead           = cms.vstring('kNoDataNoTP')
+        ), 
+        
+#//         flagmask_ |= 0x1 << EcalRecHit::kNeighboursRecovered;
+#//         flagmask_ |= 0x1 << EcalRecHit::kTowerRecovered;
+#//         flagmask_ |= 0x1 << EcalRecHit::kDead;
+#//         flagmask_ |= 0x1 << EcalRecHit::kKilled;
+#//         flagmask_ |= 0x1 << EcalRecHit::kTPSaturated;
+#//         flagmask_ |= 0x1 << EcalRecHit::kL1SpikeFlag;
+
+
+                            
+    ## for channel recovery
+    #algoRecover = cms.string("EcalRecHitWorkerRecover"),
+    recoverEBIsolatedChannels = cms.bool(False),
+    recoverEEIsolatedChannels = cms.bool(False),
+    recoverEBVFE  = cms.bool(False),
+    recoverEEVFE  = cms.bool(False),
+    recoverEBFE = cms.bool(True),
+    recoverEEFE = cms.bool(True),
+
+    ##db statuses for which recovery in EE/EB should not be attempted           
+    #dbStatusToBeExcludedEE = cms.vint32(
+                                        #14,  # dead, no TP
+                                        #78,  # dead, HV off
+                                        #142, # dead,LV off
+                                        #), 
+    #dbStatusToBeExcludedEB = cms.vint32(
+                                        #14,  # dead, no TP
+                                        #78,  # dead, HV off
+                                        #142, # dead,LV off
+                                        #), 
+    
+    ## --- logWarnings for saturated DeadFEs
+    ## if the logWarningThreshold is negative the algo will not attempt recovery (recovery in EE is not tested, so a negative threshold, e.g. -1.e+9, may be needed there)
+    ## if you want to enable recovery but do not wish to throw logWarnings, set the logWarningThresholds very high, e.g. +1.e+9
+    ##  ~64 GeV is the TP saturation level
+    #logWarningEtThreshold_EB_FE = cms.double(50),# in EB logWarningThreshold is actually in E (GeV)
+    #logWarningEtThreshold_EE_FE = cms.double(50),# in EE the energy should correspond to Et (GeV) but the recovered values of energies are not tested if make sense
+    #ebDetIdToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:ebDetId"),
+    #eeDetIdToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:eeDetId"),
+    #ebFEToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:ebFE"),
+    #eeFEToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:eeFE"),
+    #singleChannelRecoveryMethod = cms.string("NeuralNetworks"),
+    #singleChannelRecoveryThreshold = cms.double(8),
+    #triggerPrimitiveDigiCollection = cms.InputTag("ecalDigis:EcalTriggerPrimitives"),
+    #cleaningConfig=cleaningAlgoConfig,
+
+    )
+
+
+
+#from Configuration.Eras.Modifier_fastSim_cff import fastSim
+## no flags for bad channels in FastSim
+#fastSim.toModify(ecalRecHit, 
+                 #killDeadChannels = False,
+                 #recoverEBFE = False,
+                 #recoverEEFE = False)
+
+
diff --git a/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py b/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py
new file mode 100644
index 0000000000000..e993a7573b689
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py
@@ -0,0 +1,151 @@
+import FWCore.ParameterSet.Config as cms
+
+# input
+FastMonitoringService = cms.Service( "FastMonitoringService",
+    filePerFwkStream = cms.untracked.bool( False ),
+    fastMonIntervals = cms.untracked.uint32( 2 ),
+    sleepTime = cms.untracked.int32( 1 )
+)
+
+EvFDaqDirector = cms.Service( "EvFDaqDirector",
+    runNumber = cms.untracked.uint32( 321177 ),
+
+    baseDir = cms.untracked.string( "tmp" ),
+    buBaseDir = cms.untracked.string( "tmp" ),
+
+    useFileBroker = cms.untracked.bool( False ),
+    fileBrokerKeepAlive = cms.untracked.bool( True ),
+    fileBrokerPort = cms.untracked.string( "8080" ),
+    fileBrokerUseLocalLock = cms.untracked.bool( True ),
+    fuLockPollInterval = cms.untracked.uint32( 2000 ),
+
+    requireTransfersPSet = cms.untracked.bool( False ),
+    selectedTransferMode = cms.untracked.string( "" ),
+    mergingPset = cms.untracked.string( "" ),
+
+    outputAdler32Recheck = cms.untracked.bool( False ),
+)
+
+source = cms.Source( "FedRawDataInputSource",
+    runNumber = cms.untracked.uint32( 321177 ),
+    getLSFromFilename = cms.untracked.bool(True),
+    testModeNoBuilderUnit = cms.untracked.bool(False),
+    verifyAdler32 = cms.untracked.bool( True ),
+    verifyChecksum = cms.untracked.bool( True ),
+    useL1EventID = cms.untracked.bool( False ),         # True
+    alwaysStartFromfirstLS = cms.untracked.uint32( 0 ),
+
+    eventChunkBlock = cms.untracked.uint32( 240 ),      # 32
+    eventChunkSize = cms.untracked.uint32( 240),        # 32
+    maxBufferedFiles = cms.untracked.uint32( 8 ),       #  2
+    numBuffers = cms.untracked.uint32( 8 ),             #  2
+
+    fileListMode = cms.untracked.bool( True ),          # False
+    fileNames = cms.untracked.vstring(
+        #'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000004.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000000.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000001.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000002.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000003.raw',
+        '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000004.raw',
+    ),
+)
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py
new file mode 100644
index 0000000000000..7fdf723b67bdd
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py
@@ -0,0 +1,231 @@
+
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.StandardSequences.Eras import eras
+#from Configuration.ProcessModifiers.gpu_cff import gpu
+
+process = cms.Process('RECO', eras.Run2_2018)
+
+# import of standard configurations
+process.load('Configuration.StandardSequences.Services_cff')
+#process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+#process.load('Configuration.EventContent.EventContent_cff')
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff')
+#process.load('Configuration.StandardSequences.RawToDigi_Data_cff')
+#process.load('Configuration.StandardSequences.Reconstruction_Data_cff')
+#process.load('DQMOffline.Configuration.DQMOffline_cff')
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+
+
+
+
+
+
+# Other statements
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '')
+
+
+process.maxEvents = cms.untracked.PSet(
+    #input = cms.untracked.int32(100)
+    input = cms.untracked.int32(1000)
+)
+
+# load data using the DAQ source
+import sys, os, inspect
+sys.path.append(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))))
+process.load('sourceFromRawCmggpu_cff')
+
+#-----------------------------------------
+# CMSSW/Hcal non-DQM Related Module import
+#-----------------------------------------
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff")
+#process.load("RecoLocalCalo.Configuration.ecalLocalRecoSequence_cff")
+process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi")
+process.load("EventFilter.EcalRawToDigi.EcalUnpackerData_cfi")
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi")
+
+# load both cpu and gpu plugins
+#
+# ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalUncalibRecHitProducerGPU_cfi.py
+#
+process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi")
+#
+process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi")
+
+# for validation of gpu multifit products
+process.load("RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi")
+#
+# ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalCPUUncalibRecHitProducer_cfi.py
+#
+
+process.load("EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi")
+
+#process.ecalUncalibRecHitProducerGPU.kernelsVersion = 0
+#process.ecalUncalibRecHitProducerGPU.kernelMinimizeThreads = cms.vuint32(16, 1, 1)
+#
+# process.ecalUncalibRecHitProducerGPU.shouldRunTimingComputation = cms.bool(False)
+#
+
+
+process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi")
+
+#process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1)
+
+
+##
+## force HLT configuration for ecalMultiFitUncalibRecHit
+##
+
+process.ecalMultiFitUncalibRecHit.algoPSet = cms.PSet( 
+      ebSpikeThreshold = cms.double( 1.042 ),
+      EBtimeFitLimits_Upper = cms.double( 1.4 ),
+      EEtimeFitLimits_Lower = cms.double( 0.2 ),
+      timealgo = cms.string( "None" ),   # ----> no timing computation for CPU version
+      EBtimeNconst = cms.double( 28.5 ),
+      prefitMaxChiSqEE = cms.double( 10.0 ),
+      outOfTimeThresholdGain12mEB = cms.double( 5.0 ),
+      outOfTimeThresholdGain12mEE = cms.double( 1000.0 ),
+      EEtimeFitParameters = cms.vdouble( -2.390548, 3.553628, -17.62341, 67.67538, -133.213, 140.7432, -75.41106, 16.20277 ),
+      prefitMaxChiSqEB = cms.double( 25.0 ),
+      simplifiedNoiseModelForGainSwitch = cms.bool( True ),
+      EBtimeFitParameters = cms.vdouble( -2.015452, 3.130702, -12.3473, 41.88921, -82.83944, 91.01147, -50.35761, 11.05621 ),
+      selectiveBadSampleCriteriaEB = cms.bool( False ),
+      dynamicPedestalsEB = cms.bool( False ),
+      useLumiInfoRunHeader = cms.bool( False ),
+      EBamplitudeFitParameters = cms.vdouble( 1.138, 1.652 ),
+      doPrefitEE = cms.bool( False ),
+      dynamicPedestalsEE = cms.bool( False ),
+      selectiveBadSampleCriteriaEE = cms.bool( False ),
+      outOfTimeThresholdGain61pEE = cms.double( 1000.0 ),
+      outOfTimeThresholdGain61pEB = cms.double( 5.0 ),
+      activeBXs = cms.vint32( -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 ),
+      EcalPulseShapeParameters = cms.PSet( 
+        EEPulseShapeTemplate = cms.vdouble( 0.116442, 0.756246, 1.0, 0.897182, 0.686831, 0.491506, 0.344111, 0.245731, 0.174115, 0.123361, 0.0874288, 0.061957 ),
+        EEdigiCollection = cms.string( "" ),
+        EcalPreMixStage2 = cms.bool( False ),
+        EcalPreMixStage1 = cms.bool( False ),
+        EBPulseShapeCovariance = cms.vdouble( 3.001E-6, 1.233E-5, 0.0, -4.416E-6, -4.571E-6, -3.614E-6, -2.636E-6, -1.286E-6, -8.41E-7, -5.296E-7, 0.0, 0.0, 1.233E-5, 6.154E-5, 0.0, -2.2E-5, -2.309E-5, -1.838E-5, -1.373E-5, -7.334E-6, -5.088E-6, -3.745E-6, -2.428E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.416E-6, -2.2E-5, 0.0, 8.319E-6, 8.545E-6, 6.792E-6, 5.059E-6, 2.678E-6, 1.816E-6, 1.223E-6, 8.245E-7, 5.589E-7, -4.571E-6, -2.309E-5, 0.0, 8.545E-6, 9.182E-6, 7.219E-6, 5.388E-6, 2.853E-6, 1.944E-6, 1.324E-6, 9.083E-7, 6.335E-7, -3.614E-6, -1.838E-5, 0.0, 6.792E-6, 7.219E-6, 6.016E-6, 4.437E-6, 2.385E-6, 1.636E-6, 1.118E-6, 7.754E-7, 5.556E-7, -2.636E-6, -1.373E-5, 0.0, 5.059E-6, 5.388E-6, 4.437E-6, 3.602E-6, 1.917E-6, 1.322E-6, 9.079E-7, 6.529E-7, 4.752E-7, -1.286E-6, -7.334E-6, 0.0, 2.678E-6, 2.853E-6, 2.385E-6, 1.917E-6, 1.375E-6, 9.1E-7, 6.455E-7, 4.693E-7, 3.657E-7, -8.41E-7, -5.088E-6, 0.0, 1.816E-6, 1.944E-6, 1.636E-6, 1.322E-6, 9.1E-7, 9.115E-7, 6.062E-7, 4.436E-7, 3.422E-7, -5.296E-7, -3.745E-6, 0.0, 1.223E-6, 1.324E-6, 1.118E-6, 9.079E-7, 6.455E-7, 6.062E-7, 7.217E-7, 4.862E-7, 3.768E-7, 0.0, -2.428E-6, 0.0, 8.245E-7, 9.083E-7, 7.754E-7, 6.529E-7, 4.693E-7, 4.436E-7, 4.862E-7, 6.509E-7, 4.418E-7, 0.0, 0.0, 0.0, 5.589E-7, 6.335E-7, 5.556E-7, 4.752E-7, 3.657E-7, 3.422E-7, 3.768E-7, 4.418E-7, 6.142E-7 ),
+        ESdigiCollection = cms.string( "" ),
+        EBdigiCollection = cms.string( "" ),
+        EBCorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.73354, 0.64442, 0.58851, 0.55425, 0.53082, 0.51916, 0.51097, 0.50732, 0.50409 ),
+        EBCorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71073, 0.55721, 0.46089, 0.40449, 0.35931, 0.33924, 0.32439, 0.31581, 0.30481 ),
+        EBCorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.70946, 0.58021, 0.49846, 0.45006, 0.41366, 0.39699, 0.38478, 0.37847, 0.37055 ),
+        EEPulseShapeCovariance = cms.vdouble( 3.941E-5, 3.333E-5, 0.0, -1.449E-5, -1.661E-5, -1.424E-5, -1.183E-5, -6.842E-6, -4.915E-6, -3.411E-6, 0.0, 0.0, 3.333E-5, 2.862E-5, 0.0, -1.244E-5, -1.431E-5, -1.233E-5, -1.032E-5, -5.883E-6, -4.154E-6, -2.902E-6, -2.128E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.449E-5, -1.244E-5, 0.0, 5.84E-6, 6.649E-6, 5.72E-6, 4.812E-6, 2.708E-6, 1.869E-6, 1.33E-6, 9.186E-7, 6.446E-7, -1.661E-5, -1.431E-5, 0.0, 6.649E-6, 7.966E-6, 6.898E-6, 5.794E-6, 3.157E-6, 2.184E-6, 1.567E-6, 1.084E-6, 7.575E-7, -1.424E-5, -1.233E-5, 0.0, 5.72E-6, 6.898E-6, 6.341E-6, 5.347E-6, 2.859E-6, 1.991E-6, 1.431E-6, 9.839E-7, 6.886E-7, -1.183E-5, -1.032E-5, 0.0, 4.812E-6, 5.794E-6, 5.347E-6, 4.854E-6, 2.628E-6, 1.809E-6, 1.289E-6, 9.02E-7, 6.146E-7, -6.842E-6, -5.883E-6, 0.0, 2.708E-6, 3.157E-6, 2.859E-6, 2.628E-6, 1.863E-6, 1.296E-6, 8.882E-7, 6.108E-7, 4.283E-7, -4.915E-6, -4.154E-6, 0.0, 1.869E-6, 2.184E-6, 1.991E-6, 1.809E-6, 1.296E-6, 1.217E-6, 8.669E-7, 5.751E-7, 3.882E-7, -3.411E-6, -2.902E-6, 0.0, 1.33E-6, 1.567E-6, 1.431E-6, 1.289E-6, 8.882E-7, 8.669E-7, 9.522E-7, 6.717E-7, 4.293E-7, 0.0, -2.128E-6, 0.0, 9.186E-7, 1.084E-6, 9.839E-7, 9.02E-7, 6.108E-7, 5.751E-7, 6.717E-7, 7.911E-7, 5.493E-7, 0.0, 0.0, 0.0, 6.446E-7, 7.575E-7, 6.886E-7, 6.146E-7, 4.283E-7, 3.882E-7, 4.293E-7, 5.493E-7, 7.027E-7 ),
+        EBPulseShapeTemplate = cms.vdouble( 0.0113979, 0.758151, 1.0, 0.887744, 0.673548, 0.474332, 0.319561, 0.215144, 0.147464, 0.101087, 0.0693181, 0.0475044 ),
+        EECorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.72698, 0.62048, 0.55691, 0.51848, 0.49147, 0.47813, 0.47007, 0.46621, 0.46265 ),
+        EECorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71373, 0.44825, 0.30152, 0.21609, 0.14786, 0.11772, 0.10165, 0.09465, 0.08098 ),
+        UseLCcorrection = cms.untracked.bool( True ),
+        EECorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.71217, 0.47464, 0.34056, 0.26282, 0.20287, 0.17734, 0.16256, 0.15618, 0.14443 )
+      ),
+      doPrefitEB = cms.bool( False ),
+      addPedestalUncertaintyEE = cms.double( 0.0 ),
+      addPedestalUncertaintyEB = cms.double( 0.0 ),
+      gainSwitchUseMaxSampleEB = cms.bool( True ),
+      EEtimeNconst = cms.double( 31.8 ),
+      EEamplitudeFitParameters = cms.vdouble( 1.89, 1.4 ),
+      chi2ThreshEE_ = cms.double( 50.0 ),
+      eePulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ),
+      outOfTimeThresholdGain12pEB = cms.double( 5.0 ),
+      gainSwitchUseMaxSampleEE = cms.bool( False ),
+      mitigateBadSamplesEB = cms.bool( False ),
+      outOfTimeThresholdGain12pEE = cms.double( 1000.0 ),
+      ebPulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ),
+      ampErrorCalculation = cms.bool( False ),
+      mitigateBadSamplesEE = cms.bool( False ),
+      amplitudeThresholdEB = cms.double( 10.0 ),
+      kPoorRecoFlagEB = cms.bool( True ),
+      amplitudeThresholdEE = cms.double( 10.0 ),
+      EBtimeFitLimits_Lower = cms.double( 0.2 ),
+      kPoorRecoFlagEE = cms.bool( False ),
+      EEtimeFitLimits_Upper = cms.double( 1.4 ),
+      outOfTimeThresholdGain61mEE = cms.double( 1000.0 ),
+      EEtimeConstantTerm = cms.double( 1.0 ),
+      EBtimeConstantTerm = cms.double( 0.6 ),
+      chi2ThreshEB_ = cms.double( 65.0 ),
+      outOfTimeThresholdGain61mEB = cms.double( 5.0 )
+)     
+      
+##    
+    
+    
+    
+#process.load('Configuration.StandardSequences.Reconstruction_cff')
+#process.ecalRecHit
+
+    
+    
+#process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi")
+#process.ecalRecHitGPU
+    
+
+
+#process.hcalDigis.silent = cms.untracked.bool(False)
+#process.hcalDigis.InputLabel = rawTag
+process.ecalDigis = process.ecalEBunpacker.clone()
+process.ecalDigis.InputLabel = cms.InputTag('rawDataCollector')
+#process.hbheprerecogpu.processQIE11 = cms.bool(True)
+
+process.out = cms.OutputModule(
+    "PoolOutputModule",
+    fileName = cms.untracked.string("test_uncalib.root")
+)
+
+#process.out = cms.OutputModule("AsciiOutputModule",
+#    outputCommands = cms.untracked.vstring(
+#        'keep *_ecalMultiFitUncalibRecHit_*_*', 
+#    ),
+#    verbosity = cms.untracked.uint32(0)
+#)
+process.finalize = cms.EndPath(process.out)
+
+process.bunchSpacing = cms.Path(
+    process.bunchSpacingProducer
+)
+
+process.digiPath = cms.Path(
+    #process.hcalDigis
+    process.ecalDigis
+    *process.ecalRawToDigiGPU    
+)
+
+process.recoPath = cms.Path(
+    #(process.ecalMultiFitUncalibRecHit+process.ecalDetIdToBeRecovered)
+    process.ecalMultiFitUncalibRecHit
+    #*process.ecalRecHit
+#   gpu
+    *process.ecalUncalibRecHitProducerGPU
+    *process.ecalCPUUncalibRecHitProducer
+    #*process.ecalRecHitGPU
+)
+
+process.schedule = cms.Schedule(
+    process.bunchSpacing,
+    process.digiPath,
+    process.recoPath,
+#    process.ecalecalLocalRecoSequence
+    process.finalize
+)
+
+process.options = cms.untracked.PSet(
+    numberOfThreads = cms.untracked.uint32(8),
+    numberOfStreams = cms.untracked.uint32(8),
+    SkipEvent = cms.untracked.vstring('ProductNotFound'),
+    wantSummary = cms.untracked.bool(True)
+)
+
+# report CUDAService messages
+process.MessageLogger.categories.append("CUDAService")
+
+

From de950168d13a34f4fc5982ee033c10f29e7c8e85 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Fri, 10 Apr 2020 15:54:24 +0200
Subject: [PATCH 10/30] minor fix

---
 EventFilter/EcalRawToDigi/src/UnpackGPU.cu                      | 1 +
 .../EcalRecProducers/test/testEcalRechitProducer_cfg.py         | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
index a2e5057bbbf6a..a4742f85ef6ca 100644
--- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
+++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
@@ -317,6 +317,7 @@ namespace ecal {
                                                                    scratchGPU.pChannelsCounter,
                                                                    conditions.eMappingProduct.eid2did,
                                                                    nbytesTotal);
+      
       cudaCheck(cudaGetLastError());
 
       // transfer the counters for how many eb and ee channels we got
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
index 77d562242985b..a18d2c0ea7e4c 100644
--- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
@@ -154,7 +154,7 @@
     
 process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
-#process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
     
 process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi")

From ee0c5ea653a03ebfd2d01a205f5719c8b315e892 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Tue, 21 Apr 2020 15:27:47 +0200
Subject: [PATCH 11/30] tests ongoing

---
 CUDADataFormats/EcalRecHitSoA/BuildFile.xml   |  4 +---
 .../EcalRecHitSoA/src/classes_def.xml         |  1 +
 EventFilter/EcalRawToDigi/src/UnpackGPU.cu    |  3 ++-
 .../EcalRecAlgos/interface/DeclsForKernels.h  | 20 +++++++++----------
 .../src/AmplitudeComputationKernels.cu        |  2 +-
 .../EcalRecAlgos/src/EcalChannelStatusGPU.cc  |  2 ++
 .../src/EcalRecHitBuilderKernels.cu           | 15 +++++++++-----
 .../src/EcalRecHitBuilderKernels.h            |  1 +
 .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu  | 16 +++++++--------
 RecoLocalCalo/EcalRecProducers/BuildFile.xml  |  2 ++
 .../EcalRecProducers/plugins/BuildFile.xml    |  1 +
 .../plugins/EcalRecHitProducerGPU.cc          |  3 ---
 .../test/testEcalRechitProducer_cfg.py        |  8 ++++----
 13 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
index 3b6d026d40d11..aaaaf306dd7c7 100644
--- a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
+++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml
@@ -1,9 +1,7 @@
-<use name="cuda"/>
-<use name="CUDADataFormats/Common"/>
 <use name="DataFormats/Common"/>
-<use name="CUDADataFormats/Common"/>
 <use name="DataFormats/EcalDigi"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="CUDADataFormats/Common"/>
 <use name="cuda"/>
 
 <export>
diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
index 6721bfff3126c..266324f5fac31 100644
--- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
+++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml
@@ -2,6 +2,7 @@
     <class name="std::vector<float, CUDAHostAllocator<float, 0>>" />
     <class name="std::vector<double, CUDAHostAllocator<double, 0>>" />
     <class name="std::vector<uint32_t, CUDAHostAllocator<uint32_t, 0>>" />
+    <class name="std::vector<uint16_t, CUDAHostAllocator<uint16_t, 0>>" />
     <class name="ecal::Tag::soa"/>
     <class name="ecal::Detail::Base<ecal::Tag::soa>" />
 
diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
index a4742f85ef6ca..d8ffbec039b7c 100644
--- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
+++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
@@ -307,7 +307,8 @@ namespace ecal {
       cudaCheck(cudaMemcpyAsync(
           inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
 
-      kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(inputGPU.data,
+//       kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(inputGPU.data,
+      kernel_unpack_test<16><<<nfedsWithData, 16, 0, cudaStream>>>(inputGPU.data,
                                                                    inputGPU.offsets,
                                                                    inputGPU.feds,
                                                                    outputGPU.samplesEB,
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
index 6bc816fca5295..419e50b3636c6 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
@@ -15,7 +15,6 @@
 #include "CondFormats/EcalObjects/interface/EcalPedestals.h"
 #include "CondFormats/EcalObjects/interface/EcalGainRatios.h"
 #include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h"
-#include "CondFormats/EcalObjects/interface/EcalWeightSet.h"
 #include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
 
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
@@ -281,11 +280,13 @@ struct conf_data {
 
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
 
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h"
+
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h"
 
-
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
@@ -293,7 +294,6 @@ struct conf_data {
 
 
 
-
 namespace ecal { 
   namespace rechit {
     
@@ -378,14 +378,14 @@ namespace ecal {
     
     // const refs products to conditions
     struct ConditionsProducts {
-      EcalADCToGeVConstantGPU::Product    const& ADCToGeV;
-      EcalIntercalibConstantsGPU::Product const& Intercalib;
-      EcalChannelStatusGPU::Product       const& ChannelStatus;
+      EcalADCToGeVConstantGPU::Product    const& ADCToGeV      ;
+      EcalIntercalibConstantsGPU::Product const& Intercalib    ;
+      EcalChannelStatusGPU::Product       const& ChannelStatus ;
       //     
-      EcalLaserAPDPNRatiosGPU::Product     const& LaserAPDPNRatios   ;
-      EcalLaserAPDPNRatiosRefGPU::Product  const& LaserAPDPNRatiosRef;
-      EcalLaserAlphasGPU::Product          const& LaserAlphas        ;
-      EcalLinearCorrectionsGPU::Product    const& LinearCorrections  ;
+      EcalLaserAPDPNRatiosGPU::Product     const& LaserAPDPNRatios    ;
+      EcalLaserAPDPNRatiosRefGPU::Product  const& LaserAPDPNRatiosRef ;
+      EcalLaserAlphasGPU::Product          const& LaserAlphas         ;
+      EcalLinearCorrectionsGPU::Product    const& LinearCorrections   ;
       //     
       //     
       uint32_t offsetForHashes;    
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
index 23d9c12aa0582..c67677055c189 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
@@ -392,7 +392,7 @@ namespace ecal {
             50,
             offsetForHashes,
             offsetForInputs);
-        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());  // catch kernel launch errors
       }
 
     }  // namespace v1
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
index c1cdc6631878b..91293902bb667 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
@@ -45,3 +45,5 @@ EcalChannelStatusGPU::Product const& EcalChannelStatusGPU::getProduct(cudaStream
 }
 
 TYPELOOKUP_DATA_REG(EcalChannelStatusGPU);
+
+
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index 84aacc0cf5b33..5c50bdaa58f7f 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -648,15 +648,20 @@ namespace ecal {
       
       int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size ;
       
-      unsigned int nchannels_per_block = 32;
-      unsigned int threads_1d = nchannels_per_block;
-      unsigned int blocks_1d = (nchannels + threads_1d) / threads_1d; // TEST : to be optimized (AM)
-      
+//       unsigned int nchannels_per_block = 32;
+      unsigned int nchannels_per_block = 16;
+      unsigned int threads_min = nchannels_per_block;
+      unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min; // TEST : to be optimized (AM)
       
       // 
       // kernel create rechit
       //
-      kernel_create_ecal_rehit <<< blocks_1d, threads_1d >>> (
+      
+//       auto const nbytesShared = 2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
+      
+      kernel_create_ecal_rehit <<< blocks_min, threads_min, 0, cudaStream >>> (
+//       kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> (
+//       kernel_create_ecal_rehit <<< blocks_min, threads_min >>> (
         // configuration 
         configParameters.ChannelStatusToBeExcluded,
         configParameters.ChannelStatusToBeExcludedSize,
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
index 587abe0575883..a1809dbded6bd 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
@@ -3,6 +3,7 @@
 //
 
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
 
 #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
index c8d2926b29afc..dbfe4833c7d3f 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
@@ -83,7 +83,7 @@ namespace ecal {
           gainSwitchUseMaxSampleEB,
           gainSwitchUseMaxSampleEE,
           totalChannels);
-      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaGetLastError());  // catch kernel launch errors
 
       //
       // 2d preparation kernel
@@ -112,7 +112,7 @@ namespace ecal {
                                                                scratch.isSaturated,
                                                                offsetForHashes,
                                                                offsetForInputs);
-      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaGetLastError());  // catch kernel launch errors
 
       // run minimization kernels
       v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
@@ -148,7 +148,7 @@ namespace ecal {
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
             conditions.sampleMask.getEcalSampleMaskRecordEE(),
             totalChannels);
-        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());  // catch kernel launch errors
 
         //
         // TODO: small kernel only for EB. It needs to be checked if
@@ -170,7 +170,7 @@ namespace ecal {
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
             totalChannels,
             offsetForInputs);
-        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());  // catch kernel launch errors
 
         //
         //
@@ -186,7 +186,7 @@ namespace ecal {
             scratch.sum0sNullHypot,
             scratch.sumAAsNullHypot,
             totalChannels);
-        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());  // catch kernel launch errors
 
         unsigned int nchannels_per_block_makeratio = 10;
         unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
@@ -220,7 +220,7 @@ namespace ecal {
             configParameters.timeFitLimitsSecondEE,
             totalChannels,
             offsetForInputs);
-        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());  // catch kernel launch errors
 
         //
         //
@@ -252,7 +252,7 @@ namespace ecal {
                                                                     scratch.timeError,
                                                                     totalChannels,
                                                                     offsetForInputs);
-        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());  // catch kernel launch errors
 
         //
         //
@@ -298,7 +298,7 @@ namespace ecal {
             offsetForHashes,
             offsetForInputs,
             totalChannels);
-        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());  // catch kernel launch errors
       }
 
       /*
diff --git a/RecoLocalCalo/EcalRecProducers/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
index 836b1c5090955..59d0c5987d7fd 100644
--- a/RecoLocalCalo/EcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
@@ -1,9 +1,11 @@
+<use   name="FWCore/MessageLogger"/>
 <use   name="FWCore/Framework"/>
 <use   name="clhep"/>
 <use   name="CUDADataFormats/EcalRecHitSoA"/>
 <use   name="CondFormats/EcalObjects"/>
 <use   name="HeterogeneousCore/CUDACore"/>
 <use   name="HeterogeneousCore/CUDAUtilities"/>
+<use   name="cuda-api-wrappers"/>
 <use   name="cuda"/>
 <export>
   <lib   name="1"/>
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
index b4dfcc1cc3b0d..89e5e9d93c549 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
@@ -16,6 +16,7 @@
 <use   name="CUDADataFormats/EcalRecHitSoA"/>
 <use   name="HeterogeneousCore/CUDACore"/>
 <use   name="HeterogeneousCore/CUDAUtilities"/>
+<use   name="cuda-api-wrappers"/>
 <use   name="cuda"/>
 <library   file="*.cc" name="RecoLocalCaloEcalRecProducersPlugins">
   <flags   EDM_PLUGIN="1"/>
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index 69c3a95244ed8..a9d4bb9e670f4 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -401,9 +401,6 @@ void EcalRecHitProducerGPU::acquire(
     ctx.stream()
   );
   
-  
-  
-  
   cudaCheck(cudaGetLastError());
   
   
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
index a18d2c0ea7e4c..f1b68836b2101 100644
--- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
@@ -152,7 +152,7 @@
 
     
     
-process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi")
+#process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
     
@@ -161,8 +161,8 @@
 process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi")
     
-process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi")
-process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone()
+#process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi")
+#process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone()
  
  
 #
@@ -273,7 +273,7 @@
 #   gpu
     *process.ecalUncalibRecHitProducerGPU
     *process.ecalCPUUncalibRecHitProducer
-    *process.ecalRecHitProducerGPU
+    #*process.ecalRecHitProducerGPU
     #*process.ecalCPURecHitProducer
 )
 

From ba982009f43a9036425ea6f1b2ef55f1552a64d2 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Tue, 21 Apr 2020 15:28:00 +0200
Subject: [PATCH 12/30] last file missing

---
 .../test/ecalRawDecodingAndMultifit.py        | 201 ++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py

diff --git a/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py
new file mode 100644
index 0000000000000..4886238cc620f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py
@@ -0,0 +1,201 @@
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.StandardSequences.Eras import eras
+#from Configuration.ProcessModifiers.gpu_cff import gpu
+
+process = cms.Process('RECO', eras.Run2_2018)
+
+# import of standard configurations
+process.load('Configuration.StandardSequences.Services_cff')
+#process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi')
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+#process.load('Configuration.EventContent.EventContent_cff')
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff')
+#process.load('Configuration.StandardSequences.RawToDigi_Data_cff')
+#process.load('Configuration.StandardSequences.Reconstruction_Data_cff')
+#process.load('DQMOffline.Configuration.DQMOffline_cff')
+process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
+
+# Other statements
+from Configuration.AlCa.GlobalTag import GlobalTag
+process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '')
+
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(100)
+)
+
+# load data using the DAQ source
+import sys, os, inspect
+sys.path.append(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))))
+process.load('sourceFromRawCmggpu_cff')
+
+#-----------------------------------------
+# CMSSW non-DQM module imports (geometry, HCAL local reco, HCAL/ECAL unpackers, bunch spacing)
+#-----------------------------------------
+process.load('Configuration.StandardSequences.GeometryRecoDB_cff')
+process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff")
+#process.load("RecoLocalCalo.Configuration.ecalLocalRecoSequence_cff")
+process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi")
+process.load("EventFilter.EcalRawToDigi.EcalUnpackerData_cfi")
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi")
+
+# load both cpu and gpu plugins
+process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi")
+
+# for validation of gpu multifit products
+process.load("RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi")
+
+process.load("EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi")
+process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi")
+
+#process.ecalUncalibRecHitProducerGPU.kernelsVersion = 0
+#process.ecalUncalibRecHitProducerGPU.kernelMinimizeThreads = cms.vuint32(16, 1, 1)
+
+process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi")
+
+#process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
+#process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
+
+
+#process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1)
+
+
+##
+## force HLT configuration for ecalMultiFitUncalibRecHit
+##
+
+process.ecalMultiFitUncalibRecHit.algoPSet = cms.PSet(
+    ebSpikeThreshold = cms.double(1.042),
+    EBtimeFitLimits_Upper = cms.double(1.4),
+    EEtimeFitLimits_Lower = cms.double(0.2),
+    timealgo = cms.string("None"),
+    EBtimeNconst = cms.double(28.5),
+    prefitMaxChiSqEE = cms.double(10.0),
+    outOfTimeThresholdGain12mEB = cms.double(5.0),
+    outOfTimeThresholdGain12mEE = cms.double(1000.0),
+    EEtimeFitParameters = cms.vdouble(-2.390548, 3.553628, -17.62341, 67.67538, -133.213, 140.7432, -75.41106, 16.20277),
+    prefitMaxChiSqEB = cms.double(25.0),
+    simplifiedNoiseModelForGainSwitch = cms.bool(True),
+    EBtimeFitParameters = cms.vdouble(-2.015452, 3.130702, -12.3473, 41.88921, -82.83944, 91.01147, -50.35761, 11.05621),
+    selectiveBadSampleCriteriaEB = cms.bool(False),
+    dynamicPedestalsEB = cms.bool(False),
+    useLumiInfoRunHeader = cms.bool(False),
+    EBamplitudeFitParameters = cms.vdouble(1.138, 1.652),
+    doPrefitEE = cms.bool(False),
+    dynamicPedestalsEE = cms.bool(False),
+    selectiveBadSampleCriteriaEE = cms.bool(False),
+    outOfTimeThresholdGain61pEE = cms.double(1000.0),
+    outOfTimeThresholdGain61pEB = cms.double(5.0),
+    activeBXs = cms.vint32(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4),
+    EcalPulseShapeParameters = cms.PSet(
+        EEPulseShapeTemplate = cms.vdouble(0.116442, 0.756246, 1.0, 0.897182, 0.686831, 0.491506, 0.344111, 0.245731, 0.174115, 0.123361, 0.0874288, 0.061957),
+        EEdigiCollection = cms.string(""),
+        EcalPreMixStage2 = cms.bool(False),
+        EcalPreMixStage1 = cms.bool(False),
+        EBPulseShapeCovariance = cms.vdouble(3.001E-6, 1.233E-5, 0.0, -4.416E-6, -4.571E-6, -3.614E-6, -2.636E-6, -1.286E-6, -8.41E-7, -5.296E-7, 0.0, 0.0, 1.233E-5, 6.154E-5, 0.0, -2.2E-5, -2.309E-5, -1.838E-5, -1.373E-5, -7.334E-6, -5.088E-6, -3.745E-6, -2.428E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.416E-6, -2.2E-5, 0.0, 8.319E-6, 8.545E-6, 6.792E-6, 5.059E-6, 2.678E-6, 1.816E-6, 1.223E-6, 8.245E-7, 5.589E-7, -4.571E-6, -2.309E-5, 0.0, 8.545E-6, 9.182E-6, 7.219E-6, 5.388E-6, 2.853E-6, 1.944E-6, 1.324E-6, 9.083E-7, 6.335E-7, -3.614E-6, -1.838E-5, 0.0, 6.792E-6, 7.219E-6, 6.016E-6, 4.437E-6, 2.385E-6, 1.636E-6, 1.118E-6, 7.754E-7, 5.556E-7, -2.636E-6, -1.373E-5, 0.0, 5.059E-6, 5.388E-6, 4.437E-6, 3.602E-6, 1.917E-6, 1.322E-6, 9.079E-7, 6.529E-7, 4.752E-7, -1.286E-6, -7.334E-6, 0.0, 2.678E-6, 2.853E-6, 2.385E-6, 1.917E-6, 1.375E-6, 9.1E-7, 6.455E-7, 4.693E-7, 3.657E-7, -8.41E-7, -5.088E-6, 0.0, 1.816E-6, 1.944E-6, 1.636E-6, 1.322E-6, 9.1E-7, 9.115E-7, 6.062E-7, 4.436E-7, 3.422E-7, -5.296E-7, -3.745E-6, 0.0, 1.223E-6, 1.324E-6, 1.118E-6, 9.079E-7, 6.455E-7, 6.062E-7, 7.217E-7, 4.862E-7, 3.768E-7, 0.0, -2.428E-6, 0.0, 8.245E-7, 9.083E-7, 7.754E-7, 6.529E-7, 4.693E-7, 4.436E-7, 4.862E-7, 6.509E-7, 4.418E-7, 0.0, 0.0, 0.0, 5.589E-7, 6.335E-7, 5.556E-7, 4.752E-7, 3.657E-7, 3.422E-7, 3.768E-7, 4.418E-7, 6.142E-7),
+        ESdigiCollection = cms.string(""),
+        EBdigiCollection = cms.string(""),
+        EBCorrNoiseMatrixG01 = cms.vdouble(1.0, 0.73354, 0.64442, 0.58851, 0.55425, 0.53082, 0.51916, 0.51097, 0.50732, 0.50409),
+        EBCorrNoiseMatrixG12 = cms.vdouble(1.0, 0.71073, 0.55721, 0.46089, 0.40449, 0.35931, 0.33924, 0.32439, 0.31581, 0.30481),
+        EBCorrNoiseMatrixG06 = cms.vdouble(1.0, 0.70946, 0.58021, 0.49846, 0.45006, 0.41366, 0.39699, 0.38478, 0.37847, 0.37055),
+        EEPulseShapeCovariance = cms.vdouble(3.941E-5, 3.333E-5, 0.0, -1.449E-5, -1.661E-5, -1.424E-5, -1.183E-5, -6.842E-6, -4.915E-6, -3.411E-6, 0.0, 0.0, 3.333E-5, 2.862E-5, 0.0, -1.244E-5, -1.431E-5, -1.233E-5, -1.032E-5, -5.883E-6, -4.154E-6, -2.902E-6, -2.128E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.449E-5, -1.244E-5, 0.0, 5.84E-6, 6.649E-6, 5.72E-6, 4.812E-6, 2.708E-6, 1.869E-6, 1.33E-6, 9.186E-7, 6.446E-7, -1.661E-5, -1.431E-5, 0.0, 6.649E-6, 7.966E-6, 6.898E-6, 5.794E-6, 3.157E-6, 2.184E-6, 1.567E-6, 1.084E-6, 7.575E-7, -1.424E-5, -1.233E-5, 0.0, 5.72E-6, 6.898E-6, 6.341E-6, 5.347E-6, 2.859E-6, 1.991E-6, 1.431E-6, 9.839E-7, 6.886E-7, -1.183E-5, -1.032E-5, 0.0, 4.812E-6, 5.794E-6, 5.347E-6, 4.854E-6, 2.628E-6, 1.809E-6, 1.289E-6, 9.02E-7, 6.146E-7, -6.842E-6, -5.883E-6, 0.0, 2.708E-6, 3.157E-6, 2.859E-6, 2.628E-6, 1.863E-6, 1.296E-6, 8.882E-7, 6.108E-7, 4.283E-7, -4.915E-6, -4.154E-6, 0.0, 1.869E-6, 2.184E-6, 1.991E-6, 1.809E-6, 1.296E-6, 1.217E-6, 8.669E-7, 5.751E-7, 3.882E-7, -3.411E-6, -2.902E-6, 0.0, 1.33E-6, 1.567E-6, 1.431E-6, 1.289E-6, 8.882E-7, 8.669E-7, 9.522E-7, 6.717E-7, 4.293E-7, 0.0, -2.128E-6, 0.0, 9.186E-7, 1.084E-6, 9.839E-7, 9.02E-7, 6.108E-7, 5.751E-7, 6.717E-7, 7.911E-7, 5.493E-7, 0.0, 0.0, 0.0, 6.446E-7, 7.575E-7, 6.886E-7, 6.146E-7, 4.283E-7, 3.882E-7, 4.293E-7, 5.493E-7, 7.027E-7),
+        EBPulseShapeTemplate = cms.vdouble(0.0113979, 0.758151, 1.0, 0.887744, 0.673548, 0.474332, 0.319561, 0.215144, 0.147464, 0.101087, 0.0693181, 0.0475044),
+        EECorrNoiseMatrixG01 = cms.vdouble(1.0, 0.72698, 0.62048, 0.55691, 0.51848, 0.49147, 0.47813, 0.47007, 0.46621, 0.46265),
+        EECorrNoiseMatrixG12 = cms.vdouble(1.0, 0.71373, 0.44825, 0.30152, 0.21609, 0.14786, 0.11772, 0.10165, 0.09465, 0.08098),
+        UseLCcorrection = cms.untracked.bool(True),
+        EECorrNoiseMatrixG06 = cms.vdouble(1.0, 0.71217, 0.47464, 0.34056, 0.26282, 0.20287, 0.17734, 0.16256, 0.15618, 0.14443)
+    ),
+    doPrefitEB = cms.bool(False),
+    addPedestalUncertaintyEE = cms.double(0.0),
+    addPedestalUncertaintyEB = cms.double(0.0),
+    gainSwitchUseMaxSampleEB = cms.bool(True),
+    EEtimeNconst = cms.double(31.8),
+    EEamplitudeFitParameters = cms.vdouble(1.89, 1.4),
+    chi2ThreshEE_ = cms.double(50.0),
+    eePulseShape = cms.vdouble(5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194),
+    outOfTimeThresholdGain12pEB = cms.double(5.0),
+    gainSwitchUseMaxSampleEE = cms.bool(False),
+    mitigateBadSamplesEB = cms.bool(False),
+    outOfTimeThresholdGain12pEE = cms.double(1000.0),
+    ebPulseShape = cms.vdouble(5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194),
+    ampErrorCalculation = cms.bool(False),
+    mitigateBadSamplesEE = cms.bool(False),
+    amplitudeThresholdEB = cms.double(10.0),
+    kPoorRecoFlagEB = cms.bool(True),
+    amplitudeThresholdEE = cms.double(10.0),
+    EBtimeFitLimits_Lower = cms.double(0.2),
+    kPoorRecoFlagEE = cms.bool(False),
+    EEtimeFitLimits_Upper = cms.double(1.4),
+    outOfTimeThresholdGain61mEE = cms.double(1000.0),
+    EEtimeConstantTerm = cms.double(1.0),
+    EBtimeConstantTerm = cms.double(0.6),
+    chi2ThreshEB_ = cms.double(65.0),
+    outOfTimeThresholdGain61mEB = cms.double(5.0)
+)
+
+
+#process.hcalDigis.silent = cms.untracked.bool(False)
+#process.hcalDigis.InputLabel = rawTag
+process.ecalDigis = process.ecalEBunpacker.clone()  # start from a copy of the ecalEBunpacker module
+process.ecalDigis.InputLabel = cms.InputTag('rawDataCollector')  # take the raw data from 'rawDataCollector'
+#process.hbheprerecogpu.processQIE11 = cms.bool(True)
+
+process.out = cms.OutputModule(  # output module writing to test.root
+    "PoolOutputModule",
+    fileName = cms.untracked.string("test.root")
+)
+
+#process.out = cms.OutputModule("AsciiOutputModule",
+#    outputCommands = cms.untracked.vstring(
+#        'keep *_ecalMultiFitUncalibRecHit_*_*', 
+#    ),
+#    verbosity = cms.untracked.uint32(0)
+#)
+process.finalize = cms.EndPath(process.out)  # end path holding the output module
+
+process.bunchSpacing = cms.Path(
+    process.bunchSpacingProducer
+)
+
+process.digiPath = cms.Path(  # ecalDigis, then ecalRawToDigiGPU, then ecalCPUDigisProducer
+    #process.hcalDigis
+    process.ecalDigis
+    *process.ecalRawToDigiGPU
+    *process.ecalCPUDigisProducer
+)
+
+process.recoPath = cms.Path(  # ecalMultiFitUncalibRecHit, then the GPU producer, then ecalCPUUncalibRecHitProducer
+    process.ecalMultiFitUncalibRecHit
+#    process.ecalMultiFitUncalibRecHitgpu
+    *process.ecalUncalibRecHitProducerGPU
+    *process.ecalCPUUncalibRecHitProducer
+)
+
+process.schedule = cms.Schedule(  # listed order: bunchSpacing, digiPath, recoPath, finalize
+    process.bunchSpacing,
+    process.digiPath,
+    process.recoPath,
+#    process.ecalecalLocalRecoSequence
+    process.finalize
+)
+
+process.options = cms.untracked.PSet(
+    numberOfThreads = cms.untracked.uint32(4),
+    numberOfStreams = cms.untracked.uint32(4),
+    SkipEvent = cms.untracked.vstring('ProductNotFound'),  # NOTE(review): presumably hides missing-product errors; confirm this is wanted for validation
+    wantSummary = cms.untracked.bool(True)
+)
+
+# add "CUDAService" to the MessageLogger categories so its messages are reported
+process.MessageLogger.categories.append("CUDAService")

From e1268b1ce316e60565be45b3b35638c6cce05410 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Wed, 13 May 2020 16:35:01 +0200
Subject: [PATCH 13/30] update to make it work

---
 .../interface/EcalChannelStatusCode.h         |  19 +-
 EventFilter/EcalRawToDigi/src/UnpackGPU.cu    |   3 +-
 RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml  |  10 +
 ...eEcalMultifitResultsGpuValidationPlots.cpp | 508 +++++++++++++-----
 .../EcalRecAlgos/interface/DeclsForKernels.h  |  10 +-
 .../interface/EcalADCToGeVConstantGPU.h       |  43 --
 .../interface/EcalChannelStatusGPU.h          |  43 --
 .../src/AmplitudeComputationKernels.cu        |   2 +-
 .../src/EcalADCToGeVConstantGPU.cc            |  39 --
 .../EcalRecAlgos/src/EcalChannelStatusGPU.cc  |  49 --
 .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu  |  16 +-
 .../plugins/EcalESProducersGPUDefs.cc         |  16 +-
 .../plugins/EcalRecHitProducerGPU.cc          |  12 +-
 .../test/ecalRawDecodingAndMultifit.py        |  12 +
 .../test/testEcalRechitProducer_cfg.py        |  22 +-
 15 files changed, 461 insertions(+), 343 deletions(-)
 delete mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h
 delete mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h
 delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc
 delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc

diff --git a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h
index 09202950bfc68..a52868fe0d8df 100644
--- a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h
+++ b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h
@@ -5,6 +5,7 @@
  * Created: 14 Nov 2006
  **/
 
+
 #include "CondFormats/Serialization/interface/Serializable.h"
 
 #include <iostream>
@@ -16,7 +17,10 @@
  */
 
 class EcalChannelStatusCode {
+
+  
 public:
+  
   enum Code {
     kOk = 0,
     kDAC,
@@ -35,12 +39,22 @@ class EcalChannelStatusCode {
     kNoDataNoTP
   };
 
-  enum Bits { kHV = 0, kLV, kDAQ, kTP, kTrigger, kTemperature, kNextToDead };
-
+  enum Bits {
+    kHV=0,
+    kLV,
+    kDAQ,
+    kTP,
+    kTrigger,
+    kTemperature,
+    kNextToDead
+  };
+  
 public:
+  
   EcalChannelStatusCode() : status_(0) {}
   EcalChannelStatusCode(const uint16_t& encodedStatus) : status_(encodedStatus){};
 
+  
   void print(std::ostream& s) const { s << "status is: " << status_; }
 
   /// return decoded status
@@ -55,6 +69,7 @@ class EcalChannelStatusCode {
   static const int chStatusMask = 0x1F;
 
 private:
+  
   static const int kBitsOffset = 5;
   /* bits 1-5 store a status code:
        	0 	channel ok 
diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
index d8ffbec039b7c..a4742f85ef6ca 100644
--- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
+++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu
@@ -307,8 +307,7 @@ namespace ecal {
       cudaCheck(cudaMemcpyAsync(
           inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
 
-//       kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(inputGPU.data,
-      kernel_unpack_test<16><<<nfedsWithData, 16, 0, cudaStream>>>(inputGPU.data,
+      kernel_unpack_test<32><<<nfedsWithData, 32, 0, cudaStream>>>(inputGPU.data,
                                                                    inputGPU.offsets,
                                                                    inputGPU.feds,
                                                                    outputGPU.samplesEB,
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
index bf61d052856ad..4c98171091b84 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml
@@ -5,3 +5,13 @@
     <use name="DataFormats/Common"/>
     <use name="DataFormats/EcalRecHit"/>
 </bin>
+
+<bin name="makeEcalRechitValidationPlots" file="makeEcalRechitValidationPlots.cpp">
+  <use name="root"/>
+  <use name="rootgraphics"/>
+  <use name="CUDADataFormats/EcalRecHitSoA"/>
+  <use name="DataFormats/Common"/>
+  <use name="DataFormats/EcalRecHit"/>
+</bin>
+
+
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
index 4d50b758d39f3..04ba175eebb1e 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
@@ -15,181 +15,290 @@
 #include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 
+#include "TStyle.h"
+
+
+void setAxis(TH2D* histo) {  // axis titles for a GPU-vs-CPU 2D histogram: x = "cpu", y = "gpu"
+  histo->GetXaxis()->SetTitle("cpu");
+  histo->GetYaxis()->SetTitle("gpu");
+}
+
+
+void setAxisDelta(TH2D* histo) {  // axis titles for a delta-vs-CPU 2D histogram: x = "cpu", y = "#Delta gpu-cpu"
+  histo->GetXaxis()->SetTitle("cpu");
+  histo->GetYaxis()->SetTitle("#Delta gpu-cpu");
+}
+
 int main(int argc, char *argv[]) {
-  if (argc < 3) {
+  if (argc<3) {
     std::cout << "run with: ./validateGPU <path to input file> <output file>\n";
     exit(0);
   }
-
-  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB = nullptr;
-  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE = nullptr;
+  
+  gStyle->SetOptStat("ourme");
+  
+  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB=nullptr;
+  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE=nullptr;
   edm::Wrapper<EBUncalibratedRecHitCollection> *wcpuEB = nullptr;
   edm::Wrapper<EEUncalibratedRecHitCollection> *wcpuEE = nullptr;
-
+  
   std::string fileName = argv[1];
   std::string outFileName = argv[2];
-
+  
   // output
   TFile rfout{outFileName.c_str(), "recreate"};
-
+  
+  int nbins_count = 200;
+  float last_count = 5000.;
+  int nbins_count_delta = 201;
+  
   int nbins = 300;
   float last = 3000.;
-
+  
+  //     int nbins_chi2 = 1000;
+  //     float last_chi2 = 1000.;
   int nbins_chi2 = 1000;
-  float last_chi2 = 1000.;
-
+  float last_chi2 = 200.;
+  
+  int nbins_flags = 100;
+  float last_flags = 100.;
+  float delta_flags = 20;
+  
   int nbins_delta = 201;  // use an odd number to center around 0
   float delta = 0.2;
-
+  
+  
+  // RecHits plots for EB and EE on both GPU and CPU
+  auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins_count, 0, last_count);
+  auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1);
+  auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1);
+  
   auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last);
   auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
   auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
   auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
-  auto hSOIAmplitudesEBGPUCPUratio =
-      new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
-  auto hSOIAmplitudesEEGPUCPUratio =
-      new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
-
+  auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  
   auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
   auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
   auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2);
   auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2);
-
-  auto hSOIAmplitudesEBGPUvsCPU =
-      new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
-  auto hSOIAmplitudesEEGPUvsCPU =
-      new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
-  auto hSOIAmplitudesEBdeltavsCPU =
-      new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  auto hSOIAmplitudesEEdeltavsCPU =
-      new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-
-  auto hChi2EBGPUvsCPU =
-      new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
-  auto hChi2EEGPUvsCPU =
-      new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
-  auto hChi2EBdeltavsCPU =
-      new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
-  auto hChi2EEdeltavsCPU =
-      new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
-
+  auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  
+  auto hFlagsEBGPU = new TH1D("hFlagsEBGPU", "hFlagsEBGPU", nbins_flags, 0, last_flags);
+  auto hFlagsEEGPU = new TH1D("hFlagsEEGPU", "hFlagsEEGPU", nbins_flags, 0, last_flags);
+  auto hFlagsEBCPU = new TH1D("hFlagsEBCPU", "hFlagsEBCPU", nbins_flags, 0, last_flags);
+  auto hFlagsEECPU = new TH1D("hFlagsEECPU", "hFlagsEECPU", nbins_flags, 0, last_flags);
+  auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  
+  auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);                       setAxis(hSOIAmplitudesEBGPUvsCPU  ) ;
+  auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);                       setAxis(hSOIAmplitudesEEGPUvsCPU  ) ;
+  auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);     setAxisDelta(hSOIAmplitudesEBdeltavsCPU) ;
+  auto hSOIAmplitudesEEdeltavsCPU = new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);     setAxisDelta(hSOIAmplitudesEEdeltavsCPU) ;
+  
+  auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);                      setAxis(hChi2EBGPUvsCPU  ) ;
+  auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);                      setAxis(hChi2EEGPUvsCPU  ) ;
+  auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);              setAxisDelta(hChi2EBdeltavsCPU) ;
+  auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);              setAxisDelta(hChi2EEdeltavsCPU) ;
+  
+  auto hFlagsEBGPUvsCPU = new TH2D("hFlagsEBGPUvsCPU", "hFlagsEBGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);                      setAxis(hFlagsEBGPUvsCPU  ) ;
+  auto hFlagsEEGPUvsCPU = new TH2D("hFlagsEEGPUvsCPU", "hFlagsEEGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);                      setAxis(hFlagsEEGPUvsCPU  ) ;
+  auto hFlagsEBdeltavsCPU = new TH2D("hFlagsEBdeltavsCPU", "hFlagsEBdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);    setAxisDelta(hFlagsEBdeltavsCPU) ;
+  auto hFlagsEEdeltavsCPU = new TH2D("hFlagsEEdeltavsCPU", "hFlagsEEdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);    setAxisDelta(hFlagsEEdeltavsCPU) ;
+  
+  auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", nbins_count, 0, last_count, nbins_count, 0, last_count);       setAxis(hRechitsEBGPUvsCPU  ) ;  // bin counts are nbins_count; last_count is only the axis upper edge
+  auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", nbins_count, 0, last_count, nbins_count, 0, last_count);       setAxis(hRechitsEEGPUvsCPU  ) ;  // same binning as the EB histogram
+  auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);   setAxisDelta(hRechitsEBdeltavsCPU) ;
+  auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);   setAxisDelta(hRechitsEEdeltavsCPU) ;
+  
+  
   // input
   std::cout << "validating file " << fileName << std::endl;
   TFile rf{fileName.c_str()};
-  TTree *rt = (TTree *)rf.Get("Events");
-  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.",
-                       &wgpuEB);
-  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.",
-                       &wgpuEE);
+  TTree *rt = (TTree*)rf.Get("Events");
+  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB);
+  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE);
   rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB);
   rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE);
-
+  
   constexpr float eps_diff = 1e-3;
-
+  
   // accumulate
   auto const nentries = rt->GetEntries();
   std::cout << "#events to validate over: " << nentries << std::endl;
-  for (int ie = 0; ie < nentries; ++ie) {
+  for (int ie=0; ie<nentries; ++ie) {
     rt->GetEntry(ie);
-
-    const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"};
+    
+    const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" };
     auto cpu_eb_size = wcpuEB->bareProduct().size();
     auto cpu_ee_size = wcpuEE->bareProduct().size();
     auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size();
     auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size();
+    
+    float eb_ratio = cpu_eb_size>0 ? (float) gpu_eb_size/cpu_eb_size : 0.f;  // GPU/CPU size ratio; 0 placeholder when the CPU collection is empty
+    float ee_ratio = cpu_ee_size>0 ? (float) gpu_ee_size/cpu_ee_size : 0.f;
+    // The sizes come from container size() calls (presumably unsigned), so differences are taken in double to keep their sign.
+    // Fill the per-event rechit multiplicity histograms for EB and EE, on both GPU and CPU
+    hRechitsEBGPU->Fill(gpu_eb_size);
+    hRechitsEBCPU->Fill(cpu_eb_size);
+    hRechitsEEGPU->Fill(gpu_ee_size);
+    hRechitsEECPU->Fill(cpu_ee_size);
+    hRechitsEBGPUvsCPU->Fill(cpu_eb_size, gpu_eb_size);
+    hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size);
+    if (cpu_eb_size>0) hRechitsEBGPUCPUratio->Fill(eb_ratio);  // skip the ratio when there is nothing to divide by
+    if (cpu_ee_size>0) hRechitsEEGPUCPUratio->Fill(ee_ratio);
+    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, static_cast<double>(gpu_eb_size)-static_cast<double>(cpu_eb_size));
+    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, static_cast<double>(gpu_ee_size)-static_cast<double>(cpu_ee_size));
+    
+    
     if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
       std::cerr << ie << ordinal[ie % 10] << " entry:\n"
-                << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size
-                << " (gpu)\n"
-                << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size
-                << " (gpu)" << std::endl;
+      << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n"
+      << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl;
       continue;
     }
-
+    
     assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size());
     assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size());
     auto const neb = wcpuEB->bareProduct().size();
     auto const nee = wcpuEE->bareProduct().size();
-
-    for (uint32_t i = 0; i < neb; ++i) {
+    
+    
+    for (uint32_t i=0; i<neb; ++i) {
       auto const did_gpu = wgpuEB->bareProduct().did[i];
       auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i];
       auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu});
       if (cpu_iter == wcpuEB->bareProduct().end()) {
         std::cerr << ie << ordinal[ie % 10] << " entry\n"
-                  << "  Did not find a DetId " << did_gpu << " in a CPU collection\n";
+        << "  Did not find a DetId " << did_gpu
+        << " in a CPU collection\n";
         continue;
       }
       auto const soi_amp_cpu = cpu_iter->amplitude();
       auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
       auto const chi2_cpu = cpu_iter->chi2();
-
+      
+      auto const flags_gpu = wgpuEB->bareProduct().flags[i];
+      auto const flags_cpu = cpu_iter->flags();
+      
       hSOIAmplitudesEBGPU->Fill(soi_amp_gpu);
       hSOIAmplitudesEBCPU->Fill(soi_amp_cpu);
       hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-      hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
-      hSOIAmplitudesEBGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu);
+      hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
+      if (soi_amp_cpu>0) hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
+      
       hChi2EBGPU->Fill(chi2_gpu);
       hChi2EBCPU->Fill(chi2_cpu);
       hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-      hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
-
-      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
-          std::isnan(chi2_gpu)) {
+      hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+      if (chi2_cpu>0) hChi2EBGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu);
+      
+      if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) {
+        std::cout << " ---- EB  " << std::endl;
+        std::cout << " eventid = " << ie << " xtal = " << i << std::endl;
+        std::cout << " chi2_gpu    = " << chi2_gpu    << " chi2_cpu =    " << chi2_cpu << std::endl;
+        std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl;
+        std::cout << " flags_gpu   = " << flags_gpu   << " flags_cpu =   " << flags_cpu << std::endl;
+      } 
+      
+      hFlagsEBGPU->Fill(flags_gpu);
+      hFlagsEBCPU->Fill(flags_cpu);
+      hFlagsEBGPUvsCPU->Fill(flags_cpu, flags_gpu);
+      hFlagsEBdeltavsCPU->Fill(flags_cpu, static_cast<double>(flags_gpu)-static_cast<double>(flags_cpu));  // subtract in double: an unsigned difference would wrap when gpu < cpu
+      if (flags_cpu>0) hFlagsEBGPUCPUratio->Fill( (float) flags_gpu/flags_cpu);
+      
+      if (flags_cpu!=flags_gpu) {
+        std::cout << "    >>  No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu;
+        std::cout << std::endl;
+      }
+      
+      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
+        (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)
+        or (flags_cpu!=flags_gpu) )
+      {
         printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-               ie,
-               i,
-               soi_amp_gpu,
-               soi_amp_cpu,
-               chi2_gpu,
-               chi2_cpu);
+               ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
         if (std::isnan(chi2_gpu))
           printf("*** nan ***\n");
       }
     }
-
-    for (uint32_t i = 0; i < nee; ++i) {
+    
+    for (uint32_t i=0; i<nee; ++i) {
       auto const did_gpu = wgpuEE->bareProduct().did[i];
       auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i];
       auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu});
       if (cpu_iter == wcpuEE->bareProduct().end()) {
         std::cerr << ie << ordinal[ie % 10] << " entry\n"
-                  << "  did not find a DetId " << did_gpu << " in a CPU collection\n";
+        << "  did not find a DetId " << did_gpu
+        << " in a CPU collection\n";
         continue;
       }
       auto const soi_amp_cpu = cpu_iter->amplitude();
       auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
       auto const chi2_cpu = cpu_iter->chi2();
-
+      
+      auto const flags_gpu = wgpuEE->bareProduct().flags[i];
+      auto const flags_cpu = cpu_iter->flags();
+      
+      
       hSOIAmplitudesEEGPU->Fill(soi_amp_gpu);
       hSOIAmplitudesEECPU->Fill(soi_amp_cpu);
       hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-      hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
-      hSOIAmplitudesEEGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu);
+      hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
+      if (soi_amp_cpu>0) hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
+      
       hChi2EEGPU->Fill(chi2_gpu);
       hChi2EECPU->Fill(chi2_cpu);
       hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-      hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
-
-      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
-          std::isnan(chi2_gpu)) {
+      hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+      if (chi2_cpu>0) hChi2EEGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu);
+      
+      if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) {
+        std::cout << " ---- EE  " << std::endl;
+        std::cout << " eventid = " << ie << " xtal = " << i << std::endl;
+        std::cout << " chi2_gpu    = " << chi2_gpu    << " chi2_cpu =    " << chi2_cpu << std::endl;
+        std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl;
+        std::cout << " flags_gpu   = " << flags_gpu   << " flags_cpu =   " << flags_cpu << std::endl;
+      } 
+      
+      hFlagsEEGPU->Fill(flags_gpu);
+      hFlagsEECPU->Fill(flags_cpu);
+      hFlagsEEGPUvsCPU->Fill(flags_cpu, flags_gpu);
+      hFlagsEEdeltavsCPU->Fill(flags_cpu, static_cast<double>(flags_gpu)-static_cast<double>(flags_cpu));  // subtract in double: an unsigned difference would wrap when gpu < cpu
+      if (flags_cpu>0) hFlagsEEGPUCPUratio->Fill( (float) flags_gpu/flags_cpu);
+      
+      if (flags_cpu!=flags_gpu) {
+        std::cout << "    >>  No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu;
+        std::cout << std::endl;
+      }
+      
+      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
+        (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)
+        or (flags_cpu!=flags_gpu) )
+      {
         printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-               ie,
-               static_cast<int>(neb + i),
-               soi_amp_gpu,
-               soi_amp_cpu,
-               chi2_gpu,
-               chi2_cpu);
+               ie, static_cast<int>(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
         if (std::isnan(chi2_gpu))
           printf("*** nan ***\n");
       }
     }
   }
-
+  
   {
-    TCanvas c("plots", "plots", 4200, 6200);
-    c.Divide(2, 4);
-
+    
+    
+    //       TCanvas c("plots", "plots", 4200, 6200);
+    TCanvas c("plots", "plots", 1750, 860);
+    //       c.Divide(2, 3);
+    c.Divide(3, 2);
+    
+    //       c.cd(1);
     c.cd(1);
     {
       gPad->SetLogy();
@@ -200,13 +309,14 @@ int main(int argc, char *argv[]) {
       hSOIAmplitudesEBGPU->SetLineWidth(1.);
       hSOIAmplitudesEBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats *)hSOIAmplitudesEBGPU->FindObject("stats");
+      auto stats = (TPaveStats*)hSOIAmplitudesEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2 - y1));
+      stats->SetY1NDC(y1 - (y2-y1));
     }
-    c.cd(2);
+    //       c.cd(2);
+    c.cd(4);
     {
       gPad->SetLogy();
       hSOIAmplitudesEECPU->SetLineColor(kBlack);
@@ -216,41 +326,35 @@ int main(int argc, char *argv[]) {
       hSOIAmplitudesEEGPU->SetLineWidth(1.);
       hSOIAmplitudesEEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats *)hSOIAmplitudesEEGPU->FindObject("stats");
+      auto stats = (TPaveStats*)hSOIAmplitudesEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2 - y1));
+      stats->SetY1NDC(y1 - (y2-y1));
     }
-    c.cd(3);
+    //       c.cd(3);
+    c.cd(2);
+    gPad->SetGrid();
     hSOIAmplitudesEBGPUvsCPU->Draw("COLZ");
-    c.cd(4);
-    hSOIAmplitudesEEGPUvsCPU->Draw("COLZ");
+    //       c.cd(4);
     c.cd(5);
-    hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
+    gPad->SetGrid();
+    hSOIAmplitudesEEGPUvsCPU->Draw("COLZ");
+    //       c.cd(5);
+    c.cd(3);
+    //       hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
+    hSOIAmplitudesEBGPUCPUratio->Draw("");
+    //       c.cd(6);
     c.cd(6);
-    hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
-    c.cd(7);
-    {
-      gPad->SetLogy();
-      hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack);
-      hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.);
-      hSOIAmplitudesEBGPUCPUratio->Draw("");
-    }
-    c.cd(8);
-    {
-      gPad->SetLogy();
-      hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack);
-      hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.);
-      hSOIAmplitudesEEGPUCPUratio->Draw("");
-    }
-
-    c.SaveAs("ecal-amplitudes.pdf");
-  }
-  {
-    TCanvas c("plots", "plots", 4200, 6200);
-    c.Divide(2, 3);
-
+    //       hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
+    hSOIAmplitudesEEGPUCPUratio->Draw("");
+    
+    c.SaveAs("ecal-amplitudes.root");
+    c.SaveAs("ecal-amplitudes.png");
+    
+    // chi2
+    
+    //       c.cd(1);
     c.cd(1);
     {
       gPad->SetLogy();
@@ -261,13 +365,14 @@ int main(int argc, char *argv[]) {
       hChi2EBGPU->SetLineWidth(1.);
       hChi2EBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats");
+      auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2 - y1));
+      stats->SetY1NDC(y1 - (y2-y1));
     }
-    c.cd(2);
+    //       c.cd(2);
+    c.cd(4);
     {
       gPad->SetLogy();
       hChi2EECPU->SetLineColor(kBlack);
@@ -277,27 +382,170 @@ int main(int argc, char *argv[]) {
       hChi2EEGPU->SetLineWidth(1.);
       hChi2EEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats");
+      auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2 - y1));
+      stats->SetY1NDC(y1 - (y2-y1));
     }
-    c.cd(3);
+    //       c.cd(3);
+    c.cd(2);
+    gPad->SetGrid();
     hChi2EBGPUvsCPU->Draw("COLZ");
-    c.cd(4);
+    //       c.cd(4);
+    c.cd(5);
+    gPad->SetGrid();
     hChi2EEGPUvsCPU->Draw("COLZ");
+    //       c.cd(5);
+    c.cd(3);
+    //       hChi2EBdeltavsCPU->Draw("COLZ");
+    hChi2EBGPUCPUratio->Draw("");
+    //       c.cd(6);
+    c.cd(6);
+    //       hChi2EEdeltavsCPU->Draw("COLZ");
+    hChi2EEGPUCPUratio->Draw("");
+    
+    c.SaveAs("ecal-chi2.root");
+    c.SaveAs("ecal-chi2.png");
+    
+    
+    
+    // flags
+    
+    //       c.cd(1);
+    c.cd(1);
+    {
+      gPad->SetLogy();
+      hFlagsEBCPU->SetLineColor(kBlack);
+      hFlagsEBCPU->SetLineWidth(1.);
+      hFlagsEBCPU->Draw("");
+      hFlagsEBGPU->SetLineColor(kBlue);
+      hFlagsEBGPU->SetLineWidth(1.);
+      hFlagsEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    //       c.cd(2);
+    c.cd(4);
+    {
+      gPad->SetLogy();
+      hFlagsEECPU->SetLineColor(kBlack);
+      hFlagsEECPU->SetLineWidth(1.);
+      hFlagsEECPU->Draw("");
+      hFlagsEEGPU->SetLineColor(kBlue);
+      hFlagsEEGPU->SetLineWidth(1.);
+      hFlagsEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    //       c.cd(3);
+    c.cd(2);
+    gPad->SetGrid();
+    hFlagsEBGPUvsCPU->Draw("COLZ");
+    //       c.cd(4);
     c.cd(5);
-    hChi2EBdeltavsCPU->Draw("COLZ");
+    gPad->SetGrid();
+    hFlagsEEGPUvsCPU->Draw("COLZ");
+    //       c.cd(5);
+    c.cd(3);
+    //       hFlagsEBdeltavsCPU->Draw("COLZ");
+    hFlagsEBGPUCPUratio->Draw("");
+    
+    //       c.cd(6);
     c.cd(6);
-    hChi2EEdeltavsCPU->Draw("COLZ");
-
-    c.SaveAs("ecal-chi2.pdf");
+    //       hFlagsEEdeltavsCPU->Draw("COLZ");
+    hFlagsEEGPUCPUratio->Draw("");
+    
+    
+    c.SaveAs("ecal-flags.root");
+    c.SaveAs("ecal-flags.png");
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    TCanvas cRechits("Rechits", "Rechits", 1750, 860);
+    cRechits.Divide(3, 2);
+    
+    // Plotting the sizes of GPU vs CPU for each event of EB 
+    cRechits.cd(1);
+    {
+      gPad->SetLogy();
+      hRechitsEBCPU->SetLineColor(kRed);
+      hRechitsEBCPU->SetLineWidth(2);
+      hRechitsEBCPU->Draw("");
+      hRechitsEBGPU->SetLineColor(kBlue);
+      hRechitsEBGPU->SetLineWidth(2);
+      hRechitsEBGPU->Draw("sames");
+      cRechits.Update();
+      auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    } 
+    cRechits.cd(4);
+    {
+      gPad->SetLogy();
+      hRechitsEECPU->SetLineColor(kRed);
+      hRechitsEECPU->SetLineWidth(2);
+      hRechitsEECPU->Draw("");
+      hRechitsEEGPU->SetLineColor(kBlue);
+      hRechitsEEGPU->SetLineWidth(2);
+      hRechitsEEGPU->Draw("sames");
+      cRechits.Update();
+      auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cRechits.cd(2); {
+      hRechitsEBGPUvsCPU->Draw("COLZ");
+    }
+    cRechits.cd(5); {
+      hRechitsEEGPUvsCPU->Draw("COLZ");
+    }
+    cRechits.cd(3); {
+      gPad->SetLogy();
+      //hRechitsEBdeltavsCPU->Draw("COLZ");
+      hRechitsEBGPUCPUratio->Draw("");
+    }
+    cRechits.cd(6); {
+      gPad->SetLogy();
+      //hRechitsEEdeltavsCPU->Draw("COLZ");
+      hRechitsEEGPUCPUratio->Draw("");
+    }
+    cRechits.SaveAs("ecal-rechits.root");
+    cRechits.SaveAs("ecal-rechits.png");
+    
+    
+    
+    
+    
+    
   }
-
+  
   rf.Close();
   rfout.Write();
   rfout.Close();
-
+  
   return 0;
 }
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
index 419e50b3636c6..a35ef1c57a381 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
@@ -283,9 +283,9 @@ struct conf_data {
 #include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
 #include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h"
 
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
 
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
@@ -378,9 +378,9 @@ namespace ecal {
     
     // const refs products to conditions
     struct ConditionsProducts {
-      EcalADCToGeVConstantGPU::Product    const& ADCToGeV      ;
-      EcalIntercalibConstantsGPU::Product const& Intercalib    ;
-      EcalChannelStatusGPU::Product       const& ChannelStatus ;
+      EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV      ;
+      EcalIntercalibConstantsGPU::Product    const& Intercalib    ;
+      EcalRechitChannelStatusGPU::Product    const& ChannelStatus ;
       //     
       EcalLaserAPDPNRatiosGPU::Product     const& LaserAPDPNRatios    ;
       EcalLaserAPDPNRatiosRefGPU::Product  const& LaserAPDPNRatiosRef ;
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h
deleted file mode 100644
index 4f6cb43eddee0..0000000000000
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h
-#define RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h
-
-#include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h"
-
-#ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
-#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
-#endif
-
-class EcalADCToGeVConstantGPU {
-public:
-  struct Product {
-    ~Product();
-    float *adc2gev = nullptr;
-  };
-  
-  #ifndef __CUDACC__
-  
-  // 
-  EcalADCToGeVConstantGPU(EcalADCToGeVConstant const&);
-  
-  // will call dealloation for Product thru ~Product
-  ~EcalADCToGeVConstantGPU() = default;
-  
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
-  
-  // 
-  static std::string name() { return std::string{"ecalADCToGeVConstantGPU"}; }
-  
-private:
-  // in the future, we need to arrange so to avoid this copy on the host
-  // store eb first then ee
-  std::vector<float, CUDAHostAllocator<float>> adc2gev_;
-  
-  cms::cuda::ESProduct<Product> product_;
-  
-  #endif
-};
-
-
-#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h
deleted file mode 100644
index 0932e7f0641d9..0000000000000
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h
-#define RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h
-
-#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
-
-#ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
-#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
-#endif
-
-class EcalChannelStatusGPU {
-public:
-  struct Product {
-    ~Product();
-    uint16_t *status = nullptr;
-  };
-  
-  #ifndef __CUDACC__
-  
-  // 
-  EcalChannelStatusGPU(EcalChannelStatus const&);
-  
-  // will call dealloation for Product thru ~Product
-  ~EcalChannelStatusGPU() = default;
-  
-  // get device pointers
-  Product const& getProduct(cudaStream_t) const;
-  
-  // 
-  static std::string name() { return std::string{"ecalChannelStatusGPU"}; }
-  
-private:
-  // in the future, we need to arrange so to avoid this copy on the host
-  // store eb first then ee
-  std::vector<uint16_t, CUDAHostAllocator<uint16_t>> status_;
-  
-  cms::cuda::ESProduct<Product> product_;
-  
-  #endif
-};
-
-
-#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
index c67677055c189..23d9c12aa0582 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
@@ -392,7 +392,7 @@ namespace ecal {
             50,
             offsetForHashes,
             offsetForInputs);
-//         cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());
       }
 
     }  // namespace v1
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc
deleted file mode 100644
index acddf19fe01c2..0000000000000
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h"
-
-#include "FWCore/Utilities/interface/typelookup.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-EcalADCToGeVConstantGPU::EcalADCToGeVConstantGPU(EcalADCToGeVConstant const& values) 
-: adc2gev_(2)  // size is 2, one form EB and one for EE
-{
-  adc2gev_[0] = values.getEBValue();
-  adc2gev_[1] = values.getEEValue(); 
-}
-
-EcalADCToGeVConstantGPU::Product::~Product() {
-  // deallocation
-  cudaCheck( cudaFree(adc2gev) );
-}
-
-EcalADCToGeVConstantGPU::Product const& EcalADCToGeVConstantGPU::getProduct(
-  cudaStream_t cudaStream) const 
-{
-  auto const& product = product_.dataForCurrentDeviceAsync(
-                   cudaStream,
-                   [this](EcalADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) {
-                         // malloc
-                         cudaCheck( cudaMalloc((void**)&product.adc2gev,
-                                               this->adc2gev_.size() * sizeof(float)) );
-                         // transfer 
-                         cudaCheck( cudaMemcpyAsync(product.adc2gev,
-                                                    this->adc2gev_.data(),
-                                                    this->adc2gev_.size() * sizeof(float),
-                                                    cudaMemcpyHostToDevice,
-                                                    cudaStream) );
-                   }
-  );
-  
-  return product;
-}
-
-TYPELOOKUP_DATA_REG(EcalADCToGeVConstantGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
deleted file mode 100644
index 91293902bb667..0000000000000
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h"
-
-#include "FWCore/Utilities/interface/typelookup.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-EcalChannelStatusGPU::EcalChannelStatusGPU(EcalChannelStatus const& values) 
-: status_(values.size())
-{
-  // fill in eb
-  auto const& barrelValues = values.barrelItems();
-  for (unsigned int i=0; i<barrelValues.size(); i++) {
-    status_[i] = barrelValues[i].getEncodedStatusCode();
-  }
-  
-  // fill in ee
-  auto const& endcapValues = values.endcapItems();
-  auto const offset = barrelValues.size();
-  for (unsigned int i=0; i<endcapValues.size(); i++) {
-    status_[offset + i] = endcapValues[i].getEncodedStatusCode();
-  }
-}
-
-EcalChannelStatusGPU::Product::~Product() {
-  // deallocation
-  cudaCheck( cudaFree(status) );
-}
-
-EcalChannelStatusGPU::Product const& EcalChannelStatusGPU::getProduct(cudaStream_t cudaStream) const { 
-  auto const& product = product_.dataForCurrentDeviceAsync(
-    cudaStream,
-    [this](EcalChannelStatusGPU::Product& product, cudaStream_t cudaStream) {
-      // malloc
-      cudaCheck( cudaMalloc((void**)&product.status,
-                            this->status_.size() * sizeof(uint16_t)) );
-      // transfer 
-      cudaCheck( cudaMemcpyAsync(product.status,
-                                 this->status_.data(),
-                                 this->status_.size() * sizeof(uint16_t),
-                                 cudaMemcpyHostToDevice,
-                                 cudaStream) );
-    }
-  );
-  
-  return product;
-}
-
-TYPELOOKUP_DATA_REG(EcalChannelStatusGPU);
-
-
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
index dbfe4833c7d3f..c8d2926b29afc 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
@@ -83,7 +83,7 @@ namespace ecal {
           gainSwitchUseMaxSampleEB,
           gainSwitchUseMaxSampleEE,
           totalChannels);
-//       cudaCheck(cudaGetLastError());
+      cudaCheck(cudaGetLastError());
 
       //
       // 2d preparation kernel
@@ -112,7 +112,7 @@ namespace ecal {
                                                                scratch.isSaturated,
                                                                offsetForHashes,
                                                                offsetForInputs);
-//       cudaCheck(cudaGetLastError());
+      cudaCheck(cudaGetLastError());
 
       // run minimization kernels
       v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
@@ -148,7 +148,7 @@ namespace ecal {
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
             conditions.sampleMask.getEcalSampleMaskRecordEE(),
             totalChannels);
-//         cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());
 
         //
         // TODO: small kernel only for EB. It needs to be checked if
@@ -170,7 +170,7 @@ namespace ecal {
             conditions.sampleMask.getEcalSampleMaskRecordEB(),
             totalChannels,
             offsetForInputs);
-//         cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());
 
         //
         //
@@ -186,7 +186,7 @@ namespace ecal {
             scratch.sum0sNullHypot,
             scratch.sumAAsNullHypot,
             totalChannels);
-//         cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());
 
         unsigned int nchannels_per_block_makeratio = 10;
         unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
@@ -220,7 +220,7 @@ namespace ecal {
             configParameters.timeFitLimitsSecondEE,
             totalChannels,
             offsetForInputs);
-//         cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());
 
         //
         //
@@ -252,7 +252,7 @@ namespace ecal {
                                                                     scratch.timeError,
                                                                     totalChannels,
                                                                     offsetForInputs);
-//         cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());
 
         //
         //
@@ -298,7 +298,7 @@ namespace ecal {
             offsetForHashes,
             offsetForInputs,
             totalChannels);
-//         cudaCheck(cudaGetLastError());
+        cudaCheck(cudaGetLastError());
       }
 
       /*
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
index c9dba159719b3..c2a01e3d5c349 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
@@ -27,9 +27,9 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
 // for rechit
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
@@ -75,8 +75,8 @@ using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU<
                                                               EcalTimeCalibConstantsRcd
                                                               >;
                                                              
-using EcalADCToGeVConstantGPUESProducer = EcalESProducerGPU<
-                                                            EcalADCToGeVConstantGPU,
+using EcalRechitADCToGeVConstantGPUESProducer = EcalESProducerGPU<
+                                                            EcalRechitADCToGeVConstantGPU,
                                                             EcalADCToGeVConstant,
                                                             EcalADCToGeVConstantRcd
                                                             >;
@@ -87,8 +87,8 @@ using EcalIntercalibConstantsGPUESProducer = EcalESProducerGPU<
                                                                EcalIntercalibConstantsRcd
                                                                >;
 
-using EcalChannelStatusGPUESProducer = EcalESProducerGPU<
-                                                         EcalChannelStatusGPU,
+using EcalRechitChannelStatusGPUESProducer = EcalESProducerGPU<
+                                                         EcalRechitChannelStatusGPU,
                                                          EcalChannelStatus,
                                                          EcalChannelStatusRcd
                                                          >;
@@ -129,9 +129,9 @@ DEFINE_FWK_EVENTSETUP_MODULE(EcalSamplesCorrelationGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalTimeBiasCorrectionsGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalTimeCalibConstantsGPUESProducer);
 
-DEFINE_FWK_EVENTSETUP_MODULE(EcalADCToGeVConstantGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalRechitADCToGeVConstantGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalIntercalibConstantsGPUESProducer);
-DEFINE_FWK_EVENTSETUP_MODULE(EcalChannelStatusGPUESProducer);
+DEFINE_FWK_EVENTSETUP_MODULE(EcalRechitChannelStatusGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosRefGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAlphasGPUESProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index a9d4bb9e670f4..bbe05aceda79b 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -36,9 +36,9 @@
 
 
 // conditions gpu
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
 
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
@@ -95,9 +95,9 @@ class EcalRecHitProducerGPU: public edm::stream::EDProducer<edm::ExternalWork> {
   
   
   // conditions handles
-  edm::ESHandle<EcalADCToGeVConstantGPU>    ADCToGeVConstantHandle_;
-  edm::ESHandle<EcalIntercalibConstantsGPU> IntercalibConstantsHandle_;
-  edm::ESHandle<EcalChannelStatusGPU>       ChannelStatusHandle_;
+  edm::ESHandle<EcalRechitADCToGeVConstantGPU> ADCToGeVConstantHandle_;
+  edm::ESHandle<EcalIntercalibConstantsGPU>    IntercalibConstantsHandle_;
+  edm::ESHandle<EcalRechitChannelStatusGPU>    ChannelStatusHandle_;
   
   edm::ESHandle<EcalLaserAPDPNRatiosGPU>    LaserAPDPNRatiosHandle_;
   edm::ESHandle<EcalLaserAPDPNRatiosRefGPU> LaserAPDPNRatiosRefHandle_;
@@ -401,7 +401,7 @@ void EcalRecHitProducerGPU::acquire(
     ctx.stream()
   );
   
-  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaGetLastError());  // keep the check active: report any CUDA error recorded so far
   
   
 }
diff --git a/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py
index 4886238cc620f..dbfa0ca20e5fe 100644
--- a/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py
+++ b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py
@@ -68,6 +68,18 @@
 #process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
 
 
+
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi")
+    
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi")
+    
+    
+    
 #process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1)
 
 
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
index f1b68836b2101..02f84eebf21b3 100644
--- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
@@ -151,19 +151,23 @@
 #process.ecalRecHit
 
     
-    
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi")
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi")
 #process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi")
+#process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi")
-process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi")
     
 process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi")
 process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi")
     
-#process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi")
-#process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone()
+process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi")
+process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone()
+ 
  
+process.load("RecoLocalCalo.EcalRecProducers.ecalCPURecHitProducer_cfi")
+
  
 #
 # AM : TEST to see if the number of rechits matches
@@ -244,7 +248,7 @@
 
 process.out = cms.OutputModule(
     "PoolOutputModule",
-    fileName = cms.untracked.string("test.root")
+    fileName = cms.untracked.string("testRechit.root")
 )
 
 #process.out = cms.OutputModule("AsciiOutputModule",
@@ -273,8 +277,8 @@
 #   gpu
     *process.ecalUncalibRecHitProducerGPU
     *process.ecalCPUUncalibRecHitProducer
-    #*process.ecalRecHitProducerGPU
-    #*process.ecalCPURecHitProducer
+    *process.ecalRecHitProducerGPU
+    *process.ecalCPURecHitProducer
 )
 
 process.schedule = cms.Schedule(
@@ -296,3 +300,7 @@
 process.MessageLogger.categories.append("CUDAService")
 
 
+#
+process.DependencyGraph = cms.Service("DependencyGraph")
+
+

From 32b56f616fa7336fe12412fbf792337f140b6476 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Wed, 13 May 2020 16:35:35 +0200
Subject: [PATCH 14/30] missing files

---
 .../bin/makeEcalRechitValidationPlots.cpp     | 844 ++++++++++++++++++
 .../interface/EcalRechitADCToGeVConstantGPU.h |  43 +
 .../interface/EcalRechitChannelStatusGPU.h    |  43 +
 .../src/EcalRechitADCToGeVConstantGPU.cc      |  39 +
 .../src/EcalRechitChannelStatusGPU.cc         |  52 ++
 5 files changed, 1021 insertions(+)
 create mode 100644 RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
 create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc

diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
new file mode 100644
index 0000000000000..4e7718791b603
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
@@ -0,0 +1,844 @@
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <fstream>
+
+#include <TCanvas.h>
+#include <TStyle.h>
+#include <TPad.h>
+#include <TFile.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TTree.h>
+#include <TPaveStats.h>
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+
+int main(int argc, char *argv[]) {
+  if (argc<3) {
+    std::cout << "run with: ./makeEcalRechitValidationPlots <path to input file> <output file>\n";
+    exit(0);
+  }
+  // Set the GPU and CPU pointers for both EB and EE
+  edm::Wrapper<ecal::RecHit<ecal::Tag::soa>> *wgpuEB=nullptr;
+  edm::Wrapper<ecal::RecHit<ecal::Tag::soa>> *wgpuEE=nullptr;
+  edm::Wrapper<EBRecHitCollection> *wcpuEB = nullptr;
+  edm::Wrapper<EERecHitCollection> *wcpuEE = nullptr;
+  
+  std::string fileName = argv[1]; // The input file containing the data to be validated (i.e. result.root)
+  std::string outFileName = argv[2]; //The output file in which the validation results will be saved (i.e. output.root)
+  
+  //output
+  TFile rfout{outFileName.c_str(), "recreate"};
+  
+  int nbins = 200;
+  int last = 5000.;
+  
+  int nbins_energy = 300;
+  float last_energy = 2.;
+  
+  int nbins_chi2 = 200;
+  float last_chi2 = 100.;
+  
+  int nbins_flag = 40;
+  //   int nbins_flag = 1000;
+  int last_flag = 1500;
+  //   int nbins_flag = 40;
+  //   int last_flag = 10000;
+  
+  int nbins_extra = 200;
+  int last_extra = 200;
+  
+  int nbins_delta = 201;  // use an odd number to center around 0
+  float delta = 0.2;
+  
+  // RecHits plots for EB and EE on both GPU and CPU
+  auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits. No Filter GPU", nbins, 0, last);
+  auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
+  auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
+  auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
+  auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
+  auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
+  
+  // Selected-RecHits count plots (GPU hits with energy >= 0) for EB and EE, GPU vs CPU
+  auto hSelectedRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last);
+  auto hSelectedRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hSelectedRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hSelectedRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hSelectedRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hSelectedRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hSelectedRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  
+  // Positive-energy RecHits count plots (energy > 0) for EB and EE, GPU vs CPU
+  auto hPositiveRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last);
+  auto hPositiveRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hPositiveRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hPositiveRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hPositiveRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hPositiveRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hPositiveRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  
+  // Energies plots for EB and EE on both GPU and CPU
+  auto hEnergiesEBGPU = new TH1D("EnergiesEBGPU", "EnergiesEBGPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEEGPU = new TH1D("EnergiesEEGPU", "EnergiesEEGPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEBCPU = new TH1D("EnergiesEBCPU", "EnergiesEBCPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEECPU = new TH1D("EnergiesEECPU", "EnergiesEECPU; Energy [GeV]", nbins_energy, 0, last_energy);
+  auto hEnergiesEBGPUvsCPU = new TH2D("EnergiesEBGPUvsCPU", "EnergiesEBGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
+  auto hEnergiesEEGPUvsCPU = new TH2D("EnergiesEEGPUvsCPU", "EnergiesEEGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
+  auto hEnergiesEBGPUCPUratio = new TH1D("EnergiesEBGPU/CPUratio", "EnergiesEBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hEnergiesEEGPUCPUratio = new TH1D("EnergiesEEGPU/CPUratio", "EnergiesEEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hEnergiesEBdeltavsCPU = new TH2D("EnergiesEBdeltavsCPU", "EnergiesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hEnergiesEEdeltavsCPU = new TH2D("EnergiesEEdeltavsCPU", "EnergiesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  
+  // Chi2 plots for EB and EE on both GPU and CPU
+  auto hChi2EBGPU = new TH1D("Chi2EBGPU", "Chi2EBGPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EEGPU = new TH1D("Chi2EEGPU", "Chi2EEGPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EBCPU = new TH1D("Chi2EBCPU", "Chi2EBCPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EECPU = new TH1D("Chi2EECPU", "Chi2EECPU; Ch^{2}", nbins_chi2, 0, last_chi2);
+  auto hChi2EBGPUvsCPU = new TH2D("Chi2EBGPUvsCPU", "Chi2EBGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100);
+  auto hChi2EEGPUvsCPU = new TH2D("Chi2EEGPUvsCPU", "Chi2EEGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100);
+  auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
+  auto hChi2EBdeltavsCPU = new TH2D("Chi2EBdeltavsCPU", "Chi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  auto hChi2EEdeltavsCPU = new TH2D("Chi2EEdeltavsCPU", "Chi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  
+  // Flags plots for EB and EE on both GPU and CPU
+  auto hFlagsEBGPU = new TH1D("FlagsEBGPU", "FlagsEBGPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEBCPU = new TH1D("FlagsEBCPU", "FlagsEBCPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEEGPU = new TH1D("FlagsEEGPU", "FlagsEEGPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEECPU = new TH1D("FlagsEECPU", "FlagsEECPU; Flags", nbins_flag, -10, last_flag);
+  auto hFlagsEBGPUvsCPU = new TH2D("FlagsEBGPUvsCPU", "FlagsEBGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
+  auto hFlagsEEGPUvsCPU = new TH2D("FlagsEEGPUvsCPU", "FlagsEEGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
+  auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 50, -5, 10);
+  auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 50, -5, 10);
+  auto hFlagsEBdeltavsCPU = new TH2D("FlagsEBdeltavsCPU", "FlagsEBdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
+  auto hFlagsEEdeltavsCPU = new TH2D("FlagsEEdeltavsCPU", "FlagsEEdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
+  
+  // Extras plots for EB and EE on both GPU and CPU
+  auto hExtrasEBGPU = new TH1D("ExtrasEBGPU", "ExtrasEBGPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEBCPU = new TH1D("ExtrasEBCPU", "ExtrasEBCPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEEGPU = new TH1D("ExtrasEEGPU", "ExtrasEEGPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEECPU = new TH1D("ExtrasEECPU", "ExtrasEECPU; No. of Extras", nbins_extra, 0, last_extra);
+  auto hExtrasEBGPUvsCPU = new TH2D("ExtrasEBGPUvsCPU", "ExtrasEBGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra,nbins_extra, 0, last_extra);
+  auto hExtrasEEGPUvsCPU = new TH2D("ExtrasEEGPUvsCPU", "ExtrasEEGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra);
+  auto hExtrasEBGPUCPUratio = new TH1D("ExtrasEBGPU/CPUratio", "ExtrasEBGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0);
+  auto hExtrasEEGPUCPUratio = new TH1D("ExtrasEEGPU/CPUratio", "ExtrasEEGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0);
+  auto hExtrasEBdeltavsCPU = new TH2D("ExtrasEBdeltavsCPU", "ExtrasEBdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
+  auto hExtrasEEdeltavsCPU = new TH2D("ExtrasEEdeltavsCPU", "ExtrasEEdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
+  
+  // input file setup for tree
+  std::cout << "validating file " << fileName << std::endl;
+  TFile rf{fileName.c_str()};
+  TTree *rt = (TTree*)rf.Get("Events");
+  
+  // Bind the tree branches to their respective wrapper pointers
+  rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEB_RECO.", &wgpuEB);
+  rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEE_RECO.", &wgpuEE);
+  rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEB_RECO.", &wcpuEB);
+  rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEE_RECO.", &wcpuEE);
+  
+  constexpr float eps_diff = 1e-3;
+  
+  // accumulate sizes for events and sizes of each event on both GPU and CPU
+  //   auto const nentries = rt->GetEntries();
+  int nentries = rt->GetEntries();
+  
+  //---- AM: tests 
+  if (nentries > 1000) {
+    nentries = 1000;
+  }
+  //   nentries = 1;
+  
+  std::cout << "#events to validate over: " << nentries << std::endl;
+  for (int ie=0; ie<nentries; ++ie) {
+    rt->GetEntry(ie);
+    
+    //     const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" };
+    auto cpu_eb_size = wcpuEB->bareProduct().size();
+    auto cpu_ee_size = wcpuEE->bareProduct().size();
+    auto gpu_eb_size = wgpuEB->bareProduct().energy.size();
+    auto gpu_ee_size = wgpuEE->bareProduct().energy.size();
+    float eb_ratio = (float) gpu_eb_size/cpu_eb_size;
+    float ee_ratio = (float) gpu_ee_size/cpu_ee_size;
+    
+    // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
+    hRechitsEBGPU->Fill(gpu_eb_size);
+    hRechitsEBCPU->Fill(cpu_eb_size);
+    hRechitsEEGPU->Fill(gpu_ee_size);
+    hRechitsEECPU->Fill(cpu_ee_size);
+    hRechitsEBGPUvsCPU->Fill(cpu_eb_size, gpu_eb_size);
+    hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size);
+    hRechitsEBGPUCPUratio->Fill(eb_ratio);
+    hRechitsEEGPUCPUratio->Fill(ee_ratio);
+    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size-cpu_eb_size);
+    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size-cpu_ee_size);
+    
+    /*    
+     *    // condition that sizes on GPU and CPU should be the same for EB or EE
+     *       if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
+     *         std::cerr << ie << ordinal[ie % 10] << " entry:\n"
+     *                   << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n"
+     *                   << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl;
+     *                  
+     *         continue;
+  }
+  assert(wgpuEB->bareProduct().energy.size() == wcpuEB->bareProduct().size());
+  assert(wgpuEE->bareProduct().energy.size() == wcpuEE->bareProduct().size()); 
+  auto const neb = wcpuEB->bareProduct().size(); //like cpu_eb_size but set to constant
+  auto const nee = wcpuEE->bareProduct().size(); //like cpu_ee_size but set to constant
+  */
+    
+    uint selected_gpu_eb_size = 0;
+    uint selected_gpu_ee_size = 0;
+    
+    uint positive_gpu_eb_size = 0;
+    uint positive_gpu_ee_size = 0;
+    
+    // EB:
+    for (uint32_t i=0; i<gpu_eb_size; ++i) {
+      auto const did_gpu = wgpuEB->bareProduct().did[i]; // set the did for the current RecHit
+      // Set the variables for GPU
+      auto const enr_gpu = wgpuEB->bareProduct().energy[i]; 
+      auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
+      auto const flag_gpu = wgpuEB->bareProduct().flagBits[i]; 
+      auto const extra_gpu = wgpuEB->bareProduct().extra[i];
+      
+      // you have "-1" if the crystal is not selected
+      if ( enr_gpu>=0 ) {
+        selected_gpu_eb_size++;
+        
+        if ( enr_gpu>0 ) {
+          positive_gpu_eb_size++;
+        }
+        
+        // find the Rechit on CPU reflecting the same did
+        auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); 
+        if (cpu_iter == wcpuEB->bareProduct().end()) {
+          //           std::cerr << ie << ordinal[ie % 10] << " entry\n"
+          //                   << "  Did not find a DetId " << did_gpu_eb
+          //                 << " in a CPU collection\n";
+          std::cerr << "  Did not find a DetId " << did_gpu << " in a CPU collection\n";
+          continue;
+        }
+        // Set the variables for CPU
+        auto const enr_cpu = cpu_iter->energy();
+        auto const chi2_cpu = cpu_iter->chi2();
+//         auto const flag_cpu = cpu_iter->flagBits();
+        auto const flag_cpu = 1;
+//         auto const extra_cpu = cpu_iter->extra();
+        auto const extra_cpu = 1;
+        //       auto const flag_cpu = cpu_iter->flagBits() ? cpu_iter->flagBits():-1;
+        //       auto const extra_cpu = cpu_iter->extra() ? cpu_iter->extra():-1;
+        
+        // AM: TEST
+        //       if (extra_cpu != 10) continue;
+        
+        // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta
+        hEnergiesEBGPU->Fill(enr_gpu);
+        hEnergiesEBCPU->Fill(enr_cpu);
+        //       std::cout<<"EB CPU Energy:\t"<<enr_cpu<<std::endl;
+        hEnergiesEBGPUvsCPU->Fill(enr_cpu, enr_gpu);
+        hEnergiesEBGPUCPUratio->Fill(enr_gpu/enr_cpu);
+        hEnergiesEBdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu);
+        
+        hChi2EBGPU->Fill(chi2_gpu);
+        hChi2EBCPU->Fill(chi2_cpu);
+        hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+        hChi2EBGPUCPUratio->Fill(chi2_gpu/chi2_cpu);
+        hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+        
+        hFlagsEBGPU->Fill(flag_gpu);
+        hFlagsEBCPU->Fill(flag_cpu);
+        hFlagsEBGPUvsCPU->Fill(flag_cpu, flag_gpu);
+        hFlagsEBGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1);
+        hFlagsEBdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu);
+        
+        hExtrasEBGPU->Fill(extra_gpu);
+        hExtrasEBCPU->Fill(extra_cpu);
+        hExtrasEBGPUvsCPU->Fill(extra_cpu, extra_gpu);
+        hExtrasEBGPUCPUratio->Fill(extra_cpu ? extra_gpu/extra_cpu : -1);
+        hExtrasEBdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu);
+        
+        // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message
+        // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or
+        //      (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
+        //  {
+        //      printf("EB eventid = %d chid = %d energy_gpu = %f energy_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+        //          ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu);
+        //      if (std::isnan(chi2_gpu))
+        //        printf("*** nan ***\n");
+        //  } 
+        
+      }
+    }
+    
+    // EE:
+    for (uint32_t i=0; i<gpu_ee_size; ++i) {
+      auto const did_gpu = wgpuEE->bareProduct().did[i]; // set the did for the current RecHit
+      // Set the variables for GPU
+      auto const enr_gpu = wgpuEE->bareProduct().energy[i]; 
+      auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
+      auto const flag_gpu = wgpuEE->bareProduct().flagBits[i]; 
+      auto const extra_gpu = wgpuEE->bareProduct().extra[i];
+      
+      // you have "-1" if the crystal is not selected
+      if ( enr_gpu>=0 ) {
+        selected_gpu_ee_size++;
+        
+        if ( enr_gpu>0 ) {
+          positive_gpu_ee_size++;
+        }
+        
+        // find the Rechit on CPU reflecting the same did
+        auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); 
+        if (cpu_iter == wcpuEE->bareProduct().end()) {
+          //    std::cerr << ie << ordinal[ie % 10] << " entry\n"
+          //            << "  Did not find a DetId " << did_gpu
+          //          << " in a CPU collection\n";
+          std::cerr << "  Did not find a DetId " << did_gpu << " in a CPU collection\n";
+          continue;
+        }
+        // Set the variables for CPU
+        auto const enr_cpu = cpu_iter->energy();
+        auto const chi2_cpu = cpu_iter->chi2();
+//         auto const flag_cpu = cpu_iter->flagBits();
+        auto const flag_cpu = 1;
+//         auto const extra_cpu = cpu_iter->extra();
+        auto const extra_cpu = 1;
+        //       auto const flag_cpu = cpu_iter->flagBits()?cpu_iter->flagBits():-1;
+        //       auto const extra_cpu = cpu_iter->extra()?cpu_iter->extra():-1;
+        
+        
+        // AM: TEST
+        //       if (extra_cpu != 10) continue;
+        
+        
+        // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta
+        hEnergiesEEGPU->Fill(enr_gpu);
+        hEnergiesEECPU->Fill(enr_cpu);
+        hEnergiesEEGPUvsCPU->Fill(enr_cpu, enr_gpu);
+        hEnergiesEEGPUCPUratio->Fill(enr_gpu/enr_cpu);
+        hEnergiesEEdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu);
+        
+        hChi2EEGPU->Fill(chi2_gpu);
+        hChi2EECPU->Fill(chi2_cpu);
+        hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
+        hChi2EEGPUCPUratio->Fill(chi2_gpu/chi2_cpu);
+        hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
+        
+        hFlagsEEGPU->Fill(flag_gpu);
+        hFlagsEECPU->Fill(flag_cpu);
+        hFlagsEEGPUvsCPU->Fill(flag_cpu, flag_gpu);
+        hFlagsEEGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1);
+        hFlagsEEdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu);
+        
+        hExtrasEEGPU->Fill(extra_gpu);
+        hExtrasEECPU->Fill(extra_cpu);
+        hExtrasEEGPUvsCPU->Fill(extra_cpu, extra_gpu);
+        hExtrasEEGPUCPUratio->Fill(extra_cpu ? extra_gpu/extra_cpu : -1);
+        hExtrasEEdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu);
+        
+        // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message
+        // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or
+        //      (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
+        //  {
+        //      printf("EE eventid = %d chid = %d energy_gpu = %f energy_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
+        //          ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu);
+        //      if (std::isnan(chi2_gpu))
+        //        printf("*** nan ***\n");
+        //  } 
+      }
+    }
+    
+    
+    //
+    // now the selected rechit counting (GPU hits with energy >= 0 vs all CPU hits)
+    //
+    float selected_eb_ratio = (float) selected_gpu_eb_size/cpu_eb_size;
+    float selected_ee_ratio = (float) selected_gpu_ee_size/cpu_ee_size;
+    
+    // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
+    hSelectedRechitsEBGPU->Fill(selected_gpu_eb_size);
+    hSelectedRechitsEBCPU->Fill(cpu_eb_size);
+    hSelectedRechitsEEGPU->Fill(selected_gpu_ee_size);
+    hSelectedRechitsEECPU->Fill(cpu_ee_size);
+    hSelectedRechitsEBGPUvsCPU->Fill(cpu_eb_size, selected_gpu_eb_size);
+    hSelectedRechitsEEGPUvsCPU->Fill(cpu_ee_size, selected_gpu_ee_size);
+    hSelectedRechitsEBGPUCPUratio->Fill(selected_eb_ratio);
+    hSelectedRechitsEEGPUCPUratio->Fill(selected_ee_ratio);
+    hSelectedRechitsEBdeltavsCPU->Fill(cpu_eb_size, selected_gpu_eb_size-cpu_eb_size);
+    hSelectedRechitsEEdeltavsCPU->Fill(cpu_ee_size, selected_gpu_ee_size-cpu_ee_size);
+    
+    
+    //
+    // now the positive-energy rechit counting (energy > 0 on both GPU and CPU)
+    //
+    
+    
+    uint positive_cpu_eb_size = 0;
+    uint positive_cpu_ee_size = 0;
+    
+    // EB:
+    for (uint32_t i=0; i<cpu_eb_size; ++i) {
+      auto const enr_cpu = wcpuEB->bareProduct()[i].energy(); 
+      if (enr_cpu > 0) {
+        positive_cpu_eb_size++;
+      }
+    }
+    // EE:
+    for (uint32_t i=0; i<cpu_ee_size; ++i) {
+      auto const enr_cpu = wcpuEE->bareProduct()[i].energy(); 
+      if (enr_cpu > 0) {
+        positive_cpu_ee_size++;
+      }
+    }
+    
+    
+    float positive_eb_ratio = (float) positive_gpu_eb_size/positive_cpu_eb_size;
+    float positive_ee_ratio = (float) positive_gpu_ee_size/positive_cpu_ee_size;
+    
+    // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
+    hPositiveRechitsEBGPU->Fill(positive_gpu_eb_size);
+    hPositiveRechitsEBCPU->Fill(positive_cpu_eb_size);
+    hPositiveRechitsEEGPU->Fill(positive_gpu_ee_size);
+    hPositiveRechitsEECPU->Fill(positive_cpu_ee_size);
+    hPositiveRechitsEBGPUvsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size);
+    hPositiveRechitsEEGPUvsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size);
+    hPositiveRechitsEBGPUCPUratio->Fill(positive_eb_ratio);
+    hPositiveRechitsEEGPUCPUratio->Fill(positive_ee_ratio);
+    hPositiveRechitsEBdeltavsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size-positive_cpu_eb_size);
+    hPositiveRechitsEEdeltavsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size-positive_cpu_ee_size);
+    
+    
+    
+    if (cpu_eb_size != selected_gpu_eb_size or cpu_ee_size != selected_gpu_ee_size) {
+      //       std::cerr << ie << ordinal[ie % 10] << " entry:\n"
+      std::cerr << ie << " entry:\n"
+      << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << selected_gpu_eb_size << " (gpu)\n"
+      << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << selected_gpu_ee_size << " (gpu)" << std::endl;
+    }
+    
+    
+    
+  }
+  
+  
+  
+  
+  // Plotting the results:
+  {
+    // Canvases Setup:
+    TCanvas cAllRechits("AllRechits", "AllRechits", 1750, 860);
+    cAllRechits.Divide(3, 2);
+    TCanvas cRechits("Rechits", "Rechits", 1750, 860);
+    cRechits.Divide(3, 2);
+    TCanvas cRechitsPositive("RechitsPositive", "RechitsPositive", 1750, 860);
+    cRechitsPositive.Divide(3, 2);
+    TCanvas cEnergies("Energies", "Energies", 1750, 860);
+    cEnergies.Divide(3, 2);
+    TCanvas cChi2("Chi2", "Chi2", 1750, 860);
+    cChi2.Divide(3, 2);
+    TCanvas cFlags("Flags", "Flags", 1750, 860);
+    cFlags.Divide(3, 2);
+    TCanvas cExtras("Extras", "Extras", 1750, 860);
+    cExtras.Divide(3, 2);
+    
+    
+    
+    // Plotting the total rechit counts of GPU vs CPU for each event, for EB and EE
+    cAllRechits.cd(1);
+    {
+      gPad->SetLogy();
+      hRechitsEBCPU->SetLineColor(kRed);
+      hRechitsEBCPU->SetLineWidth(2);
+      hRechitsEBCPU->Draw("");
+      hRechitsEBGPU->SetLineColor(kBlue);
+      hRechitsEBGPU->SetLineWidth(2);
+      hRechitsEBGPU->Draw("sames");
+      cAllRechits.Update();
+      auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    } 
+    cAllRechits.cd(4);
+    {
+      gPad->SetLogy();
+      hRechitsEECPU->SetLineColor(kRed);
+      hRechitsEECPU->SetLineWidth(2);
+      hRechitsEECPU->Draw("");
+      hRechitsEEGPU->SetLineColor(kBlue);
+      hRechitsEEGPU->SetLineWidth(2);
+      hRechitsEEGPU->Draw("sames");
+      cAllRechits.Update();
+      auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cAllRechits.cd(2); {
+      gStyle->SetPalette(55);
+      hRechitsEBGPUvsCPU->Draw("COLZ");
+    }
+    cAllRechits.cd(5); {
+      gStyle->SetPalette(55);
+      hRechitsEEGPUvsCPU->Draw("COLZ");
+    }
+    cAllRechits.cd(3); {
+      gPad->SetLogy();
+      //hRechitsEBdeltavsCPU->Draw("COLZ");
+      hRechitsEBGPUCPUratio->Draw("");
+    }
+    cAllRechits.cd(6); {
+      gPad->SetLogy();
+      //hRechitsEEdeltavsCPU->Draw("COLZ");
+      hRechitsEEGPUCPUratio->Draw("");
+    }
+    cAllRechits.SaveAs("ecal-allrechits.root");
+    cAllRechits.SaveAs("ecal-allrechits.png");
+    
+    
+    
+    // Plotting the selected rechit counts of GPU vs CPU for each event, for EB and EE
+    cRechits.cd(1);
+    {
+      gPad->SetLogy();
+      hSelectedRechitsEBCPU->SetLineColor(kRed);
+      hSelectedRechitsEBCPU->SetLineWidth(2);
+      hSelectedRechitsEBCPU->Draw("");
+      hSelectedRechitsEBGPU->SetLineColor(kBlue);
+      hSelectedRechitsEBGPU->SetLineWidth(2);
+      hSelectedRechitsEBGPU->Draw("sames");
+      cRechits.Update();
+      auto stats = (TPaveStats*)hSelectedRechitsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    } 
+    cRechits.cd(4);
+    {
+      gPad->SetLogy();
+      hSelectedRechitsEECPU->SetLineColor(kRed);
+      hSelectedRechitsEECPU->SetLineWidth(2);
+      hSelectedRechitsEECPU->Draw("");
+      hSelectedRechitsEEGPU->SetLineColor(kBlue);
+      hSelectedRechitsEEGPU->SetLineWidth(2);
+      hSelectedRechitsEEGPU->Draw("sames");
+      cRechits.Update();
+      auto stats = (TPaveStats*)hSelectedRechitsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cRechits.cd(2); {
+      gStyle->SetPalette(55);
+      hSelectedRechitsEBGPUvsCPU->Draw("COLZ");
+    }
+    cRechits.cd(5); {
+      gStyle->SetPalette(55);
+      hSelectedRechitsEEGPUvsCPU->Draw("COLZ");
+    }
+    cRechits.cd(3); {
+      gPad->SetLogy();
+      //hSelectedRechitsEBdeltavsCPU->Draw("COLZ");
+      hSelectedRechitsEBGPUCPUratio->Draw("");
+    }
+    cRechits.cd(6); {
+      gPad->SetLogy();
+      //hSelectedRechitsEEdeltavsCPU->Draw("COLZ");
+      hSelectedRechitsEEGPUCPUratio->Draw("");
+    }
+    cRechits.SaveAs("ecal-rechits.root");
+    cRechits.SaveAs("ecal-rechits.png");
+    
+    
+    
+    
+    // Plotting the positive-energy rechit counts of GPU vs CPU for each event, for EB and EE
+    cRechitsPositive.cd(1);
+    {
+      gPad->SetLogy();
+      hPositiveRechitsEBCPU->SetLineColor(kRed);
+      hPositiveRechitsEBCPU->SetLineWidth(2);
+      hPositiveRechitsEBCPU->Draw("");
+      hPositiveRechitsEBGPU->SetLineColor(kBlue);
+      hPositiveRechitsEBGPU->SetLineWidth(2);
+      hPositiveRechitsEBGPU->Draw("sames");
+      cRechitsPositive.Update();
+      auto stats = (TPaveStats*)hPositiveRechitsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    } 
+    cRechitsPositive.cd(4);
+    {
+      gPad->SetLogy();
+      hPositiveRechitsEECPU->SetLineColor(kRed);
+      hPositiveRechitsEECPU->SetLineWidth(2);
+      hPositiveRechitsEECPU->Draw("");
+      hPositiveRechitsEEGPU->SetLineColor(kBlue);
+      hPositiveRechitsEEGPU->SetLineWidth(2);
+      hPositiveRechitsEEGPU->Draw("sames");
+      cRechitsPositive.Update();
+      auto stats = (TPaveStats*)hPositiveRechitsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cRechitsPositive.cd(2); {
+      gStyle->SetPalette(55);
+      hPositiveRechitsEBGPUvsCPU->Draw("COLZ");
+    }
+    cRechitsPositive.cd(5); {
+      gStyle->SetPalette(55);
+      hPositiveRechitsEEGPUvsCPU->Draw("COLZ");
+    }
+    cRechitsPositive.cd(3); {
+      gPad->SetLogy();
+      //hPositiveRechitsEBdeltavsCPU->Draw("COLZ");
+      hPositiveRechitsEBGPUCPUratio->Draw("");
+    }
+    cRechitsPositive.cd(6); {
+      gPad->SetLogy();
+      //hPositiveRechitsEEdeltavsCPU->Draw("COLZ");
+      hPositiveRechitsEEGPUCPUratio->Draw("");
+    }
+    cRechitsPositive.SaveAs("ecal-rechits-positive.root");
+    cRechitsPositive.SaveAs("ecal-rechits-positive.png");
+    
+    
+    cEnergies.cd(1);
+    {
+      gPad->SetLogy();
+      hEnergiesEBCPU->SetLineColor(kBlack);
+      hEnergiesEBCPU->SetLineWidth(2);
+      hEnergiesEBCPU->Draw("");
+      hEnergiesEBGPU->SetLineColor(kBlue);
+      hEnergiesEBGPU->SetLineWidth(2);
+      hEnergiesEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergiesEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cEnergies.cd(4);
+    {
+      gPad->SetLogy();
+      hEnergiesEECPU->SetLineColor(kBlack);
+      hEnergiesEECPU->SetLineWidth(2);
+      hEnergiesEECPU->Draw("");
+      hEnergiesEEGPU->SetLineColor(kBlue);
+      hEnergiesEEGPU->SetLineWidth(2);
+      hEnergiesEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hEnergiesEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cEnergies.cd(2); {
+      hEnergiesEBGPUvsCPU->Draw("COLZ");
+    }
+    cEnergies.cd(5); {
+      hEnergiesEEGPUvsCPU->Draw("COLZ");
+    }
+    cEnergies.cd(3); {
+      gPad->SetLogy();
+      //hEnergiesEBdeltavsCPU->Draw("COLZ");
+      hEnergiesEBGPUCPUratio->Draw("");
+    }
+    cEnergies.cd(6); {
+      gPad->SetLogy();
+      //hEnergiesEEdeltavsCPU->Draw("COLZ");
+      hEnergiesEEGPUCPUratio->Draw("");
+    }
+    cEnergies.SaveAs("ecal-energies.root");
+    cEnergies.SaveAs("ecal-energies.png");
+    
+    
+    cChi2.cd(1);
+    {
+      gPad->SetLogy();
+      hChi2EBCPU->SetLineColor(kBlack);
+      hChi2EBCPU->SetLineWidth(2);
+      hChi2EBCPU->Draw("");
+      hChi2EBGPU->SetLineColor(kBlue);
+      hChi2EBGPU->SetLineWidth(2);
+      hChi2EBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cChi2.cd(4);
+    {
+      gPad->SetLogy();
+      hChi2EECPU->SetLineColor(kBlack);
+      hChi2EECPU->SetLineWidth(2);
+      hChi2EECPU->Draw("");
+      hChi2EEGPU->SetLineColor(kBlue);
+      hChi2EEGPU->SetLineWidth(2);
+      hChi2EEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cChi2.cd(2); {
+      hChi2EBGPUvsCPU->Draw("COLZ");
+    }
+    cChi2.cd(5); {
+      hChi2EEGPUvsCPU->Draw("COLZ");
+    }
+    cChi2.cd(3); {
+      gPad->SetLogy();
+      //hChi2EBdeltavsCPU->Draw("COLZ");
+      hChi2EBGPUCPUratio->Draw("");
+    }
+    cChi2.cd(6); {
+      gPad->SetLogy();
+      //hChi2EEdeltavsCPU->Draw("COLZ");
+      hChi2EEGPUCPUratio->Draw("");
+    }
+    cChi2.SaveAs("ecal-chi2.root");
+    cChi2.SaveAs("ecal-chi2.png");
+    
+    
+    cFlags.cd(1);
+    {
+      gPad->SetLogy();
+      hFlagsEBCPU->SetLineColor(kBlack);
+      hFlagsEBCPU->SetLineWidth(2);
+      hFlagsEBCPU->Draw("");
+      hFlagsEBGPU->SetLineColor(kBlue);
+      hFlagsEBGPU->SetLineWidth(2);
+      hFlagsEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cFlags.cd(4);
+    {
+      gPad->SetLogy();
+      hFlagsEECPU->SetLineColor(kBlack);
+      hFlagsEECPU->SetLineWidth(2);
+      hFlagsEECPU->Draw("");
+      hFlagsEEGPU->SetLineColor(kBlue);
+      hFlagsEEGPU->SetLineWidth(2);
+      hFlagsEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cFlags.cd(2); {
+      hFlagsEBGPUvsCPU->Draw("COLZ");
+    }
+    cFlags.cd(5); {
+      hFlagsEEGPUvsCPU->Draw("COLZ");
+    }
+    cFlags.cd(3); {
+      gPad->SetLogy();
+      //hFlagsEBdeltavsCPU->Draw("COLZ");
+      hFlagsEBGPUCPUratio->Draw("");
+    }
+    cFlags.cd(6); {
+      gPad->SetLogy();
+      //hFlagsEEdeltavsCPU->Draw("COLZ");
+      hFlagsEEGPUCPUratio->Draw("");
+    }
+    cFlags.SaveAs("ecal-flags.root");
+    cFlags.SaveAs("ecal-flags.png");
+    
+    
+    cExtras.cd(1);
+    {
+      gPad->SetLogy();
+      hExtrasEBCPU->SetLineColor(kBlack);
+      hExtrasEBCPU->SetLineWidth(2);
+      hExtrasEBCPU->Draw("");
+      hExtrasEBGPU->SetLineColor(kBlue);
+      hExtrasEBGPU->SetLineWidth(2);
+      hExtrasEBGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hExtrasEBGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cExtras.cd(4);
+    {
+      gPad->SetLogy();
+      hExtrasEECPU->SetLineColor(kBlack);
+      hExtrasEECPU->SetLineWidth(2);
+      hExtrasEECPU->Draw("");
+      hExtrasEEGPU->SetLineColor(kBlue);
+      hExtrasEEGPU->SetLineWidth(2);
+      hExtrasEEGPU->Draw("sames");
+      gPad->Update();
+      auto stats = (TPaveStats*)hExtrasEEGPU->FindObject("stats");
+      auto y2 = stats->GetY2NDC();
+      auto y1 = stats->GetY1NDC();
+      stats->SetY2NDC(y1);
+      stats->SetY1NDC(y1 - (y2-y1));
+    }
+    cExtras.cd(2); {
+      hExtrasEBGPUvsCPU->Draw("COLZ");
+    }
+    cExtras.cd(5); {
+      hExtrasEEGPUvsCPU->Draw("COLZ");
+    }
+    cExtras.cd(3); {
+      gPad->SetLogy();
+      //hExtrasEBdeltavsCPU->Draw("COLZ");
+      hExtrasEBGPUCPUratio->Draw("");
+    }
+    cExtras.cd(6); {
+      gPad->SetLogy();
+      //hExtrasEEdeltavsCPU->Draw("COLZ");
+      hExtrasEEGPUCPUratio->Draw("");
+    }
+    cExtras.SaveAs("ecal-extras.root");
+    cExtras.SaveAs("ecal-extras.png");
+  } 
+  
+  // Close all open files
+  rf.Close();
+  rfout.Write();
+  rfout.Close();
+  
+  return 0;
+}
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
new file mode 100644
index 0000000000000..8addc316f366d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalRechitADCToGeVConstantGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalRechitADCToGeVConstantGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+class EcalRechitADCToGeVConstantGPU {
+public:
+  struct Product {
+    ~Product();
+    float *adc2gev = nullptr;
+  };
+  
+  #ifndef __CUDACC__
+  
+  // 
+  EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const&);
+  
+  // will call dealloation for Product thru ~Product
+  ~EcalRechitADCToGeVConstantGPU() = default;
+  
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+  
+  // 
+  static std::string name() { return std::string{"ecalRechitADCToGeVConstantGPU"}; }
+  
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<float, CUDAHostAllocator<float>> adc2gev_;
+  
+  cms::cuda::ESProduct<Product> product_;
+  
+  #endif
+};
+
+
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
new file mode 100644
index 0000000000000..2329b3752089d
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
@@ -0,0 +1,43 @@
+#ifndef RecoLocalCalo_EcalRecProducers_src_EcalRechitChannelStatusGPU_h
+#define RecoLocalCalo_EcalRecProducers_src_EcalRechitChannelStatusGPU_h
+
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#endif
+
+class EcalRechitChannelStatusGPU {
+public:
+  struct Product {
+    ~Product();
+    uint16_t *status = nullptr;
+  };
+  
+  #ifndef __CUDACC__
+  
+  // 
+  EcalRechitChannelStatusGPU(EcalChannelStatus const&);
+  
+  // will call dealloation for Product thru ~Product
+  ~EcalRechitChannelStatusGPU() = default;
+  
+  // get device pointers
+  Product const& getProduct(cudaStream_t) const;
+  
+  // 
+  static std::string name() { return std::string{"ecalRechitChannelStatusGPU"}; }
+  
+private:
+  // in the future, we need to arrange so to avoid this copy on the host
+  // store eb first then ee
+  std::vector<uint16_t, CUDAHostAllocator<uint16_t>> status_;
+  
+  cms::cuda::ESProduct<Product> product_;
+  
+  #endif
+};
+
+
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
new file mode 100644
index 0000000000000..3824b0989f622
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
@@ -0,0 +1,39 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalRechitADCToGeVConstantGPU::EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const& values) 
+: adc2gev_(2)  // size is 2, one form EB and one for EE
+{
+  adc2gev_[0] = values.getEBValue();
+  adc2gev_[1] = values.getEEValue(); 
+}
+
+EcalRechitADCToGeVConstantGPU::Product::~Product() {
+  // deallocation
+  cudaCheck( cudaFree(adc2gev) );
+}
+
+EcalRechitADCToGeVConstantGPU::Product const& EcalRechitADCToGeVConstantGPU::getProduct(
+  cudaStream_t cudaStream) const 
+{
+  auto const& product = product_.dataForCurrentDeviceAsync(
+                   cudaStream,
+                   [this](EcalRechitADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) {
+                         // malloc
+                         cudaCheck( cudaMalloc((void**)&product.adc2gev,
+                                               this->adc2gev_.size() * sizeof(float)) );
+                         // transfer 
+                         cudaCheck( cudaMemcpyAsync(product.adc2gev,
+                                                    this->adc2gev_.data(),
+                                                    this->adc2gev_.size() * sizeof(float),
+                                                    cudaMemcpyHostToDevice,
+                                                    cudaStream) );
+                   }
+  );
+  
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalRechitADCToGeVConstantGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
new file mode 100644
index 0000000000000..53f32df6f9697
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
@@ -0,0 +1,52 @@
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) 
+: status_(values.size())
+{
+  
+  std::cout << " I am running EcalRechitChannelStatusGPU ::>> debug ongoing ... " << std::endl;
+  
+  // fill in eb
+  auto const& barrelValues = values.barrelItems();
+  for (unsigned int i=0; i<barrelValues.size(); i++) {
+    status_[i] = barrelValues[i].getEncodedStatusCode();
+  }
+  
+  // fill in ee
+  auto const& endcapValues = values.endcapItems();
+  auto const offset = barrelValues.size();
+  for (unsigned int i=0; i<endcapValues.size(); i++) {
+    status_[offset + i] = endcapValues[i].getEncodedStatusCode();
+  }
+}
+
+EcalRechitChannelStatusGPU::Product::~Product() {
+  // deallocation
+  cudaCheck( cudaFree(status) );
+}
+
+EcalRechitChannelStatusGPU::Product const& EcalRechitChannelStatusGPU::getProduct(
+  cudaStream_t cudaStream) const
+  {
+    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
+                                                             [this](EcalRechitChannelStatusGPU::Product& product, cudaStream_t cudaStream) {
+                                                               // malloc
+                                                               cudaCheck( cudaMalloc((void**)&product.status,
+                                                                                     this->status_.size() * sizeof(uint16_t)) );
+                                                               // transfer 
+                                                               cudaCheck( cudaMemcpyAsync(product.status,
+                                                                                          this->status_.data(),
+                                                                                          this->status_.size() * sizeof(uint16_t),
+                                                                                          cudaMemcpyHostToDevice,
+                                                                                          cudaStream) );
+                                                             }
+    );
+    
+    return product;
+  }
+  
+  TYPELOOKUP_DATA_REG(EcalRechitChannelStatusGPU);
+  
\ No newline at end of file

From f159c2922c2c5b2f1e2608e7639a53fd0678f934 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Fri, 15 May 2020 17:58:11 +0200
Subject: [PATCH 15/30] fixes as for PR and clang-tidy

---
 .../interface/EcalChannelStatusCode.h         | 19 +--------
 EventFilter/EcalRawToDigi/BuildFile.xml       | 10 ++---
 .../EcalRecAlgos/interface/DeclsForKernels.h  | 41 +++++++++++--------
 .../src/EcalRecHitBuilderKernels.cu           | 24 ++++++-----
 .../src/EcalRecHitBuilderKernels.h            |  4 +-
 .../src/EcalRechitChannelStatusGPU.cc         |  5 +--
 .../EcalRecAlgos/src/KernelHelpers.cu         |  6 +--
 RecoLocalCalo/EcalRecProducers/BuildFile.xml  |  1 -
 .../EcalRecProducers/plugins/BuildFile.xml    |  2 -
 .../plugins/EcalCPURecHitProducer.cc          |  3 +-
 .../plugins/EcalRecHitProducerGPU.cc          |  7 +---
 .../test/testEcalRechitProducer_cfg.py        |  7 +++-
 .../test/testEcalUncalibRechitProducer_cfg.py |  2 -
 13 files changed, 59 insertions(+), 72 deletions(-)

diff --git a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h
index a52868fe0d8df..09202950bfc68 100644
--- a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h
+++ b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h
@@ -5,7 +5,6 @@
  * Created: 14 Nov 2006
  **/
 
-
 #include "CondFormats/Serialization/interface/Serializable.h"
 
 #include <iostream>
@@ -17,10 +16,7 @@
  */
 
 class EcalChannelStatusCode {
-
-  
 public:
-  
   enum Code {
     kOk = 0,
     kDAC,
@@ -39,22 +35,12 @@ class EcalChannelStatusCode {
     kNoDataNoTP
   };
 
-  enum Bits {
-    kHV=0,
-    kLV,
-    kDAQ,
-    kTP,
-    kTrigger,
-    kTemperature,
-    kNextToDead
-  };
-  
+  enum Bits { kHV = 0, kLV, kDAQ, kTP, kTrigger, kTemperature, kNextToDead };
+
 public:
-  
   EcalChannelStatusCode() : status_(0) {}
   EcalChannelStatusCode(const uint16_t& encodedStatus) : status_(encodedStatus){};
 
-  
   void print(std::ostream& s) const { s << "status is: " << status_; }
 
   /// return decoded status
@@ -69,7 +55,6 @@ class EcalChannelStatusCode {
   static const int chStatusMask = 0x1F;
 
 private:
-  
   static const int kBitsOffset = 5;
   /* bits 1-5 store a status code:
        	0 	channel ok 
diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml
index b23c65ec201e4..da28405324833 100644
--- a/EventFilter/EcalRawToDigi/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/BuildFile.xml
@@ -1,4 +1,6 @@
 <use name="boost"/>
+<use name="cuda"/>
+<use name="CUDADataFormats/EcalDigi" />
 <use name="CalibCalorimetry/EcalLaserCorrection"/>
 <use name="CondFormats/DataRecord"/>
 <use name="CondFormats/EcalObjects"/>
@@ -14,15 +16,11 @@
 <use name="FWCore/PluginManager"/>
 <use name="Geometry/EcalMapping"/>
 <use name="Geometry/Records"/>
+<use name="HeterogeneousCore/CUDACore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="RecoLocalCalo/EcalRecAlgos"/>
 <use name="RecoLocalCalo/EcalRecProducers"/>
 <use name="Utilities/StorageFactory"/>
-
-<use   name="cuda"/>
-<use   name="HeterogeneousCore/CUDAUtilities"/>
-<use   name="HeterogeneousCore/CUDACore"/>
-<use   name="CUDADataFormats/EcalDigi" />
-
 <export>
   <lib name="1"/>
 </export>
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
index a35ef1c57a381..92d4bee3100f3 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
@@ -11,6 +11,10 @@
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
 
+// 
+// ECAL UncalibRechit producer
+// 
+
 #include "CondFormats/EcalObjects/interface/EcalWeightSet.h"
 #include "CondFormats/EcalObjects/interface/EcalPedestals.h"
 #include "CondFormats/EcalObjects/interface/EcalGainRatios.h"
@@ -27,6 +31,27 @@
 
 #include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
 
+// 
+// ECAL Rechit producer
+// 
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h"
+
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+
+
+
+
 struct EcalPulseShape;
 class EcalSampleMask;
 class EcalTimeBiasCorrections;
@@ -278,22 +303,6 @@ struct conf_data {
 // ECAL Rechit producer
 // 
 
-#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
-
-#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
-#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h"
-
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
-
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
-
-
-
 namespace ecal { 
   namespace rechit {
     
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index 5c50bdaa58f7f..c4f3d22dd0a1d 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -143,7 +143,7 @@ namespace ecal {
       uint32_t* extra,
       // other
       int const nchannels,
-      uint32_t const offsetForInput,
+      uint32_t const nChannelsBarrel,
       uint32_t const offsetForHashes                     
     ) {
       
@@ -155,37 +155,39 @@ namespace ecal {
       int ch = threadIdx.x + blockDim.x*blockIdx.x;
       
       if (ch < nchannels) {
+      
+        bool isEndcap = (ch >= nChannelsBarrel);
         
-        int const inputCh = ch >= offsetForInput
-        ? ch - offsetForInput
+        int const inputCh = isEndcap
+        ? ch - nChannelsBarrel
         : ch;
         
-        uint32_t const * didCh = ch >= offsetForInput
+        uint32_t const * didCh = isEndcap
         ? did_ee
         : did_eb;
         
         // only two values, EB or EE
         // AM : FIXME : why not using "isBarrel" ?    isBarrel ? adc2gev[0] : adc2gev[1]
-        float adc2gev_to_use = ch >= offsetForInput
+        float adc2gev_to_use = isEndcap
         ? adc2gev[1]  // ee
         : adc2gev[0]; // eb
         
         
         // first EB and then EE
         
-        ::ecal::reco::StorageScalarType const* amplitude = ch >= offsetForInput
+        ::ecal::reco::StorageScalarType const* amplitude = isEndcap
         ? amplitude_ee
         : amplitude_eb;
         
-        ::ecal::reco::StorageScalarType const* time_in = ch >= offsetForInput
+        ::ecal::reco::StorageScalarType const* time_in = isEndcap
         ? time_ee
         : time_eb;
         
-        ::ecal::reco::StorageScalarType const* chi2_in = ch >= offsetForInput
+        ::ecal::reco::StorageScalarType const* chi2_in = isEndcap
         ? chi2_ee
         : chi2_eb;
         
-        uint32_t const* flags_in = ch >= offsetForInput
+        uint32_t const* flags_in = isEndcap
         ? flags_ee
         : flags_eb;
         
@@ -641,7 +643,7 @@ namespace ecal {
       //     eventDataForScratchGPU_,
       ConditionsProducts const& conditions, 
       ConfigurationParameters const& configParameters,
-      uint32_t const  offsetForInput,
+      uint32_t const  nChannelsBarrel,
       edm::TimeValue_t const event_time,
       cudaStream_t cudaStream
     ){
@@ -724,7 +726,7 @@ namespace ecal {
         eventOutputGPU.extra,
         // other
         nchannels,
-        offsetForInput,
+        nChannelsBarrel,
         conditions.offsetForHashes
       );
       
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
index a1809dbded6bd..30bc589a9a5c2 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
@@ -74,7 +74,7 @@ namespace ecal {
       uint32_t* flagBits,
       uint32_t* extra,
       int const nchannels,
-      uint32_t const offsetForInput,
+      uint32_t const nChannelsBarrel,
       uint32_t const offsetForHashes  
     );
     
@@ -87,7 +87,7 @@ namespace ecal {
       //     eventDataForScratchGPU_,
       ConditionsProducts const& conditions, 
       ConfigurationParameters const& configParameters,
-      uint32_t const offsetForInput, 
+      uint32_t const nChannelsBarrel, 
       edm::TimeValue_t const event_time,
       cudaStream_t cudaStream
     );
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
index 53f32df6f9697..7f38a23ec9168 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
@@ -5,10 +5,7 @@
 
 EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) 
 : status_(values.size())
-{
-  
-  std::cout << " I am running EcalRechitChannelStatusGPU ::>> debug ongoing ... " << std::endl;
-  
+{  
   // fill in eb
   auto const& barrelValues = values.barrelItems();
   for (unsigned int i=0; i<barrelValues.size(); i++) {
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
index 1e44a6794d29f..e4e1a59565e0d 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
@@ -73,9 +73,9 @@ namespace ecal {
             // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
           };
           
-          int iym, ixm, il, ic, ii;
-          iym = 4;
-          ixm = 17;
+          int il, ic, ii;
+          const int iym = 4;
+          const int ixm = 17;
           int iX_ = iX + 1;
           int iY_ = iY + 1;
           il = iym - iY_;
diff --git a/RecoLocalCalo/EcalRecProducers/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
index 1136de24bd52e..abbae509cdab0 100644
--- a/RecoLocalCalo/EcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
@@ -1,6 +1,5 @@
 <use name="clhep"/>
 <use name="cuda"/>
-<use name="cuda-api-wrappers"/>
 <use name="CUDADataFormats/EcalRecHitSoA"/>
 <use name="CondFormats/EcalObjects"/>
 <use name="FWCore/Framework"/>
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
index 328f809bfd56a..4b10eee31c1ce 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
@@ -1,5 +1,4 @@
 <use name="cuda"/>
-<use name="cuda-api-wrappers"/>
 <use name="CUDADataFormats/EcalRecHitSoA"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
@@ -17,7 +16,6 @@
 <use name="CondFormats/DataRecord"/>
 <use name="RecoLocalCalo/EcalRecAlgos"/>
 <use name="FWCore/MessageLogger"/>
-<use name="FWCore/MessageService"/>
 <library file="*.cc" name="RecoLocalCaloEcalRecProducersPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
index c3a5851c1d2bd..fc6ae22ff57e0 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
@@ -25,7 +25,7 @@ class EcalCPURecHitProducer
 {
 public:
   explicit EcalCPURecHitProducer(edm::ParameterSet const& ps);
-  ~EcalCPURecHitProducer() override;
+  ~EcalCPURecHitProducer() override = default;
   static void fillDescriptions(edm::ConfigurationDescriptions&);
   
 private:
@@ -65,7 +65,6 @@ void EcalCPURecHitProducer::fillDescriptions(
   , containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")}
   {}
   
-  EcalCPURecHitProducer::~EcalCPURecHitProducer() {}
   
   void EcalCPURecHitProducer::acquire(
     edm::Event const& event,
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index bbe05aceda79b..7422838471ebc 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -336,16 +336,13 @@ void EcalRecHitProducerGPU::acquire(
   nee_ = eeUncalibRecHits.size;
   //   std::cout << " [EcalRecHitProducerGPU::acquire]  neb_:nee_ = " << neb_ << " : " << nee_ << std::endl;
   
-  int nchannelsEB = ebUncalibRecHits.size;
-  
-  int offsetForInput = nchannelsEB;  // first EB and then EE
+  int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE
   
   // conditions
   // - laser correction 
   // - IC
   // - adt2gev
   
-  
   //   
   setup.get<EcalADCToGeVConstantRcd>()   .get(ADCToGeVConstantHandle_);
   setup.get<EcalIntercalibConstantsRcd>().get(IntercalibConstantsHandle_);
@@ -396,7 +393,7 @@ void EcalRecHitProducerGPU::acquire(
     //     eventDataForScratchGPU_,
     conditions,  
     configParameters_,
-    offsetForInput,
+    nchannelsEB,
     event_time,
     ctx.stream()
   );
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
index 02f84eebf21b3..bc3d8a20ea5c1 100644
--- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py
@@ -18,6 +18,11 @@
 #process.load('DQMOffline.Configuration.DQMOffline_cff')
 process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff')
 
+
+
+
+
+
 # Other statements
 from Configuration.AlCa.GlobalTag import GlobalTag
 process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '')
@@ -301,6 +306,6 @@
 
 
 #
-process.DependencyGraph = cms.Service("DependencyGraph")
+#process.DependencyGraph = cms.Service("DependencyGraph")
 
 
diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py
index 7fdf723b67bdd..be1934d8e002c 100644
--- a/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py
+++ b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py
@@ -1,4 +1,3 @@
-
 import FWCore.ParameterSet.Config as cms
 
 from Configuration.StandardSequences.Eras import eras
@@ -54,7 +53,6 @@
 # ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalUncalibRecHitProducerGPU_cfi.py
 #
 process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi")
-#
 process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi")
 
 # for validation of gpu multifit products

From e2f7e8af219e22ec3a675ec68b402885cc2fa997 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Mon, 18 May 2020 09:50:55 +0200
Subject: [PATCH 16/30] further changes from PR comments

---
 .../EcalRecAlgos/src/EcalRecHitBuilderKernels.cu          | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index c4f3d22dd0a1d..792b422cefd6f 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -151,10 +151,12 @@ namespace ecal {
       //       
       //    NB: energy   "type_wrapper<reco::StorageScalarType, L>::type" most likely std::vector<float>
       //       
+
+      for (int ch = threadIdx.x + blockDim.x*blockIdx.x; ch < nchannels; ch += blockDim.x*gridDim.x) {  
+        
+//       int ch = threadIdx.x + blockDim.x*blockIdx.x;
       
-      int ch = threadIdx.x + blockDim.x*blockIdx.x;
-      
-      if (ch < nchannels) {
+//       if (ch < nchannels) {
       
         bool isEndcap = (ch >= nChannelsBarrel);
         

From 1cb2afa59cd0f491f69fa41c575e742714f2ab58 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Mon, 18 May 2020 10:42:02 +0200
Subject: [PATCH 17/30] after running clang tidy

---
 .../EcalRecHitSoA/interface/EcalRecHit_soa.h  |  36 +-
 ...eEcalMultifitResultsGpuValidationPlots.cpp | 385 ++++----
 .../bin/makeEcalRechitValidationPlots.cpp     | 546 ++++++------
 .../EcalRecAlgos/interface/DeclsForKernels.h  | 509 +++++------
 .../interface/EcalIntercalibConstantsGPU.h    |  25 +-
 .../interface/EcalLaserAPDPNRatiosGPU.h       |  47 +-
 .../interface/EcalLaserAPDPNRatiosRefGPU.h    |  25 +-
 .../interface/EcalLaserAlphasGPU.h            |  25 +-
 .../interface/EcalLinearCorrectionsGPU.h      |  46 +-
 .../interface/EcalRechitADCToGeVConstantGPU.h |  27 +-
 .../interface/EcalRechitChannelStatusGPU.h    |  27 +-
 .../src/AmplitudeComputationCommonKernels.cu  |   7 +-
 .../src/AmplitudeComputationKernels.cu        |   5 +-
 .../src/EcalIntercalibConstantsGPU.cc         |  54 +-
 .../src/EcalLaserAPDPNRatiosGPU.cc            | 135 ++-
 .../src/EcalLaserAPDPNRatiosRefGPU.cc         |  54 +-
 .../EcalRecAlgos/src/EcalLaserAlphasGPU.cc    |  54 +-
 .../src/EcalLinearCorrectionsGPU.cc           | 126 ++-
 .../src/EcalRecHitBuilderKernels.cu           | 831 ++++++++----------
 .../src/EcalRecHitBuilderKernels.h            | 158 ++--
 .../src/EcalRechitADCToGeVConstantGPU.cc      |  37 +-
 .../src/EcalRechitChannelStatusGPU.cc         |  51 +-
 .../EcalRecAlgos/src/KernelHelpers.cu         | 366 ++++----
 .../EcalRecAlgos/src/KernelHelpers.h          |  16 +-
 .../plugins/EcalCPURecHitProducer.cc          | 268 +++---
 .../plugins/EcalESProducersGPUDefs.cc         | 117 +--
 .../plugins/EcalRecHitConvertGPU2CPUFormat.cc | 185 ++--
 .../plugins/EcalRecHitProducerGPU.cc          | 437 ++++-----
 28 files changed, 2100 insertions(+), 2499 deletions(-)

diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
index 20d342d1b7073..8379dec5c81ad 100644
--- a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
+++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
@@ -11,29 +11,26 @@
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 
 namespace ecal {
-  
-  template<typename L = Tag::soa>
-  struct RecHit  : public Detail::Base<L> {
-    
+
+  template <typename L = Tag::soa>
+  struct RecHit : public Detail::Base<L> {
     RecHit() = default;
     RecHit(const RecHit&) = default;
     RecHit& operator=(const RecHit&) = default;
-    
+
     RecHit(RecHit&&) = default;
     RecHit& operator=(RecHit&&) = default;
-    
+
     typename type_wrapper<reco::StorageScalarType, L>::type energy;
     typename type_wrapper<reco::StorageScalarType, L>::type time;
-    typename type_wrapper<reco::StorageScalarType, L>::type chi2; // should we remove this, since already included in "extra" ?
-    typename type_wrapper<uint32_t, L>::type extra;    // packed uint32_t for timeError, chi2, energyError
-    typename type_wrapper<uint32_t, L>::type flagBits; // store rechit condition (see Flags enum) in a bit-wise way
-    
+    typename type_wrapper<reco::StorageScalarType, L>::type chi2;  // should we remove this, since already included in "extra" ?
+    typename type_wrapper<uint32_t, L>::type extra;     // packed uint32_t for timeError, chi2, energyError
+    typename type_wrapper<uint32_t, L>::type flagBits;  // store rechit condition (see Flags enum) in a bit-wise way
+
     typename type_wrapper<uint32_t, L>::type did;
-    
-    
-    template<typename U = L>
-    typename std::enable_if<std::is_same<U, Tag::soa>::value, void>::type 
-    resize(size_t size) {
+
+    template <typename U = L>
+    typename std::enable_if<std::is_same<U, Tag::soa>::value, void>::type resize(size_t size) {
       energy.resize(size);
       time.resize(size);
       chi2.resize(size);
@@ -42,11 +39,10 @@ namespace ecal {
       did.resize(size);
     }
   };
-  
+
   using SoARecHitCollection = RecHit<Tag::soa>;
-  
-}
 
-#endif 
-// RecoLocalCalo_EcalRecAlgos_interface_EcalRecHit_soa_h
+}  // namespace ecal
 
+#endif
+// RecoLocalCalo_EcalRecAlgos_interface_EcalRecHit_soa_h
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
index 04ba175eebb1e..1cf7c9d706317 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
@@ -17,57 +17,54 @@
 
 #include "TStyle.h"
 
-
-void setAxis(TH2D* histo) {
+void setAxis(TH2D *histo) {
   histo->GetXaxis()->SetTitle("cpu");
   histo->GetYaxis()->SetTitle("gpu");
 }
 
-
-void setAxisDelta(TH2D* histo) {
+void setAxisDelta(TH2D *histo) {
   histo->GetXaxis()->SetTitle("cpu");
   histo->GetYaxis()->SetTitle("#Delta gpu-cpu");
 }
 
 int main(int argc, char *argv[]) {
-  if (argc<3) {
+  if (argc < 3) {
     std::cout << "run with: ./validateGPU <path to input file> <output file>\n";
     exit(0);
   }
-  
+
   gStyle->SetOptStat("ourme");
-  
-  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB=nullptr;
-  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE=nullptr;
+
+  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEB = nullptr;
+  edm::Wrapper<ecal::UncalibratedRecHit<ecal::Tag::soa>> *wgpuEE = nullptr;
   edm::Wrapper<EBUncalibratedRecHitCollection> *wcpuEB = nullptr;
   edm::Wrapper<EEUncalibratedRecHitCollection> *wcpuEE = nullptr;
-  
+
   std::string fileName = argv[1];
   std::string outFileName = argv[2];
-  
+
   // output
   TFile rfout{outFileName.c_str(), "recreate"};
-  
+
   int nbins_count = 200;
   float last_count = 5000.;
   int nbins_count_delta = 201;
-  
+
   int nbins = 300;
   float last = 3000.;
-  
+
   //     int nbins_chi2 = 1000;
   //     float last_chi2 = 1000.;
   int nbins_chi2 = 1000;
   float last_chi2 = 200.;
-  
+
   int nbins_flags = 100;
   float last_flags = 100.;
   float delta_flags = 20;
-  
+
   int nbins_delta = 201;  // use an odd number to center around 0
   float delta = 0.2;
-  
-  
+
   // RecHits plots for EB and EE on both GPU and CPU
   auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins_count, 0, last_count);
   auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins_count, 0, last_count);
@@ -75,75 +72,110 @@ int main(int argc, char *argv[]) {
   auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins_count, 0, last_count);
   auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1);
   auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1);
-  
+
   auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last);
   auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last);
   auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last);
   auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last);
-  auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
-  auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
-  
+  auto hSOIAmplitudesEBGPUCPUratio =
+      new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+  auto hSOIAmplitudesEEGPUCPUratio =
+      new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
+
   auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2);
   auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2);
   auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2);
   auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2);
   auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
   auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
-  
+
   auto hFlagsEBGPU = new TH1D("hFlagsEBGPU", "hFlagsEBGPU", nbins_flags, 0, last_flags);
   auto hFlagsEEGPU = new TH1D("hFlagsEEGPU", "hFlagsEEGPU", nbins_flags, 0, last_flags);
   auto hFlagsEBCPU = new TH1D("hFlagsEBCPU", "hFlagsEBCPU", nbins_flags, 0, last_flags);
   auto hFlagsEECPU = new TH1D("hFlagsEECPU", "hFlagsEECPU", nbins_flags, 0, last_flags);
   auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
   auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1);
-  
-  auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);                       setAxis(hSOIAmplitudesEBGPUvsCPU  ) ;
-  auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);                       setAxis(hSOIAmplitudesEEGPUvsCPU  ) ;
-  auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);     setAxisDelta(hSOIAmplitudesEBdeltavsCPU) ;
-  auto hSOIAmplitudesEEdeltavsCPU = new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);     setAxisDelta(hSOIAmplitudesEEdeltavsCPU) ;
-  
-  auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);                      setAxis(hChi2EBGPUvsCPU  ) ;
-  auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);                      setAxis(hChi2EEGPUvsCPU  ) ;
-  auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);              setAxisDelta(hChi2EBdeltavsCPU) ;
-  auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);              setAxisDelta(hChi2EEdeltavsCPU) ;
-  
-  auto hFlagsEBGPUvsCPU = new TH2D("hFlagsEBGPUvsCPU", "hFlagsEBGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);                      setAxis(hFlagsEBGPUvsCPU  ) ;
-  auto hFlagsEEGPUvsCPU = new TH2D("hFlagsEEGPUvsCPU", "hFlagsEEGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);                      setAxis(hFlagsEEGPUvsCPU  ) ;
-  auto hFlagsEBdeltavsCPU = new TH2D("hFlagsEBdeltavsCPU", "hFlagsEBdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);    setAxisDelta(hFlagsEBdeltavsCPU) ;
-  auto hFlagsEEdeltavsCPU = new TH2D("hFlagsEEdeltavsCPU", "hFlagsEEdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);    setAxisDelta(hFlagsEEdeltavsCPU) ;
-  
-  auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last_count, 0, last_count, last_count, 0, last_count);       setAxis(hRechitsEBGPUvsCPU  ) ;
-  auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last_count, 0, last_count, last_count, 0, last_count);       setAxis(hRechitsEEGPUvsCPU  ) ;
-  auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);   setAxisDelta(hRechitsEBdeltavsCPU) ;
-  auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);   setAxisDelta(hRechitsEEdeltavsCPU) ;
-  
-  
+
+  auto hSOIAmplitudesEBGPUvsCPU =
+      new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  setAxis(hSOIAmplitudesEBGPUvsCPU);
+  auto hSOIAmplitudesEEGPUvsCPU =
+      new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last);
+  setAxis(hSOIAmplitudesEEGPUvsCPU);
+  auto hSOIAmplitudesEBdeltavsCPU =
+      new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  setAxisDelta(hSOIAmplitudesEBdeltavsCPU);
+  auto hSOIAmplitudesEEdeltavsCPU =
+      new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  setAxisDelta(hSOIAmplitudesEEdeltavsCPU);
+
+  auto hChi2EBGPUvsCPU =
+      new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+  setAxis(hChi2EBGPUvsCPU);
+  auto hChi2EEGPUvsCPU =
+      new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2);
+  setAxis(hChi2EEGPUvsCPU);
+  auto hChi2EBdeltavsCPU =
+      new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  setAxisDelta(hChi2EBdeltavsCPU);
+  auto hChi2EEdeltavsCPU =
+      new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  setAxisDelta(hChi2EEdeltavsCPU);
+
+  auto hFlagsEBGPUvsCPU =
+      new TH2D("hFlagsEBGPUvsCPU", "hFlagsEBGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);
+  setAxis(hFlagsEBGPUvsCPU);
+  auto hFlagsEEGPUvsCPU =
+      new TH2D("hFlagsEEGPUvsCPU", "hFlagsEEGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags);
+  setAxis(hFlagsEEGPUvsCPU);
+  auto hFlagsEBdeltavsCPU = new TH2D(
+      "hFlagsEBdeltavsCPU", "hFlagsEBdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);
+  setAxisDelta(hFlagsEBdeltavsCPU);
+  auto hFlagsEEdeltavsCPU = new TH2D(
+      "hFlagsEEdeltavsCPU", "hFlagsEEdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags);
+  setAxisDelta(hFlagsEEdeltavsCPU);
+
+  auto hRechitsEBGPUvsCPU = new TH2D(  // nbins_count bins per axis; last_count is only the axis upper edge
+      "RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", nbins_count, 0, last_count, nbins_count, 0, last_count);
+  setAxis(hRechitsEBGPUvsCPU);
+  auto hRechitsEEGPUvsCPU = new TH2D(  // same binning as the 1D rechit-count histograms (nbins_count over [0, last_count])
+      "RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", nbins_count, 0, last_count, nbins_count, 0, last_count);
+  setAxis(hRechitsEEGPUvsCPU);
+  auto hRechitsEBdeltavsCPU = new TH2D(
+      "RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);
+  setAxisDelta(hRechitsEBdeltavsCPU);
+  auto hRechitsEEdeltavsCPU = new TH2D(
+      "RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta);
+  setAxisDelta(hRechitsEEdeltavsCPU);
+
   // input
   std::cout << "validating file " << fileName << std::endl;
   TFile rf{fileName.c_str()};
-  TTree *rt = (TTree*)rf.Get("Events");
-  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB);
-  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE);
+  TTree *rt = (TTree *)rf.Get("Events");
+  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.",
+                       &wgpuEB);
+  rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.",
+                       &wgpuEE);
   rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB);
   rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE);
-  
+
   constexpr float eps_diff = 1e-3;
-  
+
   // accumulate
   auto const nentries = rt->GetEntries();
   std::cout << "#events to validate over: " << nentries << std::endl;
-  for (int ie=0; ie<nentries; ++ie) {
+  for (int ie = 0; ie < nentries; ++ie) {
     rt->GetEntry(ie);
-    
-    const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" };
+
+    const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"};
     auto cpu_eb_size = wcpuEB->bareProduct().size();
     auto cpu_ee_size = wcpuEE->bareProduct().size();
     auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size();
     auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size();
-    
-    float eb_ratio = (float) gpu_eb_size/cpu_eb_size;
-    float ee_ratio = (float) gpu_ee_size/cpu_ee_size;
-    
+
+    float eb_ratio = (float)gpu_eb_size / cpu_eb_size;
+    float ee_ratio = (float)gpu_ee_size / cpu_ee_size;
+
     // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
     hRechitsEBGPU->Fill(gpu_eb_size);
     hRechitsEBCPU->Fill(cpu_eb_size);
@@ -153,151 +185,158 @@ int main(int argc, char *argv[]) {
     hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size);
     hRechitsEBGPUCPUratio->Fill(eb_ratio);
     hRechitsEEGPUCPUratio->Fill(ee_ratio);
-    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size-cpu_eb_size);
-    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size-cpu_ee_size);
-    
-    
+    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size - cpu_eb_size);
+    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size - cpu_ee_size);
+
     if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
       std::cerr << ie << ordinal[ie % 10] << " entry:\n"
-      << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n"
-      << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl;
+                << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size
+                << " (gpu)\n"
+                << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size
+                << " (gpu)" << std::endl;
       continue;
     }
-    
+
     assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size());
     assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size());
     auto const neb = wcpuEB->bareProduct().size();
     auto const nee = wcpuEE->bareProduct().size();
-    
-    
-    for (uint32_t i=0; i<neb; ++i) {
+
+    for (uint32_t i = 0; i < neb; ++i) {
       auto const did_gpu = wgpuEB->bareProduct().did[i];
       auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i];
       auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu});
       if (cpu_iter == wcpuEB->bareProduct().end()) {
         std::cerr << ie << ordinal[ie % 10] << " entry\n"
-        << "  Did not find a DetId " << did_gpu
-        << " in a CPU collection\n";
+                  << "  Did not find a DetId " << did_gpu << " in a CPU collection\n";
         continue;
       }
       auto const soi_amp_cpu = cpu_iter->amplitude();
       auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
       auto const chi2_cpu = cpu_iter->chi2();
-      
+
       auto const flags_gpu = wgpuEB->bareProduct().flags[i];
       auto const flags_cpu = cpu_iter->flags();
-      
+
       hSOIAmplitudesEBGPU->Fill(soi_amp_gpu);
       hSOIAmplitudesEBCPU->Fill(soi_amp_cpu);
       hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-      hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
-      if (soi_amp_cpu>0) hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
-      
+      hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
+      if (soi_amp_cpu > 0)
+        hSOIAmplitudesEBGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu);
+
       hChi2EBGPU->Fill(chi2_gpu);
       hChi2EBCPU->Fill(chi2_cpu);
       hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-      hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
-      if (chi2_cpu>0) hChi2EBGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu);
-      
-      if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) {
+      hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
+      if (chi2_cpu > 0)
+        hChi2EBGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu);
+
+      if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) {
         std::cout << " ---- EB  " << std::endl;
         std::cout << " eventid = " << ie << " xtal = " << i << std::endl;
-        std::cout << " chi2_gpu    = " << chi2_gpu    << " chi2_cpu =    " << chi2_cpu << std::endl;
+        std::cout << " chi2_gpu    = " << chi2_gpu << " chi2_cpu =    " << chi2_cpu << std::endl;
         std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl;
-        std::cout << " flags_gpu   = " << flags_gpu   << " flags_cpu =   " << flags_cpu << std::endl;
-      } 
-      
+        std::cout << " flags_gpu   = " << flags_gpu << " flags_cpu =   " << flags_cpu << std::endl;
+      }
+
       hFlagsEBGPU->Fill(flags_gpu);
       hFlagsEBCPU->Fill(flags_cpu);
       hFlagsEBGPUvsCPU->Fill(flags_cpu, flags_gpu);
-      hFlagsEBdeltavsCPU->Fill(flags_cpu, flags_gpu-flags_cpu);
-      if (flags_cpu>0) hFlagsEBGPUCPUratio->Fill( (float) flags_gpu/flags_cpu);
-      
-      if (flags_cpu!=flags_gpu) {
+      hFlagsEBdeltavsCPU->Fill(flags_cpu, flags_gpu - flags_cpu);
+      if (flags_cpu > 0)
+        hFlagsEBGPUCPUratio->Fill((float)flags_gpu / flags_cpu);
+
+      if (flags_cpu != flags_gpu) {
         std::cout << "    >>  No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu;
         std::cout << std::endl;
       }
-      
-      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
-        (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)
-        or (flags_cpu!=flags_gpu) )
-      {
+
+      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
+          std::isnan(chi2_gpu) or (flags_cpu != flags_gpu)) {
         printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-               ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
+               ie,
+               i,
+               soi_amp_gpu,
+               soi_amp_cpu,
+               chi2_gpu,
+               chi2_cpu);
         if (std::isnan(chi2_gpu))
           printf("*** nan ***\n");
       }
     }
-    
-    for (uint32_t i=0; i<nee; ++i) {
+
+    for (uint32_t i = 0; i < nee; ++i) {
       auto const did_gpu = wgpuEE->bareProduct().did[i];
       auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i];
       auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu});
       if (cpu_iter == wcpuEE->bareProduct().end()) {
         std::cerr << ie << ordinal[ie % 10] << " entry\n"
-        << "  did not find a DetId " << did_gpu
-        << " in a CPU collection\n";
+                  << "  did not find a DetId " << did_gpu << " in a CPU collection\n";
         continue;
       }
       auto const soi_amp_cpu = cpu_iter->amplitude();
       auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
       auto const chi2_cpu = cpu_iter->chi2();
-      
+
       auto const flags_gpu = wgpuEE->bareProduct().flags[i];
       auto const flags_cpu = cpu_iter->flags();
-      
-      
+
       hSOIAmplitudesEEGPU->Fill(soi_amp_gpu);
       hSOIAmplitudesEECPU->Fill(soi_amp_cpu);
       hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu);
-      hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu);
-      if (soi_amp_cpu>0) hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu);
-      
+      hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu);
+      if (soi_amp_cpu > 0)
+        hSOIAmplitudesEEGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu);
+
       hChi2EEGPU->Fill(chi2_gpu);
       hChi2EECPU->Fill(chi2_cpu);
       hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-      hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
-      if (chi2_cpu>0) hChi2EEGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu);
-      
-      if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) {
+      hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
+      if (chi2_cpu > 0)
+        hChi2EEGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu);
+
+      if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) {
         std::cout << " ---- EE  " << std::endl;
         std::cout << " eventid = " << ie << " xtal = " << i << std::endl;
-        std::cout << " chi2_gpu    = " << chi2_gpu    << " chi2_cpu =    " << chi2_cpu << std::endl;
+        std::cout << " chi2_gpu    = " << chi2_gpu << " chi2_cpu =    " << chi2_cpu << std::endl;
         std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl;
-        std::cout << " flags_gpu   = " << flags_gpu   << " flags_cpu =   " << flags_cpu << std::endl;
-      } 
-      
+        std::cout << " flags_gpu   = " << flags_gpu << " flags_cpu =   " << flags_cpu << std::endl;
+      }
+
       hFlagsEEGPU->Fill(flags_gpu);
       hFlagsEECPU->Fill(flags_cpu);
       hFlagsEEGPUvsCPU->Fill(flags_cpu, flags_gpu);
-      hFlagsEEdeltavsCPU->Fill(flags_cpu, flags_gpu-flags_cpu);
-      if (flags_cpu>0) hFlagsEEGPUCPUratio->Fill( (float) flags_gpu/flags_cpu);
-      
-      if (flags_cpu!=flags_gpu) {
+      hFlagsEEdeltavsCPU->Fill(flags_cpu, flags_gpu - flags_cpu);
+      if (flags_cpu > 0)
+        hFlagsEEGPUCPUratio->Fill((float)flags_gpu / flags_cpu);
+
+      if (flags_cpu != flags_gpu) {
         std::cout << "    >>  No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu;
         std::cout << std::endl;
       }
-      
-      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or
-        (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)
-        or (flags_cpu!=flags_gpu) )
-      {
+
+      if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or
+          std::isnan(chi2_gpu) or (flags_cpu != flags_gpu)) {
         printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n",
-               ie, static_cast<int>(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu);
+               ie,
+               static_cast<int>(neb + i),
+               soi_amp_gpu,
+               soi_amp_cpu,
+               chi2_gpu,
+               chi2_cpu);
         if (std::isnan(chi2_gpu))
           printf("*** nan ***\n");
       }
     }
   }
-  
+
   {
-    
-    
     //       TCanvas c("plots", "plots", 4200, 6200);
     TCanvas c("plots", "plots", 1750, 860);
     //       c.Divide(2, 3);
     c.Divide(3, 2);
-    
+
     //       c.cd(1);
     c.cd(1);
     {
@@ -309,11 +348,11 @@ int main(int argc, char *argv[]) {
       hSOIAmplitudesEBGPU->SetLineWidth(1.);
       hSOIAmplitudesEBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hSOIAmplitudesEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hSOIAmplitudesEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     //       c.cd(2);
     c.cd(4);
@@ -326,11 +365,11 @@ int main(int argc, char *argv[]) {
       hSOIAmplitudesEEGPU->SetLineWidth(1.);
       hSOIAmplitudesEEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hSOIAmplitudesEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hSOIAmplitudesEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     //       c.cd(3);
     c.cd(2);
@@ -348,12 +387,12 @@ int main(int argc, char *argv[]) {
     c.cd(6);
     //       hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
     hSOIAmplitudesEEGPUCPUratio->Draw("");
-    
+
     c.SaveAs("ecal-amplitudes.root");
     c.SaveAs("ecal-amplitudes.png");
-    
+
     // chi2
-    
+
     //       c.cd(1);
     c.cd(1);
     {
@@ -365,11 +404,11 @@ int main(int argc, char *argv[]) {
       hChi2EBGPU->SetLineWidth(1.);
       hChi2EBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     //       c.cd(2);
     c.cd(4);
@@ -382,11 +421,11 @@ int main(int argc, char *argv[]) {
       hChi2EEGPU->SetLineWidth(1.);
       hChi2EEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     //       c.cd(3);
     c.cd(2);
@@ -404,14 +443,12 @@ int main(int argc, char *argv[]) {
     c.cd(6);
     //       hChi2EEdeltavsCPU->Draw("COLZ");
     hChi2EEGPUCPUratio->Draw("");
-    
+
     c.SaveAs("ecal-chi2.root");
     c.SaveAs("ecal-chi2.png");
-    
-    
-    
+
     // flags
-    
+
     //       c.cd(1);
     c.cd(1);
     {
@@ -423,11 +460,11 @@ int main(int argc, char *argv[]) {
       hFlagsEBGPU->SetLineWidth(1.);
       hFlagsEBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hFlagsEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     //       c.cd(2);
     c.cd(4);
@@ -440,11 +477,11 @@ int main(int argc, char *argv[]) {
       hFlagsEEGPU->SetLineWidth(1.);
       hFlagsEEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hFlagsEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     //       c.cd(3);
     c.cd(2);
@@ -458,33 +495,19 @@ int main(int argc, char *argv[]) {
     c.cd(3);
     //       hFlagsEBdeltavsCPU->Draw("COLZ");
     hFlagsEBGPUCPUratio->Draw("");
-    
+
     //       c.cd(6);
     c.cd(6);
     //       hFlagsEEdeltavsCPU->Draw("COLZ");
     hFlagsEEGPUCPUratio->Draw("");
-    
-    
+
     c.SaveAs("ecal-flags.root");
     c.SaveAs("ecal-flags.png");
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
-    
+
     TCanvas cRechits("Rechits", "Rechits", 1750, 860);
     cRechits.Divide(3, 2);
-    
-    // Plotting the sizes of GPU vs CPU for each event of EB 
+
+    // Plotting the sizes of GPU vs CPU for each event of EB
     cRechits.cd(1);
     {
       gPad->SetLogy();
@@ -495,12 +518,12 @@ int main(int argc, char *argv[]) {
       hRechitsEBGPU->SetLineWidth(2);
       hRechitsEBGPU->Draw("sames");
       cRechits.Update();
-      auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hRechitsEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    } 
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
     cRechits.cd(4);
     {
       gPad->SetLogy();
@@ -511,41 +534,35 @@ int main(int argc, char *argv[]) {
       hRechitsEEGPU->SetLineWidth(2);
       hRechitsEEGPU->Draw("sames");
       cRechits.Update();
-      auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hRechitsEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    }
-    cRechits.cd(2); {
-      hRechitsEBGPUvsCPU->Draw("COLZ");
-    }
-    cRechits.cd(5); {
-      hRechitsEEGPUvsCPU->Draw("COLZ");
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cRechits.cd(3); {
+    cRechits.cd(2);
+    { hRechitsEBGPUvsCPU->Draw("COLZ"); }
+    cRechits.cd(5);
+    { hRechitsEEGPUvsCPU->Draw("COLZ"); }
+    cRechits.cd(3);
+    {
       gPad->SetLogy();
       //hRechitsEBdeltavsCPU->Draw("COLZ");
       hRechitsEBGPUCPUratio->Draw("");
     }
-    cRechits.cd(6); {
+    cRechits.cd(6);
+    {
       gPad->SetLogy();
       //hRechitsEEdeltavsCPU->Draw("COLZ");
       hRechitsEEGPUCPUratio->Draw("");
     }
     cRechits.SaveAs("ecal-rechits.root");
     cRechits.SaveAs("ecal-rechits.png");
-    
-    
-    
-    
-    
-    
   }
-  
+
   rf.Close();
   rfout.Write();
   rfout.Close();
-  
+
   return 0;
 }
diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
index 4e7718791b603..18f6bed0648ad 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp
@@ -19,91 +19,113 @@
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
 
 int main(int argc, char *argv[]) {
-  if (argc<3) {
+  if (argc < 3) {
     std::cout << "run with: ./makeEcalRechitValidationPlots <path to input file> <output file>\n";
     exit(0);
   }
   // Set the GPU and CPU pointers for both EB and EE
-  edm::Wrapper<ecal::RecHit<ecal::Tag::soa>> *wgpuEB=nullptr;
-  edm::Wrapper<ecal::RecHit<ecal::Tag::soa>> *wgpuEE=nullptr;
+  edm::Wrapper<ecal::RecHit<ecal::Tag::soa>> *wgpuEB = nullptr;
+  edm::Wrapper<ecal::RecHit<ecal::Tag::soa>> *wgpuEE = nullptr;
   edm::Wrapper<EBRecHitCollection> *wcpuEB = nullptr;
   edm::Wrapper<EERecHitCollection> *wcpuEE = nullptr;
-  
-  std::string fileName = argv[1]; // The input file containing the data to be validated (i.e. result.root)
-  std::string outFileName = argv[2]; //The output file in which the validation results will be saved (i.e. output.root)
-  
+
+  std::string fileName = argv[1];     // The input file containing the data to be validated (i.e. result.root)
+  std::string outFileName = argv[2];  //The output file in which the validation results will be saved (i.e. output.root)
+
   //output
   TFile rfout{outFileName.c_str(), "recreate"};
-  
+
   int nbins = 200;
   int last = 5000.;
-  
+
   int nbins_energy = 300;
   float last_energy = 2.;
-  
+
   int nbins_chi2 = 200;
   float last_chi2 = 100.;
-  
+
   int nbins_flag = 40;
   //   int nbins_flag = 1000;
   int last_flag = 1500;
   //   int nbins_flag = 40;
   //   int last_flag = 10000;
-  
+
   int nbins_extra = 200;
   int last_extra = 200;
-  
+
   int nbins_delta = 201;  // use an odd number to center around 0
   float delta = 0.2;
-  
+
   // RecHits plots for EB and EE on both GPU and CPU
   auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits. No Filter GPU", nbins, 0, last);
   auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits. No Filter GPU", nbins, 0, last);
   auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits. No Filter GPU", nbins, 0, last);
   auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits. No Filter GPU", nbins, 0, last);
-  auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
-  auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
-  auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
-  auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
-  auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
-  auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
-  
+  auto hRechitsEBGPUvsCPU =
+      new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
+  auto hRechitsEEGPUvsCPU =
+      new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last);
+  auto hRechitsEBGPUCPUratio =
+      new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
+  auto hRechitsEEGPUCPUratio =
+      new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05);
+  auto hRechitsEBdeltavsCPU =
+      new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hRechitsEEdeltavsCPU =
+      new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta);
+
   // RecHits plots for EB and EE on both GPU and CPU
   auto hSelectedRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last);
   auto hSelectedRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last);
   auto hSelectedRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last);
   auto hSelectedRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last);
-  auto hSelectedRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
-  auto hSelectedRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
-  auto hSelectedRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
-  auto hSelectedRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
-  auto hSelectedRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  auto hSelectedRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  
+  auto hSelectedRechitsEBGPUvsCPU =
+      new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hSelectedRechitsEEGPUvsCPU =
+      new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hSelectedRechitsEBGPUCPUratio =
+      new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hSelectedRechitsEEGPUCPUratio =
+      new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hSelectedRechitsEBdeltavsCPU =
+      new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hSelectedRechitsEEdeltavsCPU =
+      new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+
   // RecHits plots for EB and EE on both GPU and CPU
   auto hPositiveRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last);
   auto hPositiveRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last);
   auto hPositiveRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last);
   auto hPositiveRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last);
-  auto hPositiveRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
-  auto hPositiveRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
-  auto hPositiveRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
-  auto hPositiveRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
-  auto hPositiveRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  auto hPositiveRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  
+  auto hPositiveRechitsEBGPUvsCPU =
+      new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hPositiveRechitsEEGPUvsCPU =
+      new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last);
+  auto hPositiveRechitsEBGPUCPUratio =
+      new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hPositiveRechitsEEGPUCPUratio =
+      new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05);
+  auto hPositiveRechitsEBdeltavsCPU =
+      new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hPositiveRechitsEEdeltavsCPU =
+      new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+
   // Energies plots for EB and EE on both GPU and CPU
   auto hEnergiesEBGPU = new TH1D("EnergiesEBGPU", "EnergiesEBGPU; Energy [GeV]", nbins_energy, 0, last_energy);
   auto hEnergiesEEGPU = new TH1D("EnergiesEEGPU", "EnergiesEEGPU; Energy [GeV]", nbins_energy, 0, last_energy);
   auto hEnergiesEBCPU = new TH1D("EnergiesEBCPU", "EnergiesEBCPU; Energy [GeV]", nbins_energy, 0, last_energy);
   auto hEnergiesEECPU = new TH1D("EnergiesEECPU", "EnergiesEECPU; Energy [GeV]", nbins_energy, 0, last_energy);
-  auto hEnergiesEBGPUvsCPU = new TH2D("EnergiesEBGPUvsCPU", "EnergiesEBGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
-  auto hEnergiesEEGPUvsCPU = new TH2D("EnergiesEEGPUvsCPU", "EnergiesEEGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
+  auto hEnergiesEBGPUvsCPU = new TH2D(
+      "EnergiesEBGPUvsCPU", "EnergiesEBGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
+  auto hEnergiesEEGPUvsCPU = new TH2D(
+      "EnergiesEEGPUvsCPU", "EnergiesEEGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy);
   auto hEnergiesEBGPUCPUratio = new TH1D("EnergiesEBGPU/CPUratio", "EnergiesEBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
   auto hEnergiesEEGPUCPUratio = new TH1D("EnergiesEEGPU/CPUratio", "EnergiesEEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
-  auto hEnergiesEBdeltavsCPU = new TH2D("EnergiesEBdeltavsCPU", "EnergiesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  auto hEnergiesEEdeltavsCPU = new TH2D("EnergiesEEdeltavsCPU", "EnergiesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
-  
+  auto hEnergiesEBdeltavsCPU =
+      new TH2D("EnergiesEBdeltavsCPU", "EnergiesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+  auto hEnergiesEEdeltavsCPU =
+      new TH2D("EnergiesEEdeltavsCPU", "EnergiesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta);
+
   // Chi2 plots for EB and EE on both GPU and CPU
   auto hChi2EBGPU = new TH1D("Chi2EBGPU", "Chi2EBGPU; Ch^{2}", nbins_chi2, 0, last_chi2);
   auto hChi2EEGPU = new TH1D("Chi2EEGPU", "Chi2EEGPU; Ch^{2}", nbins_chi2, 0, last_chi2);
@@ -113,68 +135,78 @@ int main(int argc, char *argv[]) {
   auto hChi2EEGPUvsCPU = new TH2D("Chi2EEGPUvsCPU", "Chi2EEGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100);
   auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
   auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2);
-  auto hChi2EBdeltavsCPU = new TH2D("Chi2EBdeltavsCPU", "Chi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
-  auto hChi2EEdeltavsCPU = new TH2D("Chi2EEdeltavsCPU", "Chi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
-  
+  auto hChi2EBdeltavsCPU =
+      new TH2D("Chi2EBdeltavsCPU", "Chi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+  auto hChi2EEdeltavsCPU =
+      new TH2D("Chi2EEdeltavsCPU", "Chi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta);
+
   // Flags plots for EB and EE on both GPU and CPU
   auto hFlagsEBGPU = new TH1D("FlagsEBGPU", "FlagsEBGPU; Flags", nbins_flag, -10, last_flag);
   auto hFlagsEBCPU = new TH1D("FlagsEBCPU", "FlagsEBCPU; Flags", nbins_flag, -10, last_flag);
   auto hFlagsEEGPU = new TH1D("FlagsEEGPU", "FlagsEEGPU; Flags", nbins_flag, -10, last_flag);
   auto hFlagsEECPU = new TH1D("FlagsEECPU", "FlagsEECPU; Flags", nbins_flag, -10, last_flag);
-  auto hFlagsEBGPUvsCPU = new TH2D("FlagsEBGPUvsCPU", "FlagsEBGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
-  auto hFlagsEEGPUvsCPU = new TH2D("FlagsEEGPUvsCPU", "FlagsEEGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
+  auto hFlagsEBGPUvsCPU =
+      new TH2D("FlagsEBGPUvsCPU", "FlagsEBGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
+  auto hFlagsEEGPUvsCPU =
+      new TH2D("FlagsEEGPUvsCPU", "FlagsEEGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag);
   auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 50, -5, 10);
   auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 50, -5, 10);
-  auto hFlagsEBdeltavsCPU = new TH2D("FlagsEBdeltavsCPU", "FlagsEBdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
-  auto hFlagsEEdeltavsCPU = new TH2D("FlagsEEdeltavsCPU", "FlagsEEdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
-  
+  auto hFlagsEBdeltavsCPU =
+      new TH2D("FlagsEBdeltavsCPU", "FlagsEBdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
+  auto hFlagsEEdeltavsCPU =
+      new TH2D("FlagsEEdeltavsCPU", "FlagsEEdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta);
+
   // Extras plots for EB and EE on both GPU and CPU
   auto hExtrasEBGPU = new TH1D("ExtrasEBGPU", "ExtrasEBGPU; No. of Extras", nbins_extra, 0, last_extra);
   auto hExtrasEBCPU = new TH1D("ExtrasEBCPU", "ExtrasEBCPU; No. of Extras", nbins_extra, 0, last_extra);
   auto hExtrasEEGPU = new TH1D("ExtrasEEGPU", "ExtrasEEGPU; No. of Extras", nbins_extra, 0, last_extra);
   auto hExtrasEECPU = new TH1D("ExtrasEECPU", "ExtrasEECPU; No. of Extras", nbins_extra, 0, last_extra);
-  auto hExtrasEBGPUvsCPU = new TH2D("ExtrasEBGPUvsCPU", "ExtrasEBGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra,nbins_extra, 0, last_extra);
-  auto hExtrasEEGPUvsCPU = new TH2D("ExtrasEEGPUvsCPU", "ExtrasEEGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra);
+  auto hExtrasEBGPUvsCPU = new TH2D(
+      "ExtrasEBGPUvsCPU", "ExtrasEBGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra);
+  auto hExtrasEEGPUvsCPU = new TH2D(
+      "ExtrasEEGPUvsCPU", "ExtrasEEGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra);
   auto hExtrasEBGPUCPUratio = new TH1D("ExtrasEBGPU/CPUratio", "ExtrasEBGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0);
   auto hExtrasEEGPUCPUratio = new TH1D("ExtrasEEGPU/CPUratio", "ExtrasEEGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0);
-  auto hExtrasEBdeltavsCPU = new TH2D("ExtrasEBdeltavsCPU", "ExtrasEBdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
-  auto hExtrasEEdeltavsCPU = new TH2D("ExtrasEEdeltavsCPU", "ExtrasEEdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
-  
+  auto hExtrasEBdeltavsCPU =
+      new TH2D("ExtrasEBdeltavsCPU", "ExtrasEBdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
+  auto hExtrasEEdeltavsCPU =
+      new TH2D("ExtrasEEdeltavsCPU", "ExtrasEEdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta);
+
   // input file setup for tree
   std::cout << "validating file " << fileName << std::endl;
   TFile rf{fileName.c_str()};
-  TTree *rt = (TTree*)rf.Get("Events");
-  
+  TTree *rt = (TTree *)rf.Get("Events");
+
   // Allocating the appropriate data to their respective pointers
   rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEB_RECO.", &wgpuEB);
   rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEE_RECO.", &wgpuEE);
   rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEB_RECO.", &wcpuEB);
   rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEE_RECO.", &wcpuEE);
-  
+
   constexpr float eps_diff = 1e-3;
-  
+
   // accumulate sizes for events and sizes of each event on both GPU and CPU
   //   auto const nentries = rt->GetEntries();
   int nentries = rt->GetEntries();
-  
-  //---- AM: tests 
+
+  //---- AM: tests
   if (nentries > 1000) {
     nentries = 1000;
   }
   //   nentries = 1;
-  
+
   std::cout << "#events to validate over: " << nentries << std::endl;
-  for (int ie=0; ie<nentries; ++ie) {
+  for (int ie = 0; ie < nentries; ++ie) {
     rt->GetEntry(ie);
-    
+
     //     const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" };
     auto cpu_eb_size = wcpuEB->bareProduct().size();
     auto cpu_ee_size = wcpuEE->bareProduct().size();
     auto gpu_eb_size = wgpuEB->bareProduct().energy.size();
     auto gpu_ee_size = wgpuEE->bareProduct().energy.size();
-    float eb_ratio = (float) gpu_eb_size/cpu_eb_size;
-    float ee_ratio = (float) gpu_ee_size/cpu_ee_size;
-    
+    float eb_ratio = (float)gpu_eb_size / cpu_eb_size;
+    float ee_ratio = (float)gpu_ee_size / cpu_ee_size;
+
     // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
     hRechitsEBGPU->Fill(gpu_eb_size);
     hRechitsEBCPU->Fill(cpu_eb_size);
@@ -184,9 +216,9 @@ int main(int argc, char *argv[]) {
     hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size);
     hRechitsEBGPUCPUratio->Fill(eb_ratio);
     hRechitsEEGPUCPUratio->Fill(ee_ratio);
-    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size-cpu_eb_size);
-    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size-cpu_ee_size);
-    
+    hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size - cpu_eb_size);
+    hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size - cpu_ee_size);
+
     /*    
      *    // condition that sizes on GPU and CPU should be the same for EB or EE
      *       if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) {
@@ -201,32 +233,32 @@ int main(int argc, char *argv[]) {
   auto const neb = wcpuEB->bareProduct().size(); //like cpu_eb_size but set to constant
   auto const nee = wcpuEE->bareProduct().size(); //like cpu_ee_size but set to constant
   */
-    
+
     uint selected_gpu_eb_size = 0;
     uint selected_gpu_ee_size = 0;
-    
+
     uint positive_gpu_eb_size = 0;
     uint positive_gpu_ee_size = 0;
-    
+
     // EB:
-    for (uint32_t i=0; i<gpu_eb_size; ++i) {
-      auto const did_gpu = wgpuEB->bareProduct().did[i]; // set the did for the current RecHit
+    for (uint32_t i = 0; i < gpu_eb_size; ++i) {
+      auto const did_gpu = wgpuEB->bareProduct().did[i];  // set the did for the current RecHit
       // Set the variables for GPU
-      auto const enr_gpu = wgpuEB->bareProduct().energy[i]; 
+      auto const enr_gpu = wgpuEB->bareProduct().energy[i];
       auto const chi2_gpu = wgpuEB->bareProduct().chi2[i];
-      auto const flag_gpu = wgpuEB->bareProduct().flagBits[i]; 
+      auto const flag_gpu = wgpuEB->bareProduct().flagBits[i];
       auto const extra_gpu = wgpuEB->bareProduct().extra[i];
-      
+
       // you have "-1" if the crystal is not selected
-      if ( enr_gpu>=0 ) {
+      if (enr_gpu >= 0) {
         selected_gpu_eb_size++;
-        
-        if ( enr_gpu>0 ) {
+
+        if (enr_gpu > 0) {
           positive_gpu_eb_size++;
         }
-        
+
         // find the Rechit on CPU reflecting the same did
-        auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); 
+        auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu});
         if (cpu_iter == wcpuEB->bareProduct().end()) {
           //           std::cerr << ie << ordinal[ie % 10] << " entry\n"
           //                   << "  Did not find a DetId " << did_gpu_eb
@@ -237,42 +269,42 @@ int main(int argc, char *argv[]) {
         // Set the variables for CPU
         auto const enr_cpu = cpu_iter->energy();
         auto const chi2_cpu = cpu_iter->chi2();
-//         auto const flag_cpu = cpu_iter->flagBits();
+        //         auto const flag_cpu = cpu_iter->flagBits();
         auto const flag_cpu = 1;
-//         auto const extra_cpu = cpu_iter->extra();
+        //         auto const extra_cpu = cpu_iter->extra();
         auto const extra_cpu = 1;
         //       auto const flag_cpu = cpu_iter->flagBits() ? cpu_iter->flagBits():-1;
         //       auto const extra_cpu = cpu_iter->extra() ? cpu_iter->extra():-1;
-        
+
         // AM: TEST
         //       if (extra_cpu != 10) continue;
-        
+
         // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta
         hEnergiesEBGPU->Fill(enr_gpu);
         hEnergiesEBCPU->Fill(enr_cpu);
         //       std::cout<<"EB CPU Energy:\t"<<enr_cpu<<std::endl;
         hEnergiesEBGPUvsCPU->Fill(enr_cpu, enr_gpu);
-        hEnergiesEBGPUCPUratio->Fill(enr_gpu/enr_cpu);
-        hEnergiesEBdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu);
-        
+        hEnergiesEBGPUCPUratio->Fill(enr_gpu / enr_cpu);
+        hEnergiesEBdeltavsCPU->Fill(enr_cpu, enr_gpu - enr_cpu);
+
         hChi2EBGPU->Fill(chi2_gpu);
         hChi2EBCPU->Fill(chi2_cpu);
         hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-        hChi2EBGPUCPUratio->Fill(chi2_gpu/chi2_cpu);
-        hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
-        
+        hChi2EBGPUCPUratio->Fill(chi2_gpu / chi2_cpu);
+        hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
+
         hFlagsEBGPU->Fill(flag_gpu);
         hFlagsEBCPU->Fill(flag_cpu);
         hFlagsEBGPUvsCPU->Fill(flag_cpu, flag_gpu);
-        hFlagsEBGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1);
-        hFlagsEBdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu);
-        
+        hFlagsEBGPUCPUratio->Fill(flag_cpu ? flag_gpu / flag_cpu : -1);
+        hFlagsEBdeltavsCPU->Fill(flag_cpu, flag_gpu - flag_cpu);
+
         hExtrasEBGPU->Fill(extra_gpu);
         hExtrasEBCPU->Fill(extra_cpu);
         hExtrasEBGPUvsCPU->Fill(extra_cpu, extra_gpu);
-        hExtrasEBGPUCPUratio->Fill(extra_cpu ? extra_gpu/extra_cpu : -1);
-        hExtrasEBdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu);
-        
+        hExtrasEBGPUCPUratio->Fill(extra_cpu ? extra_gpu / extra_cpu : -1);
+        hExtrasEBdeltavsCPU->Fill(extra_cpu, extra_gpu - extra_cpu);
+
         // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message
         // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or
         //      (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
@@ -281,30 +313,29 @@ int main(int argc, char *argv[]) {
         //          ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu);
         //      if (std::isnan(chi2_gpu))
         //        printf("*** nan ***\n");
-        //  } 
-        
+        //  }
       }
     }
-    
+
     // EE:
-    for (uint32_t i=0; i<gpu_ee_size; ++i) {
-      auto const did_gpu = wgpuEE->bareProduct().did[i]; // set the did for the current RecHit
+    for (uint32_t i = 0; i < gpu_ee_size; ++i) {
+      auto const did_gpu = wgpuEE->bareProduct().did[i];  // set the did for the current RecHit
       // Set the variables for GPU
-      auto const enr_gpu = wgpuEE->bareProduct().energy[i]; 
+      auto const enr_gpu = wgpuEE->bareProduct().energy[i];
       auto const chi2_gpu = wgpuEE->bareProduct().chi2[i];
-      auto const flag_gpu = wgpuEE->bareProduct().flagBits[i]; 
+      auto const flag_gpu = wgpuEE->bareProduct().flagBits[i];
       auto const extra_gpu = wgpuEE->bareProduct().extra[i];
-      
+
       // you have "-1" if the crystal is not selected
-      if ( enr_gpu>=0 ) {
+      if (enr_gpu >= 0) {
         selected_gpu_ee_size++;
-        
-        if ( enr_gpu>0 ) {
+
+        if (enr_gpu > 0) {
           positive_gpu_ee_size++;
         }
-        
+
         // find the Rechit on CPU reflecting the same did
-        auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); 
+        auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu});
         if (cpu_iter == wcpuEE->bareProduct().end()) {
           //    std::cerr << ie << ordinal[ie % 10] << " entry\n"
           //            << "  Did not find a DetId " << did_gpu
@@ -315,43 +346,41 @@ int main(int argc, char *argv[]) {
         // Set the variables for CPU
         auto const enr_cpu = cpu_iter->energy();
         auto const chi2_cpu = cpu_iter->chi2();
-//         auto const flag_cpu = cpu_iter->flagBits();
+        //         auto const flag_cpu = cpu_iter->flagBits();
         auto const flag_cpu = 1;
-//         auto const extra_cpu = cpu_iter->extra();
+        //         auto const extra_cpu = cpu_iter->extra();
         auto const extra_cpu = 1;
         //       auto const flag_cpu = cpu_iter->flagBits()?cpu_iter->flagBits():-1;
         //       auto const extra_cpu = cpu_iter->extra()?cpu_iter->extra():-1;
-        
-        
+
         // AM: TEST
         //       if (extra_cpu != 10) continue;
-        
-        
+
         // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta
         hEnergiesEEGPU->Fill(enr_gpu);
         hEnergiesEECPU->Fill(enr_cpu);
         hEnergiesEEGPUvsCPU->Fill(enr_cpu, enr_gpu);
-        hEnergiesEEGPUCPUratio->Fill(enr_gpu/enr_cpu);
-        hEnergiesEEdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu);
-        
+        hEnergiesEEGPUCPUratio->Fill(enr_gpu / enr_cpu);
+        hEnergiesEEdeltavsCPU->Fill(enr_cpu, enr_gpu - enr_cpu);
+
         hChi2EEGPU->Fill(chi2_gpu);
         hChi2EECPU->Fill(chi2_cpu);
         hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu);
-        hChi2EEGPUCPUratio->Fill(chi2_gpu/chi2_cpu);
-        hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu);
-        
+        hChi2EEGPUCPUratio->Fill(chi2_gpu / chi2_cpu);
+        hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu);
+
         hFlagsEEGPU->Fill(flag_gpu);
         hFlagsEECPU->Fill(flag_cpu);
         hFlagsEEGPUvsCPU->Fill(flag_cpu, flag_gpu);
-        hFlagsEEGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1);
-        hFlagsEEdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu);
-        
+        hFlagsEEGPUCPUratio->Fill(flag_cpu ? flag_gpu / flag_cpu : -1);
+        hFlagsEEdeltavsCPU->Fill(flag_cpu, flag_gpu - flag_cpu);
+
         hExtrasEEGPU->Fill(extra_gpu);
         hExtrasEECPU->Fill(extra_cpu);
         hExtrasEEGPUvsCPU->Fill(extra_cpu, extra_gpu);
-        hExtrasEEGPUCPUratio->Fill(extra_cpu ? extra_gpu/extra_cpu : -1);
-        hExtrasEEdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu);
-        
+        hExtrasEEGPUCPUratio->Fill(extra_cpu ? extra_gpu / extra_cpu : -1);
+        hExtrasEEdeltavsCPU->Fill(extra_cpu, extra_gpu - extra_cpu);
+
         // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message
         // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or
         //      (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu))
@@ -360,17 +389,16 @@ int main(int argc, char *argv[]) {
         //          ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu);
         //      if (std::isnan(chi2_gpu))
         //        printf("*** nan ***\n");
-        //  } 
+        //  }
       }
     }
-    
-    
+
     //
     // now the rechit counting
     //
-    float selected_eb_ratio = (float) selected_gpu_eb_size/cpu_eb_size;
-    float selected_ee_ratio = (float) selected_gpu_ee_size/cpu_ee_size;
-    
+    float selected_eb_ratio = (float)selected_gpu_eb_size / cpu_eb_size;
+    float selected_ee_ratio = (float)selected_gpu_ee_size / cpu_ee_size;
+
     // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
     hSelectedRechitsEBGPU->Fill(selected_gpu_eb_size);
     hSelectedRechitsEBCPU->Fill(cpu_eb_size);
@@ -380,37 +408,34 @@ int main(int argc, char *argv[]) {
     hSelectedRechitsEEGPUvsCPU->Fill(cpu_ee_size, selected_gpu_ee_size);
     hSelectedRechitsEBGPUCPUratio->Fill(selected_eb_ratio);
     hSelectedRechitsEEGPUCPUratio->Fill(selected_ee_ratio);
-    hSelectedRechitsEBdeltavsCPU->Fill(cpu_eb_size, selected_gpu_eb_size-cpu_eb_size);
-    hSelectedRechitsEEdeltavsCPU->Fill(cpu_ee_size, selected_gpu_ee_size-cpu_ee_size);
-    
-    
+    hSelectedRechitsEBdeltavsCPU->Fill(cpu_eb_size, selected_gpu_eb_size - cpu_eb_size);
+    hSelectedRechitsEEdeltavsCPU->Fill(cpu_ee_size, selected_gpu_ee_size - cpu_ee_size);
+
     //
     // now the rechit counting
     //
-    
-    
+
     uint positive_cpu_eb_size = 0;
     uint positive_cpu_ee_size = 0;
-    
+
     // EB:
-    for (uint32_t i=0; i<cpu_eb_size; ++i) {
-      auto const enr_cpu = wcpuEB->bareProduct()[i].energy(); 
+    for (uint32_t i = 0; i < cpu_eb_size; ++i) {
+      auto const enr_cpu = wcpuEB->bareProduct()[i].energy();
       if (enr_cpu > 0) {
         positive_cpu_eb_size++;
       }
     }
     // EE:
-    for (uint32_t i=0; i<cpu_ee_size; ++i) {
-      auto const enr_cpu = wcpuEE->bareProduct()[i].energy(); 
+    for (uint32_t i = 0; i < cpu_ee_size; ++i) {
+      auto const enr_cpu = wcpuEE->bareProduct()[i].energy();
       if (enr_cpu > 0) {
         positive_cpu_ee_size++;
       }
     }
-    
-    
-    float positive_eb_ratio = (float) positive_gpu_eb_size/positive_cpu_eb_size;
-    float positive_ee_ratio = (float) positive_gpu_ee_size/positive_cpu_ee_size;
-    
+
+    float positive_eb_ratio = (float)positive_gpu_eb_size / positive_cpu_eb_size;
+    float positive_ee_ratio = (float)positive_gpu_ee_size / positive_cpu_ee_size;
+
     // Filling up the histograms on events sizes for EB and EE on both GPU and CPU
     hPositiveRechitsEBGPU->Fill(positive_gpu_eb_size);
     hPositiveRechitsEBCPU->Fill(positive_cpu_eb_size);
@@ -420,25 +445,19 @@ int main(int argc, char *argv[]) {
     hPositiveRechitsEEGPUvsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size);
     hPositiveRechitsEBGPUCPUratio->Fill(positive_eb_ratio);
     hPositiveRechitsEEGPUCPUratio->Fill(positive_ee_ratio);
-    hPositiveRechitsEBdeltavsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size-positive_cpu_eb_size);
-    hPositiveRechitsEEdeltavsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size-positive_cpu_ee_size);
-    
-    
-    
+    hPositiveRechitsEBdeltavsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size - positive_cpu_eb_size);
+    hPositiveRechitsEEdeltavsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size - positive_cpu_ee_size);
+
     if (cpu_eb_size != selected_gpu_eb_size or cpu_ee_size != selected_gpu_ee_size) {
       //       std::cerr << ie << ordinal[ie % 10] << " entry:\n"
       std::cerr << ie << " entry:\n"
-      << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << selected_gpu_eb_size << " (gpu)\n"
-      << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << selected_gpu_ee_size << " (gpu)" << std::endl;
+                << "  EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << selected_gpu_eb_size
+                << " (gpu)\n"
+                << "  EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << selected_gpu_ee_size
+                << " (gpu)" << std::endl;
     }
-    
-    
-    
   }
-  
-  
-  
-  
+
   // Plotting the results:
   {
     // Canvases Setup:
@@ -456,10 +475,8 @@ int main(int argc, char *argv[]) {
     cFlags.Divide(3, 2);
     TCanvas cExtras("Extras", "Extras", 1750, 860);
     cExtras.Divide(3, 2);
-    
-    
-    
-    // Plotting the sizes of GPU vs CPU for each event of EB 
+
+    // Plotting the sizes of GPU vs CPU for each event of EB
     cAllRechits.cd(1);
     {
       gPad->SetLogy();
@@ -470,12 +487,12 @@ int main(int argc, char *argv[]) {
       hRechitsEBGPU->SetLineWidth(2);
       hRechitsEBGPU->Draw("sames");
       cAllRechits.Update();
-      auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hRechitsEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    } 
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
     cAllRechits.cd(4);
     {
       gPad->SetLogy();
@@ -486,36 +503,38 @@ int main(int argc, char *argv[]) {
       hRechitsEEGPU->SetLineWidth(2);
       hRechitsEEGPU->Draw("sames");
       cAllRechits.Update();
-      auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hRechitsEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cAllRechits.cd(2); {
+    cAllRechits.cd(2);
+    {
       gStyle->SetPalette(55);
       hRechitsEBGPUvsCPU->Draw("COLZ");
     }
-    cAllRechits.cd(5); {
+    cAllRechits.cd(5);
+    {
       gStyle->SetPalette(55);
       hRechitsEEGPUvsCPU->Draw("COLZ");
     }
-    cAllRechits.cd(3); {
+    cAllRechits.cd(3);
+    {
       gPad->SetLogy();
       //hRechitsEBdeltavsCPU->Draw("COLZ");
       hRechitsEBGPUCPUratio->Draw("");
     }
-    cAllRechits.cd(6); {
+    cAllRechits.cd(6);
+    {
       gPad->SetLogy();
       //hRechitsEEdeltavsCPU->Draw("COLZ");
       hRechitsEEGPUCPUratio->Draw("");
     }
     cAllRechits.SaveAs("ecal-allrechits.root");
     cAllRechits.SaveAs("ecal-allrechits.png");
-    
-    
-    
-    // Plotting the sizes of GPU vs CPU for each event of EB 
+
+    // Plotting the sizes of GPU vs CPU for each event of EB
     cRechits.cd(1);
     {
       gPad->SetLogy();
@@ -526,12 +545,12 @@ int main(int argc, char *argv[]) {
       hSelectedRechitsEBGPU->SetLineWidth(2);
       hSelectedRechitsEBGPU->Draw("sames");
       cRechits.Update();
-      auto stats = (TPaveStats*)hSelectedRechitsEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hSelectedRechitsEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    } 
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
     cRechits.cd(4);
     {
       gPad->SetLogy();
@@ -542,37 +561,38 @@ int main(int argc, char *argv[]) {
       hSelectedRechitsEEGPU->SetLineWidth(2);
       hSelectedRechitsEEGPU->Draw("sames");
       cRechits.Update();
-      auto stats = (TPaveStats*)hSelectedRechitsEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hSelectedRechitsEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cRechits.cd(2); {
+    cRechits.cd(2);
+    {
       gStyle->SetPalette(55);
       hSelectedRechitsEBGPUvsCPU->Draw("COLZ");
     }
-    cRechits.cd(5); {
+    cRechits.cd(5);
+    {
       gStyle->SetPalette(55);
       hSelectedRechitsEEGPUvsCPU->Draw("COLZ");
     }
-    cRechits.cd(3); {
+    cRechits.cd(3);
+    {
       gPad->SetLogy();
       //hSelectedRechitsEBdeltavsCPU->Draw("COLZ");
       hSelectedRechitsEBGPUCPUratio->Draw("");
     }
-    cRechits.cd(6); {
+    cRechits.cd(6);
+    {
       gPad->SetLogy();
       //hSelectedRechitsEEdeltavsCPU->Draw("COLZ");
       hSelectedRechitsEEGPUCPUratio->Draw("");
     }
     cRechits.SaveAs("ecal-rechits.root");
     cRechits.SaveAs("ecal-rechits.png");
-    
-    
-    
-    
-    // Plotting the sizes of GPU vs CPU for each event of EB 
+
+    // Plotting the sizes of GPU vs CPU for each event of EB
     cRechitsPositive.cd(1);
     {
       gPad->SetLogy();
@@ -583,12 +603,12 @@ int main(int argc, char *argv[]) {
       hPositiveRechitsEBGPU->SetLineWidth(2);
       hPositiveRechitsEBGPU->Draw("sames");
       cRechitsPositive.Update();
-      auto stats = (TPaveStats*)hPositiveRechitsEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hPositiveRechitsEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    } 
+      stats->SetY1NDC(y1 - (y2 - y1));
+    }
     cRechitsPositive.cd(4);
     {
       gPad->SetLogy();
@@ -599,34 +619,37 @@ int main(int argc, char *argv[]) {
       hPositiveRechitsEEGPU->SetLineWidth(2);
       hPositiveRechitsEEGPU->Draw("sames");
       cRechitsPositive.Update();
-      auto stats = (TPaveStats*)hPositiveRechitsEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hPositiveRechitsEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cRechitsPositive.cd(2); {
+    cRechitsPositive.cd(2);
+    {
       gStyle->SetPalette(55);
       hPositiveRechitsEBGPUvsCPU->Draw("COLZ");
     }
-    cRechitsPositive.cd(5); {
+    cRechitsPositive.cd(5);
+    {
       gStyle->SetPalette(55);
       hPositiveRechitsEEGPUvsCPU->Draw("COLZ");
     }
-    cRechitsPositive.cd(3); {
+    cRechitsPositive.cd(3);
+    {
       gPad->SetLogy();
       //hPositiveRechitsEBdeltavsCPU->Draw("COLZ");
       hPositiveRechitsEBGPUCPUratio->Draw("");
     }
-    cRechitsPositive.cd(6); {
+    cRechitsPositive.cd(6);
+    {
       gPad->SetLogy();
       //hPositiveRechitsEEdeltavsCPU->Draw("COLZ");
       hPositiveRechitsEEGPUCPUratio->Draw("");
     }
     cRechitsPositive.SaveAs("ecal-rechits-positive.root");
     cRechitsPositive.SaveAs("ecal-rechits-positive.png");
-    
-    
+
     cEnergies.cd(1);
     {
       gPad->SetLogy();
@@ -637,11 +660,11 @@ int main(int argc, char *argv[]) {
       hEnergiesEBGPU->SetLineWidth(2);
       hEnergiesEBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hEnergiesEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hEnergiesEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     cEnergies.cd(4);
     {
@@ -653,32 +676,31 @@ int main(int argc, char *argv[]) {
       hEnergiesEEGPU->SetLineWidth(2);
       hEnergiesEEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hEnergiesEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hEnergiesEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    }
-    cEnergies.cd(2); {
-      hEnergiesEBGPUvsCPU->Draw("COLZ");
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cEnergies.cd(5); {
-      hEnergiesEEGPUvsCPU->Draw("COLZ");
-    }
-    cEnergies.cd(3); {
+    cEnergies.cd(2);
+    { hEnergiesEBGPUvsCPU->Draw("COLZ"); }
+    cEnergies.cd(5);
+    { hEnergiesEEGPUvsCPU->Draw("COLZ"); }
+    cEnergies.cd(3);
+    {
       gPad->SetLogy();
       //hEnergiesEBdeltavsCPU->Draw("COLZ");
       hEnergiesEBGPUCPUratio->Draw("");
     }
-    cEnergies.cd(6); {
+    cEnergies.cd(6);
+    {
       gPad->SetLogy();
       //hEnergiesEEdeltavsCPU->Draw("COLZ");
       hEnergiesEEGPUCPUratio->Draw("");
     }
     cEnergies.SaveAs("ecal-energies.root");
     cEnergies.SaveAs("ecal-energies.png");
-    
-    
+
     cChi2.cd(1);
     {
       gPad->SetLogy();
@@ -689,11 +711,11 @@ int main(int argc, char *argv[]) {
       hChi2EBGPU->SetLineWidth(2);
       hChi2EBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     cChi2.cd(4);
     {
@@ -705,32 +727,31 @@ int main(int argc, char *argv[]) {
       hChi2EEGPU->SetLineWidth(2);
       hChi2EEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    }
-    cChi2.cd(2); {
-      hChi2EBGPUvsCPU->Draw("COLZ");
-    }
-    cChi2.cd(5); {
-      hChi2EEGPUvsCPU->Draw("COLZ");
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cChi2.cd(3); {
+    cChi2.cd(2);
+    { hChi2EBGPUvsCPU->Draw("COLZ"); }
+    cChi2.cd(5);
+    { hChi2EEGPUvsCPU->Draw("COLZ"); }
+    cChi2.cd(3);
+    {
       gPad->SetLogy();
       //hChi2EBdeltavsCPU->Draw("COLZ");
       hChi2EBGPUCPUratio->Draw("");
     }
-    cChi2.cd(6); {
+    cChi2.cd(6);
+    {
       gPad->SetLogy();
       //hChi2EEdeltavsCPU->Draw("COLZ");
       hChi2EEGPUCPUratio->Draw("");
     }
     cChi2.SaveAs("ecal-chi2.root");
     cChi2.SaveAs("ecal-chi2.png");
-    
-    
+
     cFlags.cd(1);
     {
       gPad->SetLogy();
@@ -741,11 +762,11 @@ int main(int argc, char *argv[]) {
       hFlagsEBGPU->SetLineWidth(2);
       hFlagsEBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hFlagsEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     cFlags.cd(4);
     {
@@ -757,32 +778,31 @@ int main(int argc, char *argv[]) {
       hFlagsEEGPU->SetLineWidth(2);
       hFlagsEEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hFlagsEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cFlags.cd(2); {
-      hFlagsEBGPUvsCPU->Draw("COLZ");
-    }
-    cFlags.cd(5); {
-      hFlagsEEGPUvsCPU->Draw("COLZ");
-    }
-    cFlags.cd(3); {
+    cFlags.cd(2);
+    { hFlagsEBGPUvsCPU->Draw("COLZ"); }
+    cFlags.cd(5);
+    { hFlagsEEGPUvsCPU->Draw("COLZ"); }
+    cFlags.cd(3);
+    {
       gPad->SetLogy();
       //hFlagsEBdeltavsCPU->Draw("COLZ");
       hFlagsEBGPUCPUratio->Draw("");
     }
-    cFlags.cd(6); {
+    cFlags.cd(6);
+    {
       gPad->SetLogy();
       //hFlagsEEdeltavsCPU->Draw("COLZ");
       hFlagsEEGPUCPUratio->Draw("");
     }
     cFlags.SaveAs("ecal-flags.root");
     cFlags.SaveAs("ecal-flags.png");
-    
-    
+
     cExtras.cd(1);
     {
       gPad->SetLogy();
@@ -793,11 +813,11 @@ int main(int argc, char *argv[]) {
       hExtrasEBGPU->SetLineWidth(2);
       hExtrasEBGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hExtrasEBGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hExtrasEBGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
     cExtras.cd(4);
     {
@@ -809,36 +829,36 @@ int main(int argc, char *argv[]) {
       hExtrasEEGPU->SetLineWidth(2);
       hExtrasEEGPU->Draw("sames");
       gPad->Update();
-      auto stats = (TPaveStats*)hExtrasEEGPU->FindObject("stats");
+      auto stats = (TPaveStats *)hExtrasEEGPU->FindObject("stats");
       auto y2 = stats->GetY2NDC();
       auto y1 = stats->GetY1NDC();
       stats->SetY2NDC(y1);
-      stats->SetY1NDC(y1 - (y2-y1));
-    }
-    cExtras.cd(2); {
-      hExtrasEBGPUvsCPU->Draw("COLZ");
+      stats->SetY1NDC(y1 - (y2 - y1));
     }
-    cExtras.cd(5); {
-      hExtrasEEGPUvsCPU->Draw("COLZ");
-    }
-    cExtras.cd(3); {
+    cExtras.cd(2);
+    { hExtrasEBGPUvsCPU->Draw("COLZ"); }
+    cExtras.cd(5);
+    { hExtrasEEGPUvsCPU->Draw("COLZ"); }
+    cExtras.cd(3);
+    {
       gPad->SetLogy();
       //hExtrasEBdeltavsCPU->Draw("COLZ");
       hExtrasEBGPUCPUratio->Draw("");
     }
-    cExtras.cd(6); {
+    cExtras.cd(6);
+    {
       gPad->SetLogy();
       //hExtrasEEdeltavsCPU->Draw("COLZ");
       hExtrasEEGPUCPUratio->Draw("");
     }
     cExtras.SaveAs("ecal-extras.root");
     cExtras.SaveAs("ecal-extras.png");
-  } 
-  
+  }
+
   // Close all open files
   rf.Close();
   rfout.Write();
   rfout.Close();
-  
+
   return 0;
 }
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
index 92d4bee3100f3..80a3f838e9de9 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
@@ -11,9 +11,9 @@
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
 
-// 
+//
 // ECAL UncalibRechit producer
-// 
+//
 
 #include "CondFormats/EcalObjects/interface/EcalWeightSet.h"
 #include "CondFormats/EcalObjects/interface/EcalPedestals.h"
@@ -31,9 +31,9 @@
 
 #include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
 
-// 
+//
 // ECAL Rechit producer
-// 
+//
 
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
 
@@ -49,9 +49,6 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
 
-
-
-
 struct EcalPulseShape;
 class EcalSampleMask;
 class EcalTimeBiasCorrections;
@@ -62,347 +59,289 @@ class EcalSamplesCorrelation;
 class EBDigiCollection;
 class EEDigiCollection;
 
-namespace ecal { namespace multifit {
-
-enum class TimeComputationState : char {
-    NotFinished = 0,
-    Finished = 1
-};
-enum class MinimizationState : char {
-    NotFinished = 0,
-    Finished = 1,
-    Precomputed = 2,
-};
+namespace ecal {
+  namespace multifit {
 
-//
-struct EventInputDataGPU {
-    ecal::DigisCollection const& ebDigis;
-    ecal::DigisCollection const& eeDigis;
-};
+    enum class TimeComputationState : char { NotFinished = 0, Finished = 1 };
+    enum class MinimizationState : char {
+      NotFinished = 0,
+      Finished = 1,
+      Precomputed = 2,
+    };
 
-// parameters have a fixed type
-// Can we go by with single precision
-struct ConfigurationParameters {
-    using type = double;
-    // device ptrs
-    type *amplitudeFitParametersEB=nullptr, *amplitudeFitParametersEE=nullptr;
+    //
+    struct EventInputDataGPU {
+      ecal::DigisCollection const& ebDigis;
+      ecal::DigisCollection const& eeDigis;
+    };
 
-    uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE;
-    // device ptrs
-    type *timeFitParametersEB=nullptr, *timeFitParametersEE=nullptr;
+    // parameters have a fixed type
+    // Can we get by with single precision?
+    struct ConfigurationParameters {
+      using type = double;
+      // device ptrs
+      type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr;
 
-    type timeFitLimitsFirstEB, timeFitLimitsFirstEE;
-    type timeFitLimitsSecondEB, timeFitLimitsSecondEE;
+      uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE;
+      // device ptrs
+      type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr;
 
-    type timeConstantTermEB, timeConstantTermEE;
+      type timeFitLimitsFirstEB, timeFitLimitsFirstEE;
+      type timeFitLimitsSecondEB, timeFitLimitsSecondEE;
 
-    type timeNconstEB, timeNconstEE;
+      type timeConstantTermEB, timeConstantTermEE;
 
-    type amplitudeThreshEE, amplitudeThreshEB;
+      type timeNconstEB, timeNconstEE;
 
-    type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB;
-    type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE;
-    type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE;
-    type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB;
+      type amplitudeThreshEE, amplitudeThreshEB;
 
-    std::array<uint32_t, 3> kernelMinimizeThreads;
+      type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB;
+      type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE;
+      type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE;
+      type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB;
 
-    bool shouldRunTimingComputation;
-};
+      std::array<uint32_t, 3> kernelMinimizeThreads;
 
-struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> 
-{
-    void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
-        cudaCheck( cudaMalloc((void**)&amplitudesAll,
-            size * sizeof(SampleVector)) );
-        cudaCheck( cudaMalloc((void**)&amplitude,
-            size * sizeof(::ecal::reco::StorageScalarType)) );
-        cudaCheck( cudaMalloc((void**)&chi2,
-            size * sizeof(::ecal::reco::StorageScalarType)) );
-        cudaCheck( cudaMalloc((void**)&pedestal,
-            size * sizeof(::ecal::reco::StorageScalarType)) );
+      bool shouldRunTimingComputation;
+    };
 
+    struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> {
+      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        cudaCheck(cudaMalloc((void**)&amplitudesAll, size * sizeof(SampleVector)));
+        cudaCheck(cudaMalloc((void**)&amplitude, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&pedestal, size * sizeof(::ecal::reco::StorageScalarType)));
 
         if (configParameters.shouldRunTimingComputation) {
-            cudaCheck( cudaMalloc((void**)&jitter,
-                size * sizeof(::ecal::reco::StorageScalarType)) );
-            cudaCheck( cudaMalloc((void**)&jitterError,
-                size * sizeof(::ecal::reco::StorageScalarType)) );
+          cudaCheck(cudaMalloc((void**)&jitter, size * sizeof(::ecal::reco::StorageScalarType)));
+          cudaCheck(cudaMalloc((void**)&jitterError, size * sizeof(::ecal::reco::StorageScalarType)));
         }
 
-        cudaCheck( cudaMalloc((void**)&did,
-            size * sizeof(uint32_t)) );
-        cudaCheck( cudaMalloc((void**)&flags,
-            size * sizeof(uint32_t)) );
-    }
-
-    void deallocate(ConfigurationParameters const& configParameters) {
-        cudaCheck( cudaFree(amplitudesAll) );
-        cudaCheck( cudaFree(amplitude) );
-        cudaCheck( cudaFree(chi2) );
-        cudaCheck( cudaFree(pedestal) );
+        cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&flags, size * sizeof(uint32_t)));
+      }
+
+      void deallocate(ConfigurationParameters const& configParameters) {
+        cudaCheck(cudaFree(amplitudesAll));
+        cudaCheck(cudaFree(amplitude));
+        cudaCheck(cudaFree(chi2));
+        cudaCheck(cudaFree(pedestal));
         if (configParameters.shouldRunTimingComputation) {
-            cudaCheck( cudaFree(jitter) );
-            cudaCheck( cudaFree(jitterError) );
+          cudaCheck(cudaFree(jitter));
+          cudaCheck(cudaFree(jitterError));
         }
-        cudaCheck( cudaFree(did) );
-        cudaCheck( cudaFree(flags) );
-    }
-};
-
-struct EventDataForScratchGPU {
-    SampleVector *samples = nullptr;
-    SampleGainVector *gainsNoise = nullptr;
-
-    SampleMatrix* noisecov = nullptr;
-    PulseMatrixType *pulse_matrix = nullptr;
-    BXVectorType *activeBXs = nullptr;
-    char *acState = nullptr;
-
-    bool *hasSwitchToGain6=nullptr,
-         *hasSwitchToGain1=nullptr,
-         *isSaturated=nullptr;
-
-    SampleVector::Scalar *sample_values, *sample_value_errors;
-    bool *useless_sample_values;
-    SampleVector::Scalar* chi2sNullHypot;
-    SampleVector::Scalar* sum0sNullHypot;
-    SampleVector::Scalar* sumAAsNullHypot;
-    char* pedestal_nums;
-    SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas;
-    SampleVector::Scalar *accTimeMax, *accTimeWgt;
-    SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError;
-    SampleVector::Scalar *timeMax, *timeError;
-    TimeComputationState *tcState;
-
-    void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
-        cudaCheck( cudaMalloc((void**)&samples,
-            size * sizeof(SampleVector)) );
-        cudaCheck( cudaMalloc((void**)&gainsNoise,
-            size * sizeof(SampleGainVector)) );
-
-        cudaCheck( cudaMalloc((void**)&noisecov,
-            size * sizeof(SampleMatrix)) );
-        cudaCheck( cudaMalloc((void**)&pulse_matrix,
-            size * sizeof(PulseMatrixType)) );
-        cudaCheck( cudaMalloc((void**)&activeBXs,
-            size * sizeof(BXVectorType)) );
-        cudaCheck( cudaMalloc((void**)&acState,
-            size * sizeof(char)) );
-
-        cudaCheck( cudaMalloc((void**)&hasSwitchToGain6,
-            size * sizeof(bool)) );
-        cudaCheck( cudaMalloc((void**)&hasSwitchToGain1,
-            size * sizeof(bool)) );
-        cudaCheck( cudaMalloc((void**)&isSaturated,
-            size * sizeof(bool)) );
+        cudaCheck(cudaFree(did));
+        cudaCheck(cudaFree(flags));
+      }
+    };
+
+    struct EventDataForScratchGPU {
+      SampleVector* samples = nullptr;
+      SampleGainVector* gainsNoise = nullptr;
+
+      SampleMatrix* noisecov = nullptr;
+      PulseMatrixType* pulse_matrix = nullptr;
+      BXVectorType* activeBXs = nullptr;
+      char* acState = nullptr;
+
+      bool *hasSwitchToGain6 = nullptr, *hasSwitchToGain1 = nullptr, *isSaturated = nullptr;
+
+      SampleVector::Scalar *sample_values, *sample_value_errors;
+      bool* useless_sample_values;
+      SampleVector::Scalar* chi2sNullHypot;
+      SampleVector::Scalar* sum0sNullHypot;
+      SampleVector::Scalar* sumAAsNullHypot;
+      char* pedestal_nums;
+      SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas;
+      SampleVector::Scalar *accTimeMax, *accTimeWgt;
+      SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError;
+      SampleVector::Scalar *timeMax, *timeError;
+      TimeComputationState* tcState;
+
+      void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+        cudaCheck(cudaMalloc((void**)&samples, size * sizeof(SampleVector)));
+        cudaCheck(cudaMalloc((void**)&gainsNoise, size * sizeof(SampleGainVector)));
+
+        cudaCheck(cudaMalloc((void**)&noisecov, size * sizeof(SampleMatrix)));
+        cudaCheck(cudaMalloc((void**)&pulse_matrix, size * sizeof(PulseMatrixType)));
+        cudaCheck(cudaMalloc((void**)&activeBXs, size * sizeof(BXVectorType)));
+        cudaCheck(cudaMalloc((void**)&acState, size * sizeof(char)));
+
+        cudaCheck(cudaMalloc((void**)&hasSwitchToGain6, size * sizeof(bool)));
+        cudaCheck(cudaMalloc((void**)&hasSwitchToGain1, size * sizeof(bool)));
+        cudaCheck(cudaMalloc((void**)&isSaturated, size * sizeof(bool)));
 
         if (configParameters.shouldRunTimingComputation) {
-            cudaCheck( cudaMalloc((void**)&sample_values,
-                size * sizeof(SampleVector)) );
-            cudaCheck( cudaMalloc((void**)&sample_value_errors,
-                size * sizeof(SampleVector)) );
-            cudaCheck( cudaMalloc((void**)&useless_sample_values,
-                size * sizeof(bool) * EcalDataFrame::MAXSAMPLES) );
-            cudaCheck( cudaMalloc((void**)&chi2sNullHypot,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&sum0sNullHypot,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&sumAAsNullHypot,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&pedestal_nums,
-                size * sizeof(char)) );
-
-            cudaCheck( cudaMalloc((void**)&tMaxAlphaBetas,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&tMaxErrorAlphaBetas,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&accTimeMax,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&accTimeWgt,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&ampMaxAlphaBeta,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&ampMaxError,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&timeMax,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&timeError,
-                size * sizeof(SampleVector::Scalar)) );
-            cudaCheck( cudaMalloc((void**)&tcState,
-                size * sizeof(TimeComputationState)) );
+          cudaCheck(cudaMalloc((void**)&sample_values, size * sizeof(SampleVector)));
+          cudaCheck(cudaMalloc((void**)&sample_value_errors, size * sizeof(SampleVector)));
+          cudaCheck(cudaMalloc((void**)&useless_sample_values, size * sizeof(bool) * EcalDataFrame::MAXSAMPLES));
+          cudaCheck(cudaMalloc((void**)&chi2sNullHypot, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&sum0sNullHypot, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&sumAAsNullHypot, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&pedestal_nums, size * sizeof(char)));
+
+          cudaCheck(cudaMalloc((void**)&tMaxAlphaBetas, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&tMaxErrorAlphaBetas, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&accTimeMax, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&accTimeWgt, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&ampMaxAlphaBeta, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&ampMaxError, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&timeMax, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&timeError, size * sizeof(SampleVector::Scalar)));
+          cudaCheck(cudaMalloc((void**)&tcState, size * sizeof(TimeComputationState)));
         }
-    }
+      }
 
-    void deallocate(ConfigurationParameters const& configParameters) {
-        cudaCheck( cudaFree(samples) );
-        cudaCheck( cudaFree(gainsNoise) );
+      void deallocate(ConfigurationParameters const& configParameters) {
+        cudaCheck(cudaFree(samples));
+        cudaCheck(cudaFree(gainsNoise));
 
-        cudaCheck( cudaFree(noisecov) );
-        cudaCheck( cudaFree(pulse_matrix) );
-        cudaCheck( cudaFree(activeBXs) );
-        cudaCheck( cudaFree(acState) );
+        cudaCheck(cudaFree(noisecov));
+        cudaCheck(cudaFree(pulse_matrix));
+        cudaCheck(cudaFree(activeBXs));
+        cudaCheck(cudaFree(acState));
 
-        cudaCheck( cudaFree(hasSwitchToGain6) );
-        cudaCheck( cudaFree(hasSwitchToGain1) );
-        cudaCheck( cudaFree(isSaturated) );
+        cudaCheck(cudaFree(hasSwitchToGain6));
+        cudaCheck(cudaFree(hasSwitchToGain1));
+        cudaCheck(cudaFree(isSaturated));
 
         if (configParameters.shouldRunTimingComputation) {
-            cudaCheck( cudaFree(sample_values) );
-            cudaCheck( cudaFree(sample_value_errors) );
-            cudaCheck( cudaFree(useless_sample_values) );
-            cudaCheck( cudaFree(chi2sNullHypot) );
-            cudaCheck( cudaFree(sum0sNullHypot) );
-            cudaCheck( cudaFree(sumAAsNullHypot) );
-            cudaCheck( cudaFree(pedestal_nums) );
-
-            cudaCheck( cudaFree(tMaxAlphaBetas) );
-            cudaCheck( cudaFree(tMaxErrorAlphaBetas) );
-            cudaCheck( cudaFree(accTimeMax) );
-            cudaCheck( cudaFree(accTimeWgt) );
-            cudaCheck( cudaFree(ampMaxAlphaBeta) );
-            cudaCheck( cudaFree(ampMaxError) );
-            cudaCheck( cudaFree(timeMax) );
-            cudaCheck( cudaFree(timeError) );
-            cudaCheck( cudaFree(tcState) );
+          cudaCheck(cudaFree(sample_values));
+          cudaCheck(cudaFree(sample_value_errors));
+          cudaCheck(cudaFree(useless_sample_values));
+          cudaCheck(cudaFree(chi2sNullHypot));
+          cudaCheck(cudaFree(sum0sNullHypot));
+          cudaCheck(cudaFree(sumAAsNullHypot));
+          cudaCheck(cudaFree(pedestal_nums));
+
+          cudaCheck(cudaFree(tMaxAlphaBetas));
+          cudaCheck(cudaFree(tMaxErrorAlphaBetas));
+          cudaCheck(cudaFree(accTimeMax));
+          cudaCheck(cudaFree(accTimeWgt));
+          cudaCheck(cudaFree(ampMaxAlphaBeta));
+          cudaCheck(cudaFree(ampMaxError));
+          cudaCheck(cudaFree(timeMax));
+          cudaCheck(cudaFree(timeError));
+          cudaCheck(cudaFree(tcState));
         }
-    }
-};
-
-// const refs products to conditions
-struct ConditionsProducts {
-    EcalPedestalsGPU::Product const& pedestals;
-    EcalGainRatiosGPU::Product const& gainRatios;
-    EcalPulseShapesGPU::Product const& pulseShapes;
-    EcalPulseCovariancesGPU::Product const& pulseCovariances;
-    EcalSamplesCorrelationGPU::Product const& samplesCorrelation;
-    EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections;
-    EcalTimeCalibConstantsGPU::Product const& timeCalibConstants;
-    EcalSampleMask const& sampleMask;
-    EcalTimeOffsetConstant const& timeOffsetConstant;
-    uint32_t offsetForHashes;
-};
-
-//*/
-
-struct xyz {
-    int x,y,z;
-};
-
-struct conf_data {
-    xyz threads;
-    bool runV1;
-    cudaStream_t cuStream;
-};
-
-}}        
-        
-        
-  
-// 
+      }
+    };
+
+    // const refs products to conditions
+    struct ConditionsProducts {
+      EcalPedestalsGPU::Product const& pedestals;
+      EcalGainRatiosGPU::Product const& gainRatios;
+      EcalPulseShapesGPU::Product const& pulseShapes;
+      EcalPulseCovariancesGPU::Product const& pulseCovariances;
+      EcalSamplesCorrelationGPU::Product const& samplesCorrelation;
+      EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections;
+      EcalTimeCalibConstantsGPU::Product const& timeCalibConstants;
+      EcalSampleMask const& sampleMask;
+      EcalTimeOffsetConstant const& timeOffsetConstant;
+      uint32_t offsetForHashes;
+    };
+
+    //*/
+
+    struct xyz {
+      int x, y, z;
+    };
+
+    struct conf_data {
+      xyz threads;
+      bool runV1;
+      cudaStream_t cuStream;
+    };
+
+  }  // namespace multifit
+}  // namespace ecal
+
+//
 // ECAL Rechit producer
-// 
+//
 
-namespace ecal { 
+namespace ecal {
   namespace rechit {
-    
+
     // parameters that are read in the configuration file for rechit producer
     struct ConfigurationParameters {
       // device ptrs
-      int *ChannelStatusToBeExcluded=nullptr; 
+      int* ChannelStatusToBeExcluded = nullptr;
       uint32_t ChannelStatusToBeExcludedSize;
-      
+
       bool killDeadChannels;
-      
-      bool recoverEBIsolatedChannels ;
-      bool recoverEEIsolatedChannels ;
-      bool recoverEBVFE              ;
-      bool recoverEEVFE              ;
-      bool recoverEBFE               ;
-      bool recoverEEFE               ;
-      
+
+      bool recoverEBIsolatedChannels;
+      bool recoverEEIsolatedChannels;
+      bool recoverEBVFE;
+      bool recoverEEVFE;
+      bool recoverEBFE;
+      bool recoverEEFE;
+
       float EBLaserMIN;
       float EELaserMIN;
       float EBLaserMAX;
       float EELaserMAX;
-      
+
       //       std::vector<std::vector<uint32_t> > v_DB_reco_flags;
       int* expanded_v_DB_reco_flags;
       uint32_t* expanded_Sizes_v_DB_reco_flags;
       uint32_t* expanded_flagbit_v_DB_reco_flags;
       uint32_t expanded_v_DB_reco_flagsSize;
-      
+
       uint32_t flagmask;
-      
-      
-      //       
+
+      //
       //       bool shouldRunTimingComputation;
     };
-    
-    
-    
-    
-    
-    
+
     struct EventOutputDataGPU final : public ::ecal::RecHit<::ecal::Tag::ptr> {
-      
       void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
         //      void allocate(uint32_t size) {
         //---- configParameters -> needed only to decide if to save the timing information or not
-        
-        cudaCheck( cudaMalloc((void**)&energy,
-                              size * sizeof(::ecal::reco::StorageScalarType)) );
-        cudaCheck( cudaMalloc((void**)&time,
-                              size * sizeof(::ecal::reco::StorageScalarType)) );
-        cudaCheck( cudaMalloc((void**)&chi2,
-                              size * sizeof(::ecal::reco::StorageScalarType)) );
-        cudaCheck( cudaMalloc((void**)&flagBits,
-                              size * sizeof(uint32_t)) );
-        cudaCheck( cudaMalloc((void**)&extra,
-                              size * sizeof(uint32_t)) );      
-        cudaCheck( cudaMalloc((void**)&did,
-                              size * sizeof(uint32_t)) );
+
+        cudaCheck(cudaMalloc((void**)&energy, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&time, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType)));
+        cudaCheck(cudaMalloc((void**)&flagBits, size * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&extra, size * sizeof(uint32_t)));
+        cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t)));
       }
-      
-      
+
       void deallocate(ConfigurationParameters const& configParameters) {
         //     void deallocate() {
         //---- configParameters -> needed only to decide if to save the timing information or not
-        
-        cudaCheck( cudaFree(energy) );
-        cudaCheck( cudaFree(time) );
-        cudaCheck( cudaFree(chi2) );
-        cudaCheck( cudaFree(flagBits) );
-        cudaCheck( cudaFree(extra) );
-        cudaCheck( cudaFree(did) );
+
+        cudaCheck(cudaFree(energy));
+        cudaCheck(cudaFree(time));
+        cudaCheck(cudaFree(chi2));
+        cudaCheck(cudaFree(flagBits));
+        cudaCheck(cudaFree(extra));
+        cudaCheck(cudaFree(did));
       }
     };
-    
-    
-    
+
     struct EventInputDataGPU {
       ecal::UncalibratedRecHit<ecal::Tag::ptr> const& ebUncalibRecHits;
       ecal::UncalibratedRecHit<ecal::Tag::ptr> const& eeUncalibRecHits;
     };
-    
+
     // const refs products to conditions
     struct ConditionsProducts {
-      EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV      ;
-      EcalIntercalibConstantsGPU::Product    const& Intercalib    ;
-      EcalRechitChannelStatusGPU::Product    const& ChannelStatus ;
-      //     
-      EcalLaserAPDPNRatiosGPU::Product     const& LaserAPDPNRatios    ;
-      EcalLaserAPDPNRatiosRefGPU::Product  const& LaserAPDPNRatiosRef ;
-      EcalLaserAlphasGPU::Product          const& LaserAlphas         ;
-      EcalLinearCorrectionsGPU::Product    const& LinearCorrections   ;
-      //     
-      //     
-      uint32_t offsetForHashes;    
+      EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV;
+      EcalIntercalibConstantsGPU::Product const& Intercalib;
+      EcalRechitChannelStatusGPU::Product const& ChannelStatus;
+      //
+      EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios;
+      EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef;
+      EcalLaserAlphasGPU::Product const& LaserAlphas;
+      EcalLinearCorrectionsGPU::Product const& LinearCorrections;
+      //
+      //
+      uint32_t offsetForHashes;
     };
-    
-    
-    
-  }
-}
+
+  }  // namespace rechit
+}  // namespace ecal
 
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
index ae36aa78c9e45..c59527a6d9f5a 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
@@ -12,33 +12,32 @@ class EcalIntercalibConstantsGPU {
 public:
   struct Product {
     ~Product();
-    float *values = nullptr;
+    float* values = nullptr;
   };
-  
-  #ifndef __CUDACC__
-  // 
+
+#ifndef __CUDACC__
+  //
   EcalIntercalibConstantsGPU(EcalIntercalibConstants const&);
-  
+
   // will call dealloation for Product thru ~Product
   ~EcalIntercalibConstantsGPU() = default;
-  
+
   // get device pointers
   Product const& getProduct(cudaStream_t) const;
-  
+
   // TODO: do this centrally
   // get offset for hashes. equals number of barrel items
   uint32_t getOffset() const { return valuesEB_.size(); }
-  
-  // 
+
+  //
   static std::string name() { return std::string{"ecalIntercalibConstantsGPU"}; }
-  
+
 private:
   std::vector<float> const& valuesEB_;
   std::vector<float> const& valuesEE_;
-  
+
   cms::cuda::ESProduct<Product> product_;
-  #endif
+#endif
 };
 
-
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
index 53c8ea6ba67b7..9b87c3228e5c7 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
@@ -12,47 +12,42 @@ class EcalLaserAPDPNRatiosGPU {
 public:
   struct Product {
     ~Product();
-    float *p1=nullptr;
-    float *p2=nullptr;
-    float *p3=nullptr;
-    edm::TimeValue_t *t1=nullptr;
-    edm::TimeValue_t *t2=nullptr;
-    edm::TimeValue_t *t3=nullptr;
+    float *p1 = nullptr;
+    float *p2 = nullptr;
+    float *p3 = nullptr;
+    edm::TimeValue_t *t1 = nullptr;
+    edm::TimeValue_t *t2 = nullptr;
+    edm::TimeValue_t *t3 = nullptr;
   };
-  
-  #ifndef __CUDACC__
-  
-  // 
-  EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const&);
-  
+
+#ifndef __CUDACC__
+
+  //
+  EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const &);
+
   // will call dealloation for Product thru ~Product
   ~EcalLaserAPDPNRatiosGPU() = default;
-  
+
   // get device pointers
-  Product const& getProduct(cudaStream_t) const;
-  
-  // 
+  Product const &getProduct(cudaStream_t) const;
+
+  //
   static std::string name() { return std::string{"ecalLaserAPDPNRatiosGPU"}; }
-  
+
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
   std::vector<float, CUDAHostAllocator<float> > p1_;
   std::vector<float, CUDAHostAllocator<float> > p2_;
   std::vector<float, CUDAHostAllocator<float> > p3_;
-  
+
   std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t1_;
   std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t2_;
   std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t3_;
-  
-  cms::cuda::ESProduct<Product> product_;
-  
-  #endif
-};
 
+  cms::cuda::ESProduct<Product> product_;
 
 #endif
+};
 
-
-
-
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
index 191c78a7c4617..6e48d50f217f3 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
@@ -12,33 +12,32 @@ class EcalLaserAPDPNRatiosRefGPU {
 public:
   struct Product {
     ~Product();
-    float *values = nullptr;
+    float* values = nullptr;
   };
-  
-  #ifndef __CUDACC__
-  // 
+
+#ifndef __CUDACC__
+  //
   EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const&);
-  
+
   // will call dealloation for Product thru ~Product
   ~EcalLaserAPDPNRatiosRefGPU() = default;
-  
+
   // get device pointers
   Product const& getProduct(cudaStream_t) const;
-  
+
   // TODO: do this centrally
   // get offset for hashes. equals number of barrel items
   uint32_t getOffset() const { return valuesEB_.size(); }
-  
-  // 
+
+  //
   static std::string name() { return std::string{"ecalLaserAPDPNRatiosRefGPU"}; }
-  
+
 private:
   std::vector<float> const& valuesEB_;
   std::vector<float> const& valuesEE_;
-  
+
   cms::cuda::ESProduct<Product> product_;
-  #endif
+#endif
 };
 
-
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
index ac97e6c514bac..d787c5700cd7e 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
@@ -12,33 +12,32 @@ class EcalLaserAlphasGPU {
 public:
   struct Product {
     ~Product();
-    float *values = nullptr;
+    float* values = nullptr;
   };
-  
-  #ifndef __CUDACC__
-  // 
+
+#ifndef __CUDACC__
+  //
   EcalLaserAlphasGPU(EcalLaserAlphas const&);
-  
+
   // will call dealloation for Product thru ~Product
   ~EcalLaserAlphasGPU() = default;
-  
+
   // get device pointers
   Product const& getProduct(cudaStream_t) const;
-  
+
   // TODO: do this centrally
   // get offset for hashes. equals number of barrel items
   uint32_t getOffset() const { return valuesEB_.size(); }
-  
-  // 
+
+  //
   static std::string name() { return std::string{"ecalLaserAlphasGPU"}; }
-  
+
 private:
   std::vector<float> const& valuesEB_;
   std::vector<float> const& valuesEE_;
-  
+
   cms::cuda::ESProduct<Product> product_;
-  #endif
+#endif
 };
 
-
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
index 41469bcf16c82..f2b395f5660fa 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
@@ -12,46 +12,42 @@ class EcalLinearCorrectionsGPU {
 public:
   struct Product {
     ~Product();
-    float *p1=nullptr;
-    float *p2=nullptr;
-    float *p3=nullptr;
-    edm::TimeValue_t *t1=nullptr;
-    edm::TimeValue_t *t2=nullptr;
-    edm::TimeValue_t *t3=nullptr;
+    float *p1 = nullptr;
+    float *p2 = nullptr;
+    float *p3 = nullptr;
+    edm::TimeValue_t *t1 = nullptr;
+    edm::TimeValue_t *t2 = nullptr;
+    edm::TimeValue_t *t3 = nullptr;
   };
-  
-  #ifndef __CUDACC__
-  
-  // 
-  EcalLinearCorrectionsGPU(EcalLinearCorrections const&);
-  
+
+#ifndef __CUDACC__
+
+  //
+  EcalLinearCorrectionsGPU(EcalLinearCorrections const &);
+
   // will call dealloation for Product thru ~Product
   ~EcalLinearCorrectionsGPU() = default;
-  
+
   // get device pointers
-  Product const& getProduct(cudaStream_t) const;
-  
-  // 
+  Product const &getProduct(cudaStream_t) const;
+
+  //
   static std::string name() { return std::string{"ecalLinearCorrectionsGPU"}; }
-  
+
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
   std::vector<float, CUDAHostAllocator<float>> p1_;
   std::vector<float, CUDAHostAllocator<float>> p2_;
   std::vector<float, CUDAHostAllocator<float>> p3_;
-  
+
   std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t1_;
   std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t2_;
   std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t3_;
-  
-  cms::cuda::ESProduct<Product> product_;
-  
-  #endif
-};
 
+  cms::cuda::ESProduct<Product> product_;
 
 #endif
+};
 
-
-
+#endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
index 8addc316f366d..3838a757cc2e1 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
@@ -12,32 +12,31 @@ class EcalRechitADCToGeVConstantGPU {
 public:
   struct Product {
     ~Product();
-    float *adc2gev = nullptr;
+    float* adc2gev = nullptr;
   };
-  
-  #ifndef __CUDACC__
-  
-  // 
+
+#ifndef __CUDACC__
+
+  //
   EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const&);
-  
+
   // will call dealloation for Product thru ~Product
   ~EcalRechitADCToGeVConstantGPU() = default;
-  
+
   // get device pointers
   Product const& getProduct(cudaStream_t) const;
-  
-  // 
+
+  //
   static std::string name() { return std::string{"ecalRechitADCToGeVConstantGPU"}; }
-  
+
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
   std::vector<float, CUDAHostAllocator<float>> adc2gev_;
-  
+
   cms::cuda::ESProduct<Product> product_;
-  
-  #endif
-};
 
+#endif
+};
 
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
index 2329b3752089d..bf3f0f600224e 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
@@ -12,32 +12,31 @@ class EcalRechitChannelStatusGPU {
 public:
   struct Product {
     ~Product();
-    uint16_t *status = nullptr;
+    uint16_t* status = nullptr;
   };
-  
-  #ifndef __CUDACC__
-  
-  // 
+
+#ifndef __CUDACC__
+
+  //
   EcalRechitChannelStatusGPU(EcalChannelStatus const&);
-  
+
   // will call dealloation for Product thru ~Product
   ~EcalRechitChannelStatusGPU() = default;
-  
+
   // get device pointers
   Product const& getProduct(cudaStream_t) const;
-  
-  // 
+
+  //
   static std::string name() { return std::string{"ecalRechitChannelStatusGPU"}; }
-  
+
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
   std::vector<uint16_t, CUDAHostAllocator<uint16_t>> status_;
-  
+
   cms::cuda::ESProduct<Product> product_;
-  
-  #endif
-};
 
+#endif
+};
 
 #endif
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
index 139c1c31f09a9..717a005a3dfb1 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu
@@ -85,7 +85,8 @@ namespace ecal {
         auto const did = DetId{dids[inputCh]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
         // TODO offset for ee, 0 for eb
-        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
 
         //
         // pulse shape template
@@ -345,7 +346,8 @@ namespace ecal {
       bool tmp1 = hasSwitchToGain1[ch];
       auto const did = DetId{dids[inputCh]};
       auto const isBarrel = did.subdetId() == EcalBarrel;
-      auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+      auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                     : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
       auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
       auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
       auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
@@ -503,4 +505,3 @@ namespace ecal {
 
   }  // namespace multifit
 }  // namespace ecal
-
diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
index b12fa6fc1043f..a3f9cf71caaf6 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu
@@ -158,8 +158,9 @@ namespace ecal {
         auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
         auto const did = DetId{dids[inputCh]};
         auto const isBarrel = did.subdetId() == EcalBarrel;
-        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
-        
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
         // inits
         int iter = 0;
         int npassive = 0;
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
index 844a28d27fd8e..dec10cff57dd0 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc
@@ -3,41 +3,37 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalIntercalibConstantsGPU::EcalIntercalibConstantsGPU(EcalIntercalibConstants const& values) 
-: valuesEB_{values.barrelItems()}
-, valuesEE_{values.endcapItems()}
-{}
+EcalIntercalibConstantsGPU::EcalIntercalibConstantsGPU(EcalIntercalibConstants const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
 
 EcalIntercalibConstantsGPU::Product::~Product() {
   // deallocation
-  cudaCheck( cudaFree(values) );
+  cudaCheck(cudaFree(values));
 }
 
 EcalIntercalibConstantsGPU::Product const& EcalIntercalibConstantsGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-                                                           [this](EcalIntercalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
-                                                             // malloc
-                                                             cudaCheck( cudaMalloc((void**)&product.values,
-                                                                                   (this->valuesEB_.size() + this->valuesEE_.size()) * 
-                                                                                   sizeof(float)) );
-                                                             
-                                                             // offset in floats, not bytes
-                                                             auto const offset = this->valuesEB_.size();
-                                                             
-                                                             // transfer 
-                                                             cudaCheck( cudaMemcpyAsync(product.values,
-                                                                                        this->valuesEB_.data(),
-                                                                                        this->valuesEB_.size() * sizeof(float),
-                                                                                        cudaMemcpyHostToDevice,
-                                                                                        cudaStream) );
-                                                             cudaCheck( cudaMemcpyAsync(product.values + offset,
-                                                                                        this->valuesEE_.data(),
-                                                                                        this->valuesEE_.size() * sizeof(float),
-                                                                                        cudaMemcpyHostToDevice,
-                                                                                        cudaStream) );
-                                                           }
-  );
-  
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalIntercalibConstantsGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(
+            cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float)));
+
+        // offset in floats, not bytes
+        auto const offset = this->valuesEB_.size();
+
+        // transfer
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->valuesEB_.data(),
+                                  this->valuesEB_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.values + offset,
+                                  this->valuesEE_.data(),
+                                  this->valuesEE_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
   return product;
 }
 
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
index f54f7bd47c022..4aa92ea6750fe 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc
@@ -3,107 +3,84 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalLaserAPDPNRatiosGPU::EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const& values) 
-: p1_(values.getLaserMap().size())
-, p2_(values.getLaserMap().size())
-, p3_(values.getLaserMap().size())
-, t1_(values.getTimeMap().size())
-, t2_(values.getTimeMap().size())
-, t3_(values.getTimeMap().size())
-{
-  
+EcalLaserAPDPNRatiosGPU::EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const& values)
+    : p1_(values.getLaserMap().size()),
+      p2_(values.getLaserMap().size()),
+      p3_(values.getLaserMap().size()),
+      t1_(values.getTimeMap().size()),
+      t2_(values.getTimeMap().size()),
+      t3_(values.getTimeMap().size()) {
   // fill in eb
   //     auto const& barrelValues = values.barrelItems();
-  for (unsigned int i=0; i<values.getLaserMap().barrelItems().size(); i++) {
+  for (unsigned int i = 0; i < values.getLaserMap().barrelItems().size(); i++) {
     p1_[i] = values.getLaserMap().barrelItems()[i].p1;
     p2_[i] = values.getLaserMap().barrelItems()[i].p2;
     p3_[i] = values.getLaserMap().barrelItems()[i].p3;
   }
-  
+
   // fill in ee
   //     auto const& endcapValues = values.endcapItems();
   auto const offset_laser = values.getLaserMap().barrelItems().size();
-  for (unsigned int i=0; i<values.getLaserMap().endcapItems().size(); i++) {
+  for (unsigned int i = 0; i < values.getLaserMap().endcapItems().size(); i++) {
     p1_[offset_laser + i] = values.getLaserMap().endcapItems()[i].p1;
     p2_[offset_laser + i] = values.getLaserMap().endcapItems()[i].p2;
     p3_[offset_laser + i] = values.getLaserMap().endcapItems()[i].p3;
   }
-  
+
   //   Time is a simple std::vector
   //       typedef std::vector<EcalLaserTimeStamp> EcalLaserTimeStampMap;
-  for (unsigned int i=0; i<values.getTimeMap().size(); i++) {
+  for (unsigned int i = 0; i < values.getTimeMap().size(); i++) {
     t1_[i] = values.getTimeMap()[i].t1.value();
     t2_[i] = values.getTimeMap()[i].t2.value();
     t3_[i] = values.getTimeMap()[i].t3.value();
   }
 }
 
-
-
 EcalLaserAPDPNRatiosGPU::Product::~Product() {
   // deallocation
-  cudaCheck( cudaFree(p1) );
-  cudaCheck( cudaFree(p2) );
-  cudaCheck( cudaFree(p3) );
-  cudaCheck( cudaFree(t1) );
-  cudaCheck( cudaFree(t2) );
-  cudaCheck( cudaFree(t3) );
+  cudaCheck(cudaFree(p1));
+  cudaCheck(cudaFree(p2));
+  cudaCheck(cudaFree(p3));
+  cudaCheck(cudaFree(t1));
+  cudaCheck(cudaFree(t2));
+  cudaCheck(cudaFree(t3));
 }
 
-EcalLaserAPDPNRatiosGPU::Product const& EcalLaserAPDPNRatiosGPU::getProduct(
-  cudaStream_t cudaStream) const
-  {
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-                                                             [this](EcalLaserAPDPNRatiosGPU::Product& product, cudaStream_t cudaStream) {
-                                                               // malloc
-                                                               cudaCheck( cudaMalloc((void**)&product.p1,
-                                                                                     this->p1_.size() * sizeof(float)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.p2,
-                                                                                     this->p2_.size() * sizeof(float)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.p3,
-                                                                                     this->p3_.size() * sizeof(float)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.t1,
-                                                                                     this->t1_.size() * sizeof(edm::TimeValue_t)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.t2,
-                                                                                     this->t2_.size() * sizeof(edm::TimeValue_t)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.t3,
-                                                                                     this->t3_.size() * sizeof(edm::TimeValue_t)) );
-                                                               // transfer 
-                                                               cudaCheck( cudaMemcpyAsync(product.p1,
-                                                                                          this->p1_.data(),
-                                                                                          this->p1_.size() * sizeof(float),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.p2,
-                                                                                          this->p2_.data(),
-                                                                                          this->p2_.size() * sizeof(float),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.p3,
-                                                                                          this->p3_.data(),
-                                                                                          this->p3_.size() * sizeof(float),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.t1,
-                                                                                          this->t1_.data(),
-                                                                                          this->t1_.size() * sizeof(edm::TimeValue_t),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.t2,
-                                                                                          this->t2_.data(),
-                                                                                          this->t2_.size() * sizeof(edm::TimeValue_t),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.t3,
-                                                                                          this->t3_.data(),
-                                                                                          this->t3_.size() * sizeof(edm::TimeValue_t),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                             }
-    );
-    
-    return product;
-  }
-  
-  TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosGPU);
-  
\ No newline at end of file
+EcalLaserAPDPNRatiosGPU::Product const& EcalLaserAPDPNRatiosGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLaserAPDPNRatiosGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(cudaMalloc((void**)&product.p1, this->p1_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.p2, this->p2_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.p3, this->p3_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.t1, this->t1_.size() * sizeof(edm::TimeValue_t)));
+        cudaCheck(cudaMalloc((void**)&product.t2, this->t2_.size() * sizeof(edm::TimeValue_t)));
+        cudaCheck(cudaMalloc((void**)&product.t3, this->t3_.size() * sizeof(edm::TimeValue_t)));
+        // transfer
+        cudaCheck(cudaMemcpyAsync(
+            product.p1, this->p1_.data(), this->p1_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(
+            product.p2, this->p2_.data(), this->p2_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(
+            product.p3, this->p3_.data(), this->p3_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.t1,
+                                  this->t1_.data(),
+                                  this->t1_.size() * sizeof(edm::TimeValue_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.t2,
+                                  this->t2_.data(),
+                                  this->t2_.size() * sizeof(edm::TimeValue_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.t3,
+                                  this->t3_.data(),
+                                  this->t3_.size() * sizeof(edm::TimeValue_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
index c4c07361a8535..8f77cf48fe1d1 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc
@@ -3,41 +3,37 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalLaserAPDPNRatiosRefGPU::EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const& values) 
-: valuesEB_{values.barrelItems()}
-, valuesEE_{values.endcapItems()}
-{}
+EcalLaserAPDPNRatiosRefGPU::EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
 
 EcalLaserAPDPNRatiosRefGPU::Product::~Product() {
   // deallocation
-  cudaCheck( cudaFree(values) );
+  cudaCheck(cudaFree(values));
 }
 
 EcalLaserAPDPNRatiosRefGPU::Product const& EcalLaserAPDPNRatiosRefGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-                                                           [this](EcalLaserAPDPNRatiosRefGPU::Product& product, cudaStream_t cudaStream) {
-                                                             // malloc
-                                                             cudaCheck( cudaMalloc((void**)&product.values,
-                                                                                   (this->valuesEB_.size() + this->valuesEE_.size()) * 
-                                                                                   sizeof(float)) );
-                                                             
-                                                             // offset in floats, not bytes
-                                                             auto const offset = this->valuesEB_.size();
-                                                             
-                                                             // transfer 
-                                                             cudaCheck( cudaMemcpyAsync(product.values,
-                                                                                        this->valuesEB_.data(),
-                                                                                        this->valuesEB_.size() * sizeof(float),
-                                                                                        cudaMemcpyHostToDevice,
-                                                                                        cudaStream) );
-                                                             cudaCheck( cudaMemcpyAsync(product.values + offset,
-                                                                                        this->valuesEE_.data(),
-                                                                                        this->valuesEE_.size() * sizeof(float),
-                                                                                        cudaMemcpyHostToDevice,
-                                                                                        cudaStream) );
-                                                           }
-  );
-  
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLaserAPDPNRatiosRefGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(
+            cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float)));
+
+        // offset in floats, not bytes
+        auto const offset = this->valuesEB_.size();
+
+        // transfer
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->valuesEB_.data(),
+                                  this->valuesEB_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.values + offset,
+                                  this->valuesEE_.data(),
+                                  this->valuesEE_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
   return product;
 }
 
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
index 24257fd8b547a..91de441bff683 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc
@@ -3,41 +3,37 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalLaserAlphasGPU::EcalLaserAlphasGPU(EcalLaserAlphas const& values) 
-: valuesEB_{values.barrelItems()}
-, valuesEE_{values.endcapItems()}
-{}
+EcalLaserAlphasGPU::EcalLaserAlphasGPU(EcalLaserAlphas const& values)
+    : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {}
 
 EcalLaserAlphasGPU::Product::~Product() {
   // deallocation
-  cudaCheck( cudaFree(values) );
+  cudaCheck(cudaFree(values));
 }
 
 EcalLaserAlphasGPU::Product const& EcalLaserAlphasGPU::getProduct(cudaStream_t cudaStream) const {
-  auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-                                                           [this](EcalLaserAlphasGPU::Product& product, cudaStream_t cudaStream) {
-                                                             // malloc
-                                                             cudaCheck( cudaMalloc((void**)&product.values,
-                                                                                   (this->valuesEB_.size() + this->valuesEE_.size()) * 
-                                                                                   sizeof(float)) );
-                                                             
-                                                             // offset in floats, not bytes
-                                                             auto const offset = this->valuesEB_.size();
-                                                             
-                                                             // transfer 
-                                                             cudaCheck( cudaMemcpyAsync(product.values,
-                                                                                        this->valuesEB_.data(),
-                                                                                        this->valuesEB_.size() * sizeof(float),
-                                                                                        cudaMemcpyHostToDevice,
-                                                                                        cudaStream) );
-                                                             cudaCheck( cudaMemcpyAsync(product.values + offset,
-                                                                                        this->valuesEE_.data(),
-                                                                                        this->valuesEE_.size() * sizeof(float),
-                                                                                        cudaMemcpyHostToDevice,
-                                                                                        cudaStream) );
-                                                           }
-  );
-  
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLaserAlphasGPU::Product& product, cudaStream_t cudaStream) {
+        // allocate a single device buffer sized for the EB values followed by the EE values
+        cudaCheck(
+            cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float)));
+
+        // offset of the EE block inside product.values, counted in floats, not bytes
+        auto const offset = this->valuesEB_.size();
+
+        // enqueue the host-to-device copies on cudaStream: EB at the start, EE at the offset
+        cudaCheck(cudaMemcpyAsync(product.values,
+                                  this->valuesEB_.data(),
+                                  this->valuesEB_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.values + offset,
+                                  this->valuesEE_.data(),
+                                  this->valuesEE_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
   return product;
 }
 
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
index 2dedb1074bee7..20946028aba90 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
@@ -3,100 +3,78 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalLinearCorrectionsGPU::EcalLinearCorrectionsGPU(EcalLinearCorrections const& values) 
-: p1_(values.getValueMap().size())
-, p2_(values.getValueMap().size())
-, p3_(values.getValueMap().size())
-, t1_(values.getTimeMap().size())
-, t2_(values.getTimeMap().size())
-, t3_(values.getTimeMap().size())
-{
-  
+EcalLinearCorrectionsGPU::EcalLinearCorrectionsGPU(EcalLinearCorrections const& values)
+    : p1_(values.getValueMap().size()),
+      p2_(values.getValueMap().size()),
+      p3_(values.getValueMap().size()),
+      t1_(values.getTimeMap().size()),
+      t2_(values.getTimeMap().size()),
+      t3_(values.getTimeMap().size()) {
   // fill in eb
-  for (unsigned int i=0; i<values.getValueMap().barrelItems().size(); i++) {
+  for (unsigned int i = 0; i < values.getValueMap().barrelItems().size(); i++) {
     p1_[i] = values.getValueMap().barrelItems()[i].p1;
     p2_[i] = values.getValueMap().barrelItems()[i].p2;
     p3_[i] = values.getValueMap().barrelItems()[i].p3;
   }
-  
+
   // fill in ee
   auto const offset_laser = values.getValueMap().barrelItems().size();
-  for (unsigned int i=0; i<values.getValueMap().endcapItems().size(); i++) {
+  for (unsigned int i = 0; i < values.getValueMap().endcapItems().size(); i++) {
     p1_[offset_laser + i] = values.getValueMap().endcapItems()[i].p1;
     p2_[offset_laser + i] = values.getValueMap().endcapItems()[i].p2;
     p3_[offset_laser + i] = values.getValueMap().endcapItems()[i].p3;
   }
-  
+
   //   Time is a simple std::vector
   //       typedef std::vector<EcalLaserTimeStamp> EcalLaserTimeStampMap;
-  for (unsigned int i=0; i<values.getTimeMap().size(); i++) {
+  for (unsigned int i = 0; i < values.getTimeMap().size(); i++) {
     t1_[i] = values.getTimeMap()[i].t1.value();
     t2_[i] = values.getTimeMap()[i].t2.value();
     t3_[i] = values.getTimeMap()[i].t3.value();
   }
-  
 }
 
 EcalLinearCorrectionsGPU::Product::~Product() {
   // deallocation
-  cudaCheck( cudaFree(p1) );
-  cudaCheck( cudaFree(p2) );
+  cudaCheck(cudaFree(p1));
+  cudaCheck(cudaFree(p2));  // NOTE(review): p3, t1, t2, t3 are cudaMalloc'ed in getProduct but not freed here
 }
 
-EcalLinearCorrectionsGPU::Product const& EcalLinearCorrectionsGPU::getProduct(
-  cudaStream_t cudaStream) const
-  {
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-                                                             [this](EcalLinearCorrectionsGPU::Product& product, cudaStream_t cudaStream) {
-                                                               // malloc
-                                                               cudaCheck( cudaMalloc((void**)&product.p1,
-                                                                                     this->p1_.size() * sizeof(float)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.p2,
-                                                                                     this->p2_.size() * sizeof(float)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.p3,
-                                                                                     this->p3_.size() * sizeof(float)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.t1,
-                                                                                     this->t1_.size() * sizeof(edm::TimeValue_t)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.t2,
-                                                                                     this->t2_.size() * sizeof(edm::TimeValue_t)) );
-                                                               cudaCheck( cudaMalloc((void**)&product.t3,
-                                                                                     this->t3_.size() * sizeof(edm::TimeValue_t)) );
-                                                               // transfer 
-                                                               cudaCheck( cudaMemcpyAsync(product.p1,
-                                                                                          this->p1_.data(),
-                                                                                          this->p1_.size() * sizeof(float),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.p2,
-                                                                                          this->p2_.data(),
-                                                                                          this->p2_.size() * sizeof(float),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.p3,
-                                                                                          this->p3_.data(),
-                                                                                          this->p3_.size() * sizeof(float),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.t1,
-                                                                                          this->t1_.data(),
-                                                                                          this->t1_.size() * sizeof(edm::TimeValue_t),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.t2,
-                                                                                          this->t2_.data(),
-                                                                                          this->t2_.size() * sizeof(edm::TimeValue_t),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                               cudaCheck( cudaMemcpyAsync(product.t3,
-                                                                                          this->t3_.data(),
-                                                                                          this->t3_.size() * sizeof(edm::TimeValue_t),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                             }
-    );
-    
-    return product;
-  }
-  
-  TYPELOOKUP_DATA_REG(EcalLinearCorrectionsGPU);
-  
\ No newline at end of file
+EcalLinearCorrectionsGPU::Product const& EcalLinearCorrectionsGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const& product = product_.dataForCurrentDeviceAsync(
+      cudaStream, [this](EcalLinearCorrectionsGPU::Product& product, cudaStream_t cudaStream) {
+        // allocate one device buffer per host vector: p1..p3 hold floats, t1..t3 hold edm::TimeValue_t
+        cudaCheck(cudaMalloc((void**)&product.p1, this->p1_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.p2, this->p2_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.p3, this->p3_.size() * sizeof(float)));
+        cudaCheck(cudaMalloc((void**)&product.t1, this->t1_.size() * sizeof(edm::TimeValue_t)));
+        cudaCheck(cudaMalloc((void**)&product.t2, this->t2_.size() * sizeof(edm::TimeValue_t)));
+        cudaCheck(cudaMalloc((void**)&product.t3, this->t3_.size() * sizeof(edm::TimeValue_t)));
+        // enqueue the six host-to-device copies on cudaStream, one per buffer allocated above
+        cudaCheck(cudaMemcpyAsync(
+            product.p1, this->p1_.data(), this->p1_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(
+            product.p2, this->p2_.data(), this->p2_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(
+            product.p3, this->p3_.data(), this->p3_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.t1,
+                                  this->t1_.data(),
+                                  this->t1_.size() * sizeof(edm::TimeValue_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.t2,
+                                  this->t2_.data(),
+                                  this->t2_.size() * sizeof(edm::TimeValue_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+        cudaCheck(cudaMemcpyAsync(product.t3,
+                                  this->t3_.data(),
+                                  this->t3_.size() * sizeof(edm::TimeValue_t),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
+  return product;
+}
+
+TYPELOOKUP_DATA_REG(EcalLinearCorrectionsGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index 792b422cefd6f..54c376214c4c6 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -9,56 +9,49 @@
 //
 #include "EcalRecHitBuilderKernels.h"
 
-
 #include "KernelHelpers.h"
 
-
-
-
 namespace ecal {
   namespace rechit {
-    
-    
+
     // uncalibrecHit flags
     enum UncalibRecHitFlags {
-      kGood=-1,                 // channel is good (mutually exclusive with other states)  setFlagBit(kGood) reset flags_ to zero 
-      kPoorReco,                // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.)
-      kSaturated,               // saturated channel
-      kOutOfTime,               // channel out of time
-      kLeadingEdgeRecovered,    // saturated channel: energy estimated from the leading edge before saturation
-      kHasSwitchToGain6,        // at least one data frame is in G6
-      kHasSwitchToGain1         // at least one data frame is in G1
+      kGood = -1,  // channel is good (mutually exclusive with other states)  setFlagBit(kGood) reset flags_ to zero
+      kPoorReco,   // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.)
+      kSaturated,  // saturated channel
+      kOutOfTime,  // channel out of time
+      kLeadingEdgeRecovered,  // saturated channel: energy estimated from the leading edge before saturation
+      kHasSwitchToGain6,      // at least one data frame is in G6
+      kHasSwitchToGain1       // at least one data frame is in G1
     };
-    
-    
+
     // recHit flags
-    enum RecHitFlags { 
-      RecHitFlags_kGood=0,                   // channel ok, the energy and time measurement are reliable
-      RecHitFlags_kPoorReco,                 // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2)
-      RecHitFlags_kOutOfTime,                // the energy is available from the UncalibRecHit (sync reco), but the event is out of time
-      RecHitFlags_kFaultyHardware,           // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy)
-      RecHitFlags_kNoisy,                    // the channel is very noisy
-      RecHitFlags_kPoorCalib,                // the energy is available from the UncalibRecHit, but the calibration of the channel is poor
-      RecHitFlags_kSaturated,                // saturated channel (recovery not tried)
-      RecHitFlags_kLeadingEdgeRecovered,     // saturated channel: energy estimated from the leading edge before saturation
-      RecHitFlags_kNeighboursRecovered,      // saturated/isolated dead: energy estimated from neighbours
-      RecHitFlags_kTowerRecovered,           // channel in TT with no data link, info retrieved from Trigger Primitive
-      RecHitFlags_kDead,                     // channel is dead and any recovery fails
-      RecHitFlags_kKilled,                   // MC only flag: the channel is killed in the real detector
-      RecHitFlags_kTPSaturated,              // the channel is in a region with saturated TP
-      RecHitFlags_kL1SpikeFlag,              // the channel is in a region with TP with sFGVB = 0
-      RecHitFlags_kWeird,                    // the signal is believed to originate from an anomalous deposit (spike) 
-      RecHitFlags_kDiWeird,                  // the signal is anomalous, and neighbors another anomalous signal  
-      RecHitFlags_kHasSwitchToGain6,         // at least one data frame is in G6
-      RecHitFlags_kHasSwitchToGain1,         // at least one data frame is in G1
+    enum RecHitFlags {
+      RecHitFlags_kGood = 0,  // channel ok, the energy and time measurement are reliable
+      RecHitFlags_kPoorReco,  // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2)
+      RecHitFlags_kOutOfTime,  // the energy is available from the UncalibRecHit (sync reco), but the event is out of time
+      RecHitFlags_kFaultyHardware,  // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy)
+      RecHitFlags_kNoisy,      // the channel is very noisy
+      RecHitFlags_kPoorCalib,  // the energy is available from the UncalibRecHit, but the calibration of the channel is poor
+      RecHitFlags_kSaturated,             // saturated channel (recovery not tried)
+      RecHitFlags_kLeadingEdgeRecovered,  // saturated channel: energy estimated from the leading edge before saturation
+      RecHitFlags_kNeighboursRecovered,   // saturated/isolated dead: energy estimated from neighbours
+      RecHitFlags_kTowerRecovered,        // channel in TT with no data link, info retrieved from Trigger Primitive
+      RecHitFlags_kDead,                  // channel is dead and any recovery fails
+      RecHitFlags_kKilled,                // MC only flag: the channel is killed in the real detector
+      RecHitFlags_kTPSaturated,           // the channel is in a region with saturated TP
+      RecHitFlags_kL1SpikeFlag,           // the channel is in a region with TP with sFGVB = 0
+      RecHitFlags_kWeird,                 // the signal is believed to originate from an anomalous deposit (spike)
+      RecHitFlags_kDiWeird,               // the signal is anomalous, and neighbors another anomalous signal
+      RecHitFlags_kHasSwitchToGain6,      // at least one data frame is in G6
+      RecHitFlags_kHasSwitchToGain1,      // at least one data frame is in G1
       //
-      RecHitFlags_kUnknown                   // to ease the interface with functions returning flags. 
+      RecHitFlags_kUnknown  // to ease the interface with functions returning flags.
     };
-    
-    
+
     // status code
     enum EcalChannelStatusCode_Code {
-      kOk=0,
+      kOk = 0,
       kDAC,
       kNoLaser,
       kNoisy,
@@ -72,143 +65,118 @@ namespace ecal {
       kNonRespondingIsolated,
       kDeadVFE,
       kDeadFE,
-      kNoDataNoTP      
+      kNoDataNoTP
     };
-    
-    
-    
-    
-    
-    __global__
-    void kernel_create_ecal_rehit(
-      // configuration 
-      int const* ChannelStatusToBeExcluded,
-      uint32_t ChannelStatusToBeExcludedSize,   
-      bool const killDeadChannels,
-      bool const recoverEBIsolatedChannels,
-      bool const recoverEEIsolatedChannels,
-      bool const recoverEBVFE,             
-      bool const recoverEEVFE,             
-      bool const recoverEBFE,             
-      bool const recoverEEFE,              
-      float const EBLaserMIN,
-      float const EELaserMIN,
-      float const EBLaserMAX,
-      float const EELaserMAX,
-      // for flags setting
-      int const* expanded_v_DB_reco_flags,    // FIXME AM: to be checked
-      uint32_t const* expanded_Sizes_v_DB_reco_flags,
-      uint32_t const* expanded_flagbit_v_DB_reco_flags,
-      uint32_t expanded_v_DB_reco_flagsSize,
-      uint32_t flagmask,
-      // conditions
-      float const* adc2gev,
-      float const* intercalib,
-      uint16_t const* status,
-      float const* apdpnrefs,
-      float const* alphas,
-      // input for transparency corrections
-      float const* p1,
-      float const* p2,
-      float const* p3,
-      edm::TimeValue_t const* t1,
-      edm::TimeValue_t const* t2,
-      edm::TimeValue_t const* t3,  
-      // input for linear corrections
-      float const* lp1,
-      float const* lp2,
-      float const* lp3,
-      edm::TimeValue_t const* lt1,
-      edm::TimeValue_t const* lt2,
-      edm::TimeValue_t const* lt3,                    
-      // time, used for time dependent corrections
-      edm::TimeValue_t const event_time,
-      // input
-      uint32_t const* did_eb,
-      uint32_t const* did_ee,
-      ::ecal::reco::StorageScalarType const* amplitude_eb,   // in adc counts  
-      ::ecal::reco::StorageScalarType const* amplitude_ee,   // in adc counts  
-      ::ecal::reco::StorageScalarType const* time_eb,   
-      ::ecal::reco::StorageScalarType const* time_ee,   
-      ::ecal::reco::StorageScalarType const* chi2_eb,   
-      ::ecal::reco::StorageScalarType const* chi2_ee,   
-      uint32_t const* flags_eb,
-      uint32_t const* flags_ee,
-      // output
-      uint32_t *did,
-      ::ecal::reco::StorageScalarType* energy,   // in energy [GeV]  
-      ::ecal::reco::StorageScalarType* time,  
-      ::ecal::reco::StorageScalarType* chi2,  
-      uint32_t* flagBits,
-      uint32_t* extra,
-      // other
-      int const nchannels,
-      uint32_t const nChannelsBarrel,
-      uint32_t const offsetForHashes                     
-    ) {
-      
-      
-      //       
+
+    __global__ void kernel_create_ecal_rehit(
+        // configuration
+        int const* ChannelStatusToBeExcluded,
+        uint32_t ChannelStatusToBeExcludedSize,
+        bool const killDeadChannels,
+        bool const recoverEBIsolatedChannels,
+        bool const recoverEEIsolatedChannels,
+        bool const recoverEBVFE,
+        bool const recoverEEVFE,
+        bool const recoverEBFE,
+        bool const recoverEEFE,
+        float const EBLaserMIN,
+        float const EELaserMIN,
+        float const EBLaserMAX,
+        float const EELaserMAX,
+        // for flags setting
+        int const* expanded_v_DB_reco_flags,  // FIXME AM: to be checked
+        uint32_t const* expanded_Sizes_v_DB_reco_flags,
+        uint32_t const* expanded_flagbit_v_DB_reco_flags,
+        uint32_t expanded_v_DB_reco_flagsSize,
+        uint32_t flagmask,
+        // conditions
+        float const* adc2gev,
+        float const* intercalib,
+        uint16_t const* status,
+        float const* apdpnrefs,
+        float const* alphas,
+        // input for transparency corrections
+        float const* p1,
+        float const* p2,
+        float const* p3,
+        edm::TimeValue_t const* t1,
+        edm::TimeValue_t const* t2,
+        edm::TimeValue_t const* t3,
+        // input for linear corrections
+        float const* lp1,
+        float const* lp2,
+        float const* lp3,
+        edm::TimeValue_t const* lt1,
+        edm::TimeValue_t const* lt2,
+        edm::TimeValue_t const* lt3,
+        // time, used for time dependent corrections
+        edm::TimeValue_t const event_time,
+        // input
+        uint32_t const* did_eb,
+        uint32_t const* did_ee,
+        ::ecal::reco::StorageScalarType const* amplitude_eb,  // in adc counts
+        ::ecal::reco::StorageScalarType const* amplitude_ee,  // in adc counts
+        ::ecal::reco::StorageScalarType const* time_eb,
+        ::ecal::reco::StorageScalarType const* time_ee,
+        ::ecal::reco::StorageScalarType const* chi2_eb,
+        ::ecal::reco::StorageScalarType const* chi2_ee,
+        uint32_t const* flags_eb,
+        uint32_t const* flags_ee,
+        // output
+        uint32_t* did,
+        ::ecal::reco::StorageScalarType* energy,  // in energy [GeV]
+        ::ecal::reco::StorageScalarType* time,
+        ::ecal::reco::StorageScalarType* chi2,
+        uint32_t* flagBits,
+        uint32_t* extra,
+        // other
+        int const nchannels,
+        uint32_t const nChannelsBarrel,
+        uint32_t const offsetForHashes) {
+      //
       //    NB: energy   "type_wrapper<reco::StorageScalarType, L>::type" most likely std::vector<float>
-      //       
-
-      for (int ch = threadIdx.x + blockDim.x*blockIdx.x; ch < nchannels; ch += blockDim.x*gridDim.x) {  
-        
-//       int ch = threadIdx.x + blockDim.x*blockIdx.x;
-      
-//       if (ch < nchannels) {
-      
+      //
+
+      for (int ch = threadIdx.x + blockDim.x * blockIdx.x; ch < nchannels; ch += blockDim.x * gridDim.x) {
+        //       int ch = threadIdx.x + blockDim.x*blockIdx.x;
+
+        //       if (ch < nchannels) {
+
         bool isEndcap = (ch >= nChannelsBarrel);
-        
-        int const inputCh = isEndcap
-        ? ch - nChannelsBarrel
-        : ch;
-        
-        uint32_t const * didCh = isEndcap
-        ? did_ee
-        : did_eb;
-        
+
+        int const inputCh = isEndcap ? ch - nChannelsBarrel : ch;
+
+        uint32_t const* didCh = isEndcap ? did_ee : did_eb;
+
         // only two values, EB or EE
         // AM : FIXME : why not using "isBarrel" ?    isBarrel ? adc2gev[0] : adc2gev[1]
-        float adc2gev_to_use = isEndcap
-        ? adc2gev[1]  // ee
-        : adc2gev[0]; // eb
-        
-        
+        float adc2gev_to_use = isEndcap ? adc2gev[1]   // ee
+                                        : adc2gev[0];  // eb
+
         // first EB and then EE
-        
-        ::ecal::reco::StorageScalarType const* amplitude = isEndcap
-        ? amplitude_ee
-        : amplitude_eb;
-        
-        ::ecal::reco::StorageScalarType const* time_in = isEndcap
-        ? time_ee
-        : time_eb;
-        
-        ::ecal::reco::StorageScalarType const* chi2_in = isEndcap
-        ? chi2_ee
-        : chi2_eb;
-        
-        uint32_t const* flags_in = isEndcap
-        ? flags_ee
-        : flags_eb;
-        
+
+        ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb;
+
+        ::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb;
+
+        ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? chi2_ee : chi2_eb;
+
+        uint32_t const* flags_in = isEndcap ? flags_ee : flags_eb;
+
         // simple copy
         did[ch] = didCh[inputCh];
-        
+
         auto const did_to_use = DetId{didCh[inputCh]};
-        
+
         auto const isBarrel = did_to_use.subdetId() == EcalBarrel;
-        auto const hashedId = isBarrel
-        ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId())
-        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId());
-        
+        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId())
+                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId());
+
         float const intercalib_to_use = intercalib[hashedId];
-        
-        
+
         // get laser coefficient
         float lasercalib = 1.;
-        
+
         //
         // AM: ideas
         //
@@ -217,24 +185,22 @@ namespace ecal {
         //    Then only if the LS is different, update the laser correction
         //    The variation within a LS is not worth pursuing (<< 0.1% !!)
         //    and below the precision we can claim on the laser corrections (right?).
-        //    This will save quite some time (also for the CPU version?)    
+        //    This will save quite some time (also for the CPU version?)
         //
-                
+
         int iLM = 1;
-        
+
         if (isBarrel) {
-          iLM = ecal::reconstruction::laser_monitoring_region_EB (did_to_use.rawId());
-        }
-        else {
-          iLM = ecal::reconstruction::laser_monitoring_region_EE (did_to_use.rawId());
+          iLM = ecal::reconstruction::laser_monitoring_region_EB(did_to_use.rawId());
+        } else {
+          iLM = ecal::reconstruction::laser_monitoring_region_EE(did_to_use.rawId());
         }
-        
-        
+
         long long t_i = 0, t_f = 0;
         float p_i = 0, p_f = 0;
         long long lt_i = 0, lt_f = 0;
         float lp_i = 0, lp_f = 0;
-        
+
         // laser
         if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) {
           t_i = t1[iLM - 1];
@@ -251,15 +217,14 @@ namespace ecal {
           t_f = t2[iLM - 1];
           p_i = p1[hashedId];
           p_f = p2[hashedId];
-          
+
         } else if (event_time > t3[iLM - 1]) {
           t_i = t2[iLM - 1];
           t_f = t3[iLM - 1];
           p_i = p2[hashedId];
           p_f = p3[hashedId];
         }
-        
-        
+
         // linear corrections
         if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) {
           lt_i = lt1[iLM - 1];
@@ -276,26 +241,27 @@ namespace ecal {
           lt_f = lt2[iLM - 1];
           lp_i = lp1[hashedId];
           lp_f = lp2[hashedId];
-          
+
         } else if (event_time > lt3[iLM - 1]) {
           lt_i = lt2[iLM - 1];
           lt_f = lt3[iLM - 1];
           lp_i = lp2[hashedId];
           lp_f = lp3[hashedId];
         }
-        
-        
-        // apdpnref and alpha 
+
+        // apdpnref and alpha
         float apdpnref = apdpnrefs[hashedId];
         float alpha = alphas[hashedId];
-        
+
         // now calculate transparency correction
         if (apdpnref != 0 && (t_i - t_f) != 0 && (lt_i - lt_f) != 0) {
           long long tt = event_time;  // never subtract two unsigned!
-          float interpolatedLaserResponse =   p_i / apdpnref + float(tt - t_i)  * (p_f - p_i)   / (apdpnref * float(t_f - t_i));
-          
-          float interpolatedLinearResponse = lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i));  // FIXED BY FC
-          
+          float interpolatedLaserResponse =
+              p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i));
+
+          float interpolatedLinearResponse =
+              lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i));  // FIXED BY FC
+
           if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) {
             interpolatedLinearResponse = 1.f;
           }
@@ -303,302 +269,279 @@ namespace ecal {
             // AM :  how the heck is it possible?
             //             interpolatedLaserResponse = 0.0001;
             lasercalib = 1.;
-            
-          }
-          else {
-            
+
+          } else {
             float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse;
-            
+
             // ... and now this:
-            lasercalib = 1.f / ( std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse);
-            
+            lasercalib = 1.f / (std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse);
           }
         }
-        
+
         //
         // Check for channels to be excluded from reconstruction
-        //        
+        //
         //
         // Default energy? Not to be updated if "ChannelStatusToBeExcluded"
         // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat"
         //
-        energy[ch] = -1; //---- AM: default, un-physical, ok
-        
+        energy[ch] = -1;  //---- AM: default, un-physical, ok
+
         //
-        static const int chStatusMask      = 0x1F;
+        static const int chStatusMask = 0x1F;
         // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same
-        int dbstatus = EcalChannelStatusCode_Code( (status[hashedId]) & chStatusMask );
+        int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask);
         if (ChannelStatusToBeExcludedSize != 0) {
-          for (int ich_to_check = 0; ich_to_check<ChannelStatusToBeExcludedSize; ich_to_check++) {
-            if ( ChannelStatusToBeExcluded[ich_to_check] == dbstatus ) {
-              return; 
+          for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) {
+            if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) {
+              return;
             }
           }
         }
-        
+
         // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word
-        
+
         //
         // AM: get the smaller "flagbit_counter" with match
         //
-        
+
         uint32_t temporary_flagBits = 0;
-        
+
         int iterator_flags = 0;
         bool need_to_exit = false;
         int flagbit_counter = 0;
         while (!need_to_exit) {
           iterator_flags = 0;
-          for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) { 
+          for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) {
             // check the correct "flagbit"
             if (expanded_flagbit_v_DB_reco_flags[i] == flagbit_counter) {
-              
               for (unsigned int j = 0; j < expanded_Sizes_v_DB_reco_flags[i]; j++) {
-                
-                if ( expanded_v_DB_reco_flags[iterator_flags] == dbstatus ) {
-                  temporary_flagBits =  0x1 << expanded_flagbit_v_DB_reco_flags[i];      
+                if (expanded_v_DB_reco_flags[iterator_flags] == dbstatus) {
+                  temporary_flagBits = 0x1 << expanded_flagbit_v_DB_reco_flags[i];
                   need_to_exit = true;
-                  break; // also from the big loop!!!
-                  
+                  break;  // also from the big loop!!!
                 }
                 iterator_flags++;
               }
-            }
-            else {
+            } else {
               // if not, got to the next bunch directly
               iterator_flags += expanded_Sizes_v_DB_reco_flags[i];
             }
-            
+
             if (need_to_exit) {
               break;
             }
-            
           }
-          flagbit_counter+=1;
+          flagbit_counter += 1;
         }
-        
-        
-        if ( (flagmask & temporary_flagBits) && killDeadChannels ) {
+
+        if ((flagmask & temporary_flagBits) && killDeadChannels) {
           return;
         }
-        
-        
+
         //
         flagBits[ch] = temporary_flagBits;
-        
+
         //
         // multiply the adc counts with factors to get GeV
         //
-        
+
         //         energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ;
         energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib;
-        
+
         // Time is not saved so far, FIXME
         //         time[ch] = time_in[inputCh];
-        
-        
-        if (chi2_in[inputCh] > 64) chi2[ch] = 64;
-        else chi2[ch] = chi2_in[inputCh];
-        
-        
+
+        if (chi2_in[inputCh] > 64)
+          chi2[ch] = 64;
+        else
+          chi2[ch] = chi2_in[inputCh];
+
         // NB: calculate the "flagBits extra"  --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ...
         extra[ch] = 0;
-        
+
         //
         // extra packing ...
         //
-        
+
         uint32_t offset;
         uint32_t width;
         uint32_t value;
-        
+
         float chi2_temp = chi2[ch];
-        if (chi2_temp > 64) chi2_temp = 64;
+        if (chi2_temp > 64)
+          chi2_temp = 64;
         // use 7 bits
-        uint32_t rawChi2 = lround(chi2_temp / 64. * ((1<<7)-1));
-        
+        uint32_t rawChi2 = lround(chi2_temp / 64. * ((1 << 7) - 1));
+
         offset = 0;
         width = 7;
-        value = 0; 
-        
+        value = 0;
+
         uint32_t mask = ((1 << width) - 1) << offset;
         value &= ~mask;
         value |= (rawChi2 & ((1U << width) - 1)) << offset;
-        
+
         //         extra[ch] = value;
-        //         
-        
+        //
+
         // rawEnergy is actually "error" !!!
         uint32_t rawEnergy = 0;
-        
-        
-        // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA 
+
+        // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA
         //            if you want to store this in "extra", we need first to add it to the uncalibrecHit results
         //            then it will be something like the following
         //         amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib
-        //         
-        //         
-        
-        float amplitudeError_ch = 0. ; // amplitudeError[ch];
-        
+        //
+        //
+
+        float amplitudeError_ch = 0.;  // amplitudeError[ch];
+
         if (amplitudeError_ch > 0.001) {
           //           uint16_t exponent = getPower10(amplitudeError_ch);
-          
-          static constexpr float p10[] = {1.e-2f,1.e-1f,1.f,1.e1f,1.e2f,1.e3f,1.e4f,1.e5f,1.e6f};
-          int b = amplitudeError_ch<p10[4] ? 0 : 5;
-          for (;b<9;++b) if (amplitudeError_ch<p10[b]) break;
-          
+
+          static constexpr float p10[] = {1.e-2f, 1.e-1f, 1.f, 1.e1f, 1.e2f, 1.e3f, 1.e4f, 1.e5f, 1.e6f};
+          int b = amplitudeError_ch < p10[4] ? 0 : 5;
+          for (; b < 9; ++b)
+            if (amplitudeError_ch < p10[b])
+              break;
+
           uint16_t exponent = b;
-          
-          static constexpr float ip10[] = {1.e5f,1.e4f,1.e3f,1.e2f,1.e1f,1.e0f,1.e-1f,1.e-2f,1.e-3f,1.e-4};
-          uint16_t significand = lround( amplitudeError_ch * ip10[exponent]);
+
+          static constexpr float ip10[] = {1.e5f, 1.e4f, 1.e3f, 1.e2f, 1.e1f, 1.e0f, 1.e-1f, 1.e-2f, 1.e-3f, 1.e-4};
+          uint16_t significand = lround(amplitudeError_ch * ip10[exponent]);
           // use 13 bits (3 exponent, 10 significand)
           rawEnergy = exponent << 10 | significand;
         }
-        
-        
+
         offset = 8;
         width = 13;
         // value from last change, ok
-        
+
         mask = ((1 << width) - 1) << offset;
         value &= ~mask;
         value |= (rawEnergy & ((1U << width) - 1)) << offset;
-        
+
         uint32_t jitterErrorBits = 0;
         jitterErrorBits = jitterErrorBits & 0xFF;
-        
-        
+
         offset = 24;
         width = 8;
         // value from last change, ok
-        
+
         mask = ((1 << width) - 1) << offset;
         value &= ~mask;
         value |= (jitterErrorBits & ((1U << width) - 1)) << offset;
-        
+
         //
         // now finally set "extra[ch]"
         //
         extra[ch] = value;
-        
-        
+
         //
         // additional flags setting
         //
         // using correctly the flags as calculated at the UncalibRecHit stage
         //
         // Now fill flags
-        
+
         bool good = true;
-        
-        if ( flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kLeadingEdgeRecovered) ) ) {
-          flagBits[ch]  |=  (0x1 <<  (RecHitFlags::RecHitFlags_kLeadingEdgeRecovered));  
-          good = false;          
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kLeadingEdgeRecovered))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kLeadingEdgeRecovered));
+          good = false;
         }
-        
-        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kSaturated) ) ) {
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) {
           // leading edge recovery failed - still keep the information
           // about the saturation and do not flag as dead
-          flagBits[ch]  |=  (0x1 <<  (RecHitFlags::RecHitFlags_kSaturated));  
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated));
           good = false;
         }
-        
+
         //
-        // AM: why do we have two tests one after the other checking almost the same thing??? 
+        // AM: why do we have two tests one after the other checking almost the same thing???
         // Please clean up the code, ... also the original one!
-        //        
-        // uncalibRH.isSaturated() ---> 
-        //         
+        //
+        // uncalibRH.isSaturated() --->
+        //
         //                                   bool EcalUncalibratedRecHit::isSaturated() const {
         //                                     return EcalUncalibratedRecHit::checkFlag(kSaturated);
         //                                   }
         //
         //
-        
-        if ( flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kSaturated) ) ) {
-          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kSaturated));  
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated));
           good = false;
         }
-        
-        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kOutOfTime) ) ) {
-          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kOutOfTime));
+
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kOutOfTime))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime));
           good = false;
         }
-        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kPoorReco) ) ) {
-          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kPoorReco));
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kPoorReco))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco));
           good = false;
         }
-        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain6) ) ) {
-          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kHasSwitchToGain6));
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain6))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6));
         }
-        if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain1) ) ) {
-          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kHasSwitchToGain1));
+        if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain1))) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1));
         }
-        
-        
+
         if (good) {
           flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kGood));
         }
-        
-        if (isBarrel  && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) {
-          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kPoorCalib));
-          
+
+        if (isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) {
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib));
         }
         if (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX)) {
-          flagBits[ch]  |= (0x1 <<  (RecHitFlags::RecHitFlags_kPoorCalib));
+          flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib));
         }
-        
-        
-  
+
         // recover, killing, and other stuff
-    
-    //
-    // Structure:
-    //  EB
-    //  EE
-    //
-    //
-    //  - single MVA
-    //  - democratic sharing
-    //  - kill all the other cases
-    //
-    
+
+        //
+        // Structure:
+        //  EB
+        //  EE
+        //
+        //
+        //  - single MVA
+        //  - democratic sharing
+        //  - kill all the other cases
+        //
+
         bool is_Single = false;
-        bool is_FE     = false;
-        bool is_VFE    = false;
-        
-        bool is_recoverable = false; // DetIdToBeRecovered
-        
-        if ( dbstatus == 10 ||  dbstatus == 11 ||  dbstatus == 12 ) {
+        bool is_FE = false;
+        bool is_VFE = false;
+
+        bool is_recoverable = false;  // DetIdToBeRecovered
+
+        if (dbstatus == 10 || dbstatus == 11 || dbstatus == 12) {
           is_recoverable = true;
         }
-        
-        
+
         if (is_recoverable) {
           if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
             is_VFE = true;
-          }
-          else if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
+          } else if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
             is_FE = true;
-          }
-          else {
+          } else {
             is_Single = true;
           }
-          
-          
+
           // EB
           if (isBarrel) {
-            if (is_Single || is_FE || is_VFE) {           
+            if (is_Single || is_FE || is_VFE) {
               // single MVA
-              if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels) ) {
-               
-                  
+              if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) {
               }
               // decmocratic sharing
-              else if (is_FE && (recoverEBFE || !killDeadChannels) ) {
-               
-                
+              else if (is_FE && (recoverEBFE || !killDeadChannels)) {
               }
               // kill all the other cases
               else {
@@ -607,20 +550,17 @@ namespace ecal {
             }
           }
           // EE
-          else { 
-            if (is_Single || is_FE || is_VFE) {           
+          else {
+            if (is_Single || is_FE || is_VFE) {
               // single MVA
-              if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels) ) {
-                
-                
+              if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) {
               }
               // decmocratic sharing
-              else if (is_FE && (recoverEBFE || !killDeadChannels) ) {
-                  
-                //                
-                //  Code is definitely too long ...              
-                //                
-                
+              else if (is_FE && (recoverEBFE || !killDeadChannels)) {
+                //
+                //  Code is definitely too long ...
+                //
+
               }
               // kill all the other cases
               else {
@@ -628,115 +568,102 @@ namespace ecal {
               }
             }
           }
-          
-        }   
-    
-  
-      } // end channel
-      
+        }
+
+      }  // end channel
     }
-    
-    
-    
+
     // host version, to be called by the plugin
-    void create_ecal_rehit(
-      EventInputDataGPU const& eventInputGPU,
-      EventOutputDataGPU&      eventOutputGPU,
-      //     eventDataForScratchGPU_,
-      ConditionsProducts const& conditions, 
-      ConfigurationParameters const& configParameters,
-      uint32_t const  nChannelsBarrel,
-      edm::TimeValue_t const event_time,
-      cudaStream_t cudaStream
-    ){
-      
-      int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size ;
-      
-//       unsigned int nchannels_per_block = 32;
+    void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+                           EventOutputDataGPU& eventOutputGPU,
+                           //     eventDataForScratchGPU_,
+                           ConditionsProducts const& conditions,
+                           ConfigurationParameters const& configParameters,
+                           uint32_t const nChannelsBarrel,
+                           edm::TimeValue_t const event_time,
+                           cudaStream_t cudaStream) {
+      int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size;
+
+      //       unsigned int nchannels_per_block = 32;
       unsigned int nchannels_per_block = 16;
       unsigned int threads_min = nchannels_per_block;
-      unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min; // TEST : to be optimized (AM)
-      
-      // 
+      unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min;  // TEST : to be optimized (AM)
+
+      //
       // kernel create rechit
       //
-      
-//       auto const nbytesShared = 2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
-      
-      kernel_create_ecal_rehit <<< blocks_min, threads_min, 0, cudaStream >>> (
-//       kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> (
-//       kernel_create_ecal_rehit <<< blocks_min, threads_min >>> (
-        // configuration 
-        configParameters.ChannelStatusToBeExcluded,
-        configParameters.ChannelStatusToBeExcludedSize,
-        configParameters.killDeadChannels,
-        configParameters.recoverEBIsolatedChannels,
-        configParameters.recoverEEIsolatedChannels,
-        configParameters.recoverEBVFE,             
-        configParameters.recoverEEVFE,             
-        configParameters.recoverEBFE,             
-        configParameters.recoverEEFE,              
-        configParameters.EBLaserMIN,
-        configParameters.EELaserMIN,
-        configParameters.EBLaserMAX,
-        configParameters.EELaserMAX,
-        // for flags setting
-        configParameters.expanded_v_DB_reco_flags,
-        configParameters.expanded_Sizes_v_DB_reco_flags,
-        configParameters.expanded_flagbit_v_DB_reco_flags,
-        configParameters.expanded_v_DB_reco_flagsSize,
-        configParameters.flagmask,
-        // conditions
-        conditions.ADCToGeV.adc2gev,
-        conditions.Intercalib.values,  
-        conditions.ChannelStatus.status,  
-        conditions.LaserAPDPNRatiosRef.values,  
-        conditions.LaserAlphas.values,  
-        // input for transparency corrections
-        conditions.LaserAPDPNRatios.p1,
-        conditions.LaserAPDPNRatios.p2,
-        conditions.LaserAPDPNRatios.p3,
-        conditions.LaserAPDPNRatios.t1,
-        conditions.LaserAPDPNRatios.t2,
-        conditions.LaserAPDPNRatios.t3,
-        // input for linear corrections
-        conditions.LinearCorrections.p1,
-        conditions.LinearCorrections.p2,
-        conditions.LinearCorrections.p3,
-        conditions.LinearCorrections.t1,
-        conditions.LinearCorrections.t2,
-        conditions.LinearCorrections.t3,
-        // time, used for time dependent corrections
-        event_time,
-        // input
-        eventInputGPU.ebUncalibRecHits.did,
-        eventInputGPU.eeUncalibRecHits.did,
-        eventInputGPU.ebUncalibRecHits.amplitude, 
-        eventInputGPU.eeUncalibRecHits.amplitude, 
-        eventInputGPU.ebUncalibRecHits.jitter, 
-        eventInputGPU.eeUncalibRecHits.jitter, 
-        eventInputGPU.ebUncalibRecHits.chi2, 
-        eventInputGPU.eeUncalibRecHits.chi2, 
-        eventInputGPU.ebUncalibRecHits.flags, 
-        eventInputGPU.eeUncalibRecHits.flags, 
-        // output
-        eventOutputGPU.did,
-        eventOutputGPU.energy,
-        eventOutputGPU.time,
-        eventOutputGPU.chi2,
-        eventOutputGPU.flagBits,
-        eventOutputGPU.extra,
-        // other
-        nchannels,
-        nChannelsBarrel,
-        conditions.offsetForHashes
-      );
-      
-      
-      
-    }  
-    
-    
-  }
-  
-}
+
+      //       auto const nbytesShared = 2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
+
+      kernel_create_ecal_rehit<<<blocks_min, threads_min, 0, cudaStream>>>(
+          //       kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> (
+          //       kernel_create_ecal_rehit <<< blocks_min, threads_min >>> (
+          // configuration
+          configParameters.ChannelStatusToBeExcluded,
+          configParameters.ChannelStatusToBeExcludedSize,
+          configParameters.killDeadChannels,
+          configParameters.recoverEBIsolatedChannels,
+          configParameters.recoverEEIsolatedChannels,
+          configParameters.recoverEBVFE,
+          configParameters.recoverEEVFE,
+          configParameters.recoverEBFE,
+          configParameters.recoverEEFE,
+          configParameters.EBLaserMIN,
+          configParameters.EELaserMIN,
+          configParameters.EBLaserMAX,
+          configParameters.EELaserMAX,
+          // for flags setting
+          configParameters.expanded_v_DB_reco_flags,
+          configParameters.expanded_Sizes_v_DB_reco_flags,
+          configParameters.expanded_flagbit_v_DB_reco_flags,
+          configParameters.expanded_v_DB_reco_flagsSize,
+          configParameters.flagmask,
+          // conditions
+          conditions.ADCToGeV.adc2gev,
+          conditions.Intercalib.values,
+          conditions.ChannelStatus.status,
+          conditions.LaserAPDPNRatiosRef.values,
+          conditions.LaserAlphas.values,
+          // input for transparency corrections
+          conditions.LaserAPDPNRatios.p1,
+          conditions.LaserAPDPNRatios.p2,
+          conditions.LaserAPDPNRatios.p3,
+          conditions.LaserAPDPNRatios.t1,
+          conditions.LaserAPDPNRatios.t2,
+          conditions.LaserAPDPNRatios.t3,
+          // input for linear corrections
+          conditions.LinearCorrections.p1,
+          conditions.LinearCorrections.p2,
+          conditions.LinearCorrections.p3,
+          conditions.LinearCorrections.t1,
+          conditions.LinearCorrections.t2,
+          conditions.LinearCorrections.t3,
+          // time, used for time dependent corrections
+          event_time,
+          // input
+          eventInputGPU.ebUncalibRecHits.did,
+          eventInputGPU.eeUncalibRecHits.did,
+          eventInputGPU.ebUncalibRecHits.amplitude,
+          eventInputGPU.eeUncalibRecHits.amplitude,
+          eventInputGPU.ebUncalibRecHits.jitter,
+          eventInputGPU.eeUncalibRecHits.jitter,
+          eventInputGPU.ebUncalibRecHits.chi2,
+          eventInputGPU.eeUncalibRecHits.chi2,
+          eventInputGPU.ebUncalibRecHits.flags,
+          eventInputGPU.eeUncalibRecHits.flags,
+          // output
+          eventOutputGPU.did,
+          eventOutputGPU.energy,
+          eventOutputGPU.time,
+          eventOutputGPU.chi2,
+          eventOutputGPU.flagBits,
+          eventOutputGPU.extra,
+          // other
+          nchannels,
+          nChannelsBarrel,
+          conditions.offsetForHashes);
+    }
+
+  }  // namespace rechit
+
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
index 30bc589a9a5c2..f0816257eb61e 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
@@ -10,89 +10,81 @@
 
 #include "DataFormats/Provenance/interface/Timestamp.h"
 
-
-namespace ecal { 
+namespace ecal {
   namespace rechit {
-    
-    
-    __global__
-    void kernel_create_ecal_rehit(
-      // configuration 
-      int const* ChannelStatusToBeExcluded,
-      uint32_t ChannelStatusToBeExcludedSize, 
-      bool killDeadChannels,
-      bool const recoverEBIsolatedChannels,
-      bool const recoverEEIsolatedChannels,
-      bool const recoverEBVFE,             
-      bool const recoverEEVFE,             
-      bool const recoverEBFE,             
-      bool const recoverEEFE,
-      // for flags setting
-      int const* expanded_v_DB_reco_flags,
-      uint32_t const* expanded_Sizes_v_DB_reco_flags,
-      uint32_t const* expanded_flagbit_v_DB_reco_flags,
-      uint32_t expanded_v_DB_reco_flagsSize,
-      uint32_t flagmask,
-      // conditions
-      float const* adc2gev,
-      float const* intercalib,
-      uint16_t const* status,
-      float const* apdpnrefs,
-      float const* alphas,
-      // input for transparency corrections
-      float const* p1,
-      float const* p2,
-      float const* p3,
-      edm::TimeValue_t const* t1,
-      edm::TimeValue_t const* t2,
-      edm::TimeValue_t const* t3,  
-      // input for linear corrections
-      float const* lp1,
-      float const* lp2,
-      float const* lp3,
-      edm::TimeValue_t const* lt1,
-      edm::TimeValue_t const* lt2,
-      edm::TimeValue_t const* lt3,                    
-      // time, used for time dependent corrections
-      edm::TimeValue_t const event_time,
-      // input
-      uint32_t const* did_eb,
-      uint32_t const* did_ee,
-      ::ecal::reco::StorageScalarType const* amplitude_eb,   // in adc counts  
-      ::ecal::reco::StorageScalarType const* amplitude_ee,   // in adc counts  
-      ::ecal::reco::StorageScalarType const* time_eb,   
-      ::ecal::reco::StorageScalarType const* time_ee,   
-      ::ecal::reco::StorageScalarType const* chi2_eb,   
-      ::ecal::reco::StorageScalarType const* chi2_ee,   
-      uint32_t const* flags_eb,   
-      uint32_t const* flags_ee,   
-      // output
-      uint32_t *did,
-      ::ecal::reco::StorageScalarType* energy,   // in energy [GeV]  
-      ::ecal::reco::StorageScalarType* time,  
-      ::ecal::reco::StorageScalarType* chi2,  
-      uint32_t* flagBits,
-      uint32_t* extra,
-      int const nchannels,
-      uint32_t const nChannelsBarrel,
-      uint32_t const offsetForHashes  
-    );
-    
-    
+
+    __global__ void kernel_create_ecal_rehit(
+        // configuration
+        int const* ChannelStatusToBeExcluded,
+        uint32_t ChannelStatusToBeExcludedSize,
+        bool killDeadChannels,
+        bool const recoverEBIsolatedChannels,
+        bool const recoverEEIsolatedChannels,
+        bool const recoverEBVFE,
+        bool const recoverEEVFE,
+        bool const recoverEBFE,
+        bool const recoverEEFE,
+        // for flags setting
+        int const* expanded_v_DB_reco_flags,
+        uint32_t const* expanded_Sizes_v_DB_reco_flags,
+        uint32_t const* expanded_flagbit_v_DB_reco_flags,
+        uint32_t expanded_v_DB_reco_flagsSize,
+        uint32_t flagmask,
+        // conditions
+        float const* adc2gev,
+        float const* intercalib,
+        uint16_t const* status,
+        float const* apdpnrefs,
+        float const* alphas,
+        // input for transparency corrections
+        float const* p1,
+        float const* p2,
+        float const* p3,
+        edm::TimeValue_t const* t1,
+        edm::TimeValue_t const* t2,
+        edm::TimeValue_t const* t3,
+        // input for linear corrections
+        float const* lp1,
+        float const* lp2,
+        float const* lp3,
+        edm::TimeValue_t const* lt1,
+        edm::TimeValue_t const* lt2,
+        edm::TimeValue_t const* lt3,
+        // time, used for time dependent corrections
+        edm::TimeValue_t const event_time,
+        // input
+        uint32_t const* did_eb,
+        uint32_t const* did_ee,
+        ::ecal::reco::StorageScalarType const* amplitude_eb,  // in adc counts
+        ::ecal::reco::StorageScalarType const* amplitude_ee,  // in adc counts
+        ::ecal::reco::StorageScalarType const* time_eb,
+        ::ecal::reco::StorageScalarType const* time_ee,
+        ::ecal::reco::StorageScalarType const* chi2_eb,
+        ::ecal::reco::StorageScalarType const* chi2_ee,
+        uint32_t const* flags_eb,
+        uint32_t const* flags_ee,
+        // output
+        uint32_t* did,
+        ::ecal::reco::StorageScalarType* energy,  // in energy [GeV]
+        ::ecal::reco::StorageScalarType* time,
+        ::ecal::reco::StorageScalarType* chi2,
+        uint32_t* flagBits,
+        uint32_t* extra,
+        int const nchannels,
+        uint32_t const nChannelsBarrel,
+        uint32_t const offsetForHashes);
+
     // host version, to be called by the plugin
-    
-    void create_ecal_rehit(
-      EventInputDataGPU const& eventInputGPU,
-      EventOutputDataGPU&      eventOutputGPU,
-      //     eventDataForScratchGPU_,
-      ConditionsProducts const& conditions, 
-      ConfigurationParameters const& configParameters,
-      uint32_t const nChannelsBarrel, 
-      edm::TimeValue_t const event_time,
-      cudaStream_t cudaStream
-    );
-    
-  }
-  
-}
 
+    void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+                           EventOutputDataGPU& eventOutputGPU,
+                           //     eventDataForScratchGPU_,
+                           ConditionsProducts const& conditions,
+                           ConfigurationParameters const& configParameters,
+                           uint32_t const nChannelsBarrel,
+                           edm::TimeValue_t const event_time,
+                           cudaStream_t cudaStream);
+
+  }  // namespace rechit
+
+}  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
index 3824b0989f622..5f01068f95186 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc
@@ -3,36 +3,31 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalRechitADCToGeVConstantGPU::EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const& values) 
-: adc2gev_(2)  // size is 2, one form EB and one for EE
+EcalRechitADCToGeVConstantGPU::EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const& values)
+    : adc2gev_(2)  // size is 2, one form EB and one for EE
 {
   adc2gev_[0] = values.getEBValue();
-  adc2gev_[1] = values.getEEValue(); 
+  adc2gev_[1] = values.getEEValue();
 }
 
 EcalRechitADCToGeVConstantGPU::Product::~Product() {
   // deallocation
-  cudaCheck( cudaFree(adc2gev) );
+  cudaCheck(cudaFree(adc2gev));
 }
 
-EcalRechitADCToGeVConstantGPU::Product const& EcalRechitADCToGeVConstantGPU::getProduct(
-  cudaStream_t cudaStream) const 
-{
+EcalRechitADCToGeVConstantGPU::Product const& EcalRechitADCToGeVConstantGPU::getProduct(cudaStream_t cudaStream) const {
   auto const& product = product_.dataForCurrentDeviceAsync(
-                   cudaStream,
-                   [this](EcalRechitADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) {
-                         // malloc
-                         cudaCheck( cudaMalloc((void**)&product.adc2gev,
-                                               this->adc2gev_.size() * sizeof(float)) );
-                         // transfer 
-                         cudaCheck( cudaMemcpyAsync(product.adc2gev,
-                                                    this->adc2gev_.data(),
-                                                    this->adc2gev_.size() * sizeof(float),
-                                                    cudaMemcpyHostToDevice,
-                                                    cudaStream) );
-                   }
-  );
-  
+      cudaStream, [this](EcalRechitADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) {
+        // malloc
+        cudaCheck(cudaMalloc((void**)&product.adc2gev, this->adc2gev_.size() * sizeof(float)));
+        // transfer
+        cudaCheck(cudaMemcpyAsync(product.adc2gev,
+                                  this->adc2gev_.data(),
+                                  this->adc2gev_.size() * sizeof(float),
+                                  cudaMemcpyHostToDevice,
+                                  cudaStream));
+      });
+
   return product;
 }
 
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
index 7f38a23ec9168..1e6801fbd326a 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc
@@ -3,47 +3,40 @@
 #include "FWCore/Utilities/interface/typelookup.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) 
-: status_(values.size())
-{  
+EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) : status_(values.size()) {
   // fill in eb
   auto const& barrelValues = values.barrelItems();
-  for (unsigned int i=0; i<barrelValues.size(); i++) {
+  for (unsigned int i = 0; i < barrelValues.size(); i++) {
     status_[i] = barrelValues[i].getEncodedStatusCode();
   }
-  
+
   // fill in ee
   auto const& endcapValues = values.endcapItems();
   auto const offset = barrelValues.size();
-  for (unsigned int i=0; i<endcapValues.size(); i++) {
+  for (unsigned int i = 0; i < endcapValues.size(); i++) {
     status_[offset + i] = endcapValues[i].getEncodedStatusCode();
   }
 }
 
 EcalRechitChannelStatusGPU::Product::~Product() {
   // deallocation
-  cudaCheck( cudaFree(status) );
+  cudaCheck(cudaFree(status));
 }
 
-EcalRechitChannelStatusGPU::Product const& EcalRechitChannelStatusGPU::getProduct(
-  cudaStream_t cudaStream) const
-  {
-    auto const& product = product_.dataForCurrentDeviceAsync(cudaStream,
-                                                             [this](EcalRechitChannelStatusGPU::Product& product, cudaStream_t cudaStream) {
-                                                               // malloc
-                                                               cudaCheck( cudaMalloc((void**)&product.status,
-                                                                                     this->status_.size() * sizeof(uint16_t)) );
-                                                               // transfer 
-                                                               cudaCheck( cudaMemcpyAsync(product.status,
-                                                                                          this->status_.data(),
-                                                                                          this->status_.size() * sizeof(uint16_t),
-                                                                                          cudaMemcpyHostToDevice,
-                                                                                          cudaStream) );
-                                                             }
-    );
-    
-    return product;
-  }
-  
-  TYPELOOKUP_DATA_REG(EcalRechitChannelStatusGPU);
-  
\ No newline at end of file
+// Returns the Product obtained from product_.dataForCurrentDeviceAsync for the given stream.
+// The callback handed to it allocates Product::status with cudaMalloc and enqueues an
+// asynchronous host-to-device copy of status_ on the stream it receives.
+EcalRechitChannelStatusGPU::Product const& EcalRechitChannelStatusGPU::getProduct(cudaStream_t cudaStream) const {
+  auto const upload = [this](EcalRechitChannelStatusGPU::Product& deviceProduct, cudaStream_t stream) {
+    // both the allocation and the copy cover the whole status_ vector
+    auto const nbytes = status_.size() * sizeof(uint16_t);
+
+    cudaCheck(cudaMalloc((void**)&deviceProduct.status, nbytes));
+    cudaCheck(cudaMemcpyAsync(deviceProduct.status, status_.data(), nbytes, cudaMemcpyHostToDevice, stream));
+  };
+
+  auto const& deviceData = product_.dataForCurrentDeviceAsync(cudaStream, upload);
+  return deviceData;
+}
+
+TYPELOOKUP_DATA_REG(EcalRechitChannelStatusGPU);
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
index e4e1a59565e0d..c9d023deb8824 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu
@@ -5,19 +5,17 @@
 
 namespace ecal {
   namespace reconstruction {
-    
+
     namespace internal {
-      
+
       namespace barrel {
-        
+
         __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; }
-        
+
         __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; }
-        
+
         __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; }
-        
-        
-        
+
         __device__ int dccFromSm(int ism) {
           int iz = 1;
           if (ism > 18)
@@ -27,9 +25,9 @@ namespace ecal {
           int idcc = 9 + ism;
           if (iz == +1)
             idcc += 18;
-          return idcc;  
+          return idcc;
         }
-        
+
         __device__ int sm(int ieta, int iphi) {
           int iz = 1;
           if (ieta < 0)
@@ -43,36 +41,28 @@ namespace ecal {
             ism += 18;
           return ism;
         }
-        
-        
+
         __device__ int dcc(int ieta, int iphi) {
           int ism = sm(ieta, iphi);
           return dccFromSm(ism);
         }
-        
-        
-        
-        
-        //        
+
+        //
         // ---- why on hell things are so complex and not simple ???
-        //        
-        
-        
-        __device__ int lm_channel (int iX, int iY) {
-          
+        //
+
+        __device__ int lm_channel(int iX, int iY) {
           static const int idx_[] = {
-            // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
-            1, 2, 2, 2, 2, 4, 4, 4, 4,
-            6, 6, 6, 6, 8, 8, 8, 8,  // 3
-            1, 2, 2, 2, 2, 4, 4, 4, 4,
-            6, 6, 6, 6, 8, 8, 8, 8,  // 2
-            1, 3, 3, 3, 3, 5, 5, 5, 5,
-            7, 7, 7, 7, 9, 9, 9, 9,  // 1
-            1, 3, 3, 3, 3, 5, 5, 5, 5,
-            7, 7, 7, 7, 9, 9, 9, 9  // 0
-            // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+              // clang-format off
+         // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+            1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8,  // 3
+            1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8,  // 2
+            1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9,  // 1
+            1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9  // 0
+         // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+              // clang-format on
           };
-          
+
           int il, ic, ii;
           const int iym = 4;
           const int ixm = 17;
@@ -85,15 +75,12 @@ namespace ecal {
             return -1;
           };
           return idx_[ii];
-          
         }
-        
-        
-        
-        __device__ int localCoord_x (int ieta, int iphi) {
+
+        __device__ int localCoord_x(int ieta, int iphi) {
           int iz = 1;
           if (ieta < 0) {
-            iz = -1; 
+            iz = -1;
           }
           ieta *= iz;
           //   int iphi_ = iphi;
@@ -105,15 +92,14 @@ namespace ecal {
           //   if (iz == -1) {
           //     iy = 19 - iy;
           //   }
-          
+
           return ix;
         }
-        
-        
-        __device__ int localCoord_y (int ieta, int iphi) {
+
+        __device__ int localCoord_y(int ieta, int iphi) {
           int iz = 1;
           if (ieta < 0) {
-            iz = -1; 
+            iz = -1;
           }
           //   ieta *= iz;
           int iphi_ = iphi;
@@ -125,94 +111,80 @@ namespace ecal {
           if (iz == -1) {
             iy = 19 - iy;
           }
-          
+
           return iy;
         }
-        
-        
-        __device__ int lmmod (int ieta, int iphi) {
-          
+
+        __device__ int lmmod(int ieta, int iphi) {
           int ix = localCoord_x(ieta, iphi);
           int iy = localCoord_y(ieta, iphi);
-          
+
           return lm_channel(ix / 5, iy / 5);
         }
-        
-        
-        
-        __device__ int side (int ieta, int iphi) {
+
+        __device__ int side(int ieta, int iphi) {
           int ilmmod = lmmod(ieta, iphi);
           return (ilmmod % 2 == 0) ? 1 : 0;
         }
-        
-        
-        
+
       }  // namespace barrel
-      
+
     }  // namespace internal
-    
+
     __device__ uint32_t hashedIndexEB(uint32_t id) {
       using namespace internal::barrel;
       return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
     }
-    
-    
-    
-    // 
+
+    //
     // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc
     //  function: "lmr"
-    
-    __device__ 
-    int laser_monitoring_region_EB(uint32_t id) {
+
+    __device__ int laser_monitoring_region_EB(uint32_t id) {
       using namespace internal::barrel;
-      
+
       int ieta;
       if (positiveZ(id)) {
         ieta = ietaAbs(id);
+      } else {
+        ieta = -ietaAbs(id);
       }
-      else {
-        ieta = - ietaAbs(id);            
-      }
-      
-      int idcc = dcc(ieta, (int) (iphi(id)) );
+
+      int idcc = dcc(ieta, (int)(iphi(id)));
       int ism = idcc - 9;
-      
-      int iside = side(ieta, (int) (iphi(id)) );
+
+      int iside = side(ieta, (int)(iphi(id)));
       //   int iside = positiveZ(id) ? 1 : 0;
-      
-      return ( 1 + 2 * (ism - 1) + iside );
+
+      return (1 + 2 * (ism - 1) + iside);
       //   return ieta;
       //   return (int) (iphi(id));
       //   return idcc;
       //   return iside;
-      
     }
-    
-    
-    
-    
+
     namespace internal {
-      
+
       namespace endcap {
-        
+
         __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; }
-        
+
         __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; }
-        
+
         __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; }
-        
+
         // these constants come from EE Det Id
         __constant__ const unsigned short kxf[] = {
-          41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21,
-          51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51,
-          6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,
-          51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62,
-          1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60, 1,  59, 1,  58, 4,  56, 4,  51, 4,
-          51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51,
-          9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21,
-          51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
-          
-          __constant__ const unsigned short kdi[] = {
+            41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21,
+            51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9,  51, 9,  51, 9,  51, 9,  51, 9,  51,
+            6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 4,  51, 4,  51, 4,
+            51, 4,  51, 4,  56, 1,  58, 1,  59, 1,  60, 1,  61, 1,  61, 1,  62, 1,  62, 1,  62, 1,  62, 1,  62,
+            1,  62, 1,  62, 1,  62, 1,  62, 1,  62, 1,  61, 1,  61, 1,  60, 1,  59, 1,  58, 4,  56, 4,  51, 4,
+            51, 4,  51, 4,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51, 6,  51,
+            9,  51, 9,  51, 9,  51, 9,  51, 9,  51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21,
+            51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
+
+        __constant__ const unsigned short kdi[] = {
             0,    10,   20,   30,   40,   50,   60,   75,   90,   105,  120,  145,  170,  195,  220,  245,  270,
             300,  330,  360,  390,  420,  450,  480,  510,  540,  570,  605,  640,  675,  710,  747,  784,  821,
             858,  895,  932,  969,  1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500,
@@ -225,137 +197,112 @@ namespace ecal {
             5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577,
             6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104,
             7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
-            
-            
-            __device__ int quadrant(int iX, int iY) {
-              bool near = iX >= 11;
-              bool far = !near;
-              bool top = iY >= 11;
-              bool bot = !top;
-              
-              int iquad = 0;
-              if (near && top)
-                iquad = 1;
-              if (far && top)
-                iquad = 2;
-              if (far && bot)
-                iquad = 3;
-              if (near && bot)
-                iquad = 4;
-              
-              return iquad;
-            }
-            
-            __device__ int sector(int iX, int iY) {
-              //  Y (towards the surface)
-              //  T
-              //  |
-              //  |
-              //  |
-              //  o---------| X  (towards center of LHC)
-              //
-              static const int idx_[] = {
-                // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
-                0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9,
-                9, 9, 0, 0, 0, 0, 0, 0, 0,  // 20
-                0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9,
-                9, 9, 9, 9, 9, 0, 0, 0, 0,  // 19
-                0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9,
-                9, 9, 9, 9, 9, 8, 0, 0, 0,  // 18
-                0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9,
-                9, 9, 9, 9, 8, 8, 8, 0, 0,  // 17
-                0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9,
-                9, 9, 9, 9, 8, 8, 8, 8, 0,  // 16
-                0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9,
-                9, 9, 9, 8, 8, 8, 8, 8, 0,  // 15
-                0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9,
-                9, 9, 8, 8, 8, 8, 8, 8, 0,  // 14
-                2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9,
-                9, 8, 8, 8, 8, 8, 8, 8, 8,  // 13
-                3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0,
-                8, 8, 8, 8, 8, 8, 8, 7, 7,  // 12
-                3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0,
-                0, 8, 7, 7, 7, 7, 7, 7, 7,  // 11
-                3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0,
-                0, 7, 7, 7, 7, 7, 7, 7, 7,  // 10
-                3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0,
-                6, 6, 7, 7, 7, 7, 7, 7, 7,  // 9
-                3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5,
-                6, 6, 6, 7, 7, 7, 7, 7, 7,  // 8
-                0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5,
-                6, 6, 6, 6, 6, 7, 7, 7, 0,  // 7
-                0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5,
-                5, 6, 6, 6, 6, 6, 6, 7, 0,  // 6
-                0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
-                5, 6, 6, 6, 6, 6, 6, 6, 0,  // 5
-                0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5,
-                5, 6, 6, 6, 6, 6, 6, 0, 0,  // 4
-                0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5,
-                5, 5, 6, 6, 6, 6, 0, 0, 0,  // 3
-                0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5,
-                5, 5, 6, 6, 6, 0, 0, 0, 0,  // 2
-                0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5,
-                5, 5, 0, 0, 0, 0, 0, 0, 0  // 1
-                // 1  2  3  4  5  6  7  8  9 10   11 12 13 14 15 16 17 18 19 20
-              };
-              
-              int iym, ixm, il, ic, ii;
-              iym = 20;
-              ixm = 20;
-              int iX_ = iX;
-              int iY_ = iY;
-              il = iym - iY_;
-              ic = iX_ - 1;
-              ii = il * ixm + ic;
-              
-              if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) {
-                return -1;
-              };
-              return idx_[ii];
-            }
-            
-            
-            
+
+        // Quadrant number for the coordinates (iX, iY):
+        //   1: iX >= 11 and iY >= 11
+        //   2: iX <  11 and iY >= 11
+        //   3: iX <  11 and iY <  11
+        //   4: iX >= 11 and iY <  11
+        __device__ int quadrant(int iX, int iY) {
+          const bool nearSide = iX >= 11;
+          const bool topHalf = iY >= 11;
+
+          // the four cases are mutually exclusive and exhaustive, so exactly one value is selected
+          int iquad;
+          if (topHalf) {
+            iquad = nearSide ? 1 : 2;
+          } else {
+            iquad = nearSide ? 4 : 3;
+          }
+          return iquad;
+        }
+
+        // Returns the sector number stored in the 20x20 table below for coordinates (iX, iY), both 1..20.
+        // Returns -1 when the flattened index falls outside the table or the stored entry is 0.
+        __device__ int sector(int iX, int iY) {
+          //  Y (towards the surface)
+          //  T
+          //  |
+          //  |
+          //  |
+          //  o---------| X  (towards center of LHC)
+          static const int idx_[] = {
+              // clang-format off
+             // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+                0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0,  // 20
+                0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0,  // 19
+                0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 8, 0, 0, 0,  // 18
+                0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 0, 0,  // 17
+                0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 8, 0,  // 16
+                0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, 9, 9, 9, 8, 8, 8, 8, 8, 0,  // 15
+                0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, 9, 9, 8, 8, 8, 8, 8, 8, 0,  // 14
+                2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8,  // 13
+                3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 8, 8, 8, 8, 8, 8, 7, 7,  // 12
+                3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 8, 7, 7, 7, 7, 7, 7, 7,  // 11
+                3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7,  // 10
+                3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, 6, 6, 7, 7, 7, 7, 7, 7, 7,  // 9
+                3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7,  // 8
+                0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 0,  // 7
+                0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 0,  // 6
+                0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 0,  // 5
+                0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 0,  // 4
+                0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 0, 0, 0,  // 3
+                0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 0, 0,  // 2
+                0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0   // 1
+             // 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
+              // clang-format on
+          };
+
+          // rows are stored top-down (first row is iY = 20), columns left to right (first is iX = 1)
+          const int ixm = 20;
+          const int iym = 20;
+          const int nEntries = (int)(sizeof(idx_) / sizeof(int));
+          const int ii = (iym - iY) * ixm + (iX - 1);
+
+          // ii == nEntries (e.g. iX = 1, iY = 0) is already one past the last entry, so it has to be
+          // rejected as well; a plain "ii > nEntries" comparison would let it through.
+          if (ii < 0 || ii >= nEntries || idx_[ii] == 0) {
+            return -1;
+          }
+          return idx_[ii];
+        }
+
       }  // namespace endcap
-      
+
     }  // namespace internal
-    
+
     __device__ uint32_t hashedIndexEE(uint32_t id) {
       using namespace internal::endcap;
-      
+
       const uint32_t jx(ix(id));
       const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50);
       return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]);
     }
-    
-    
-    
-    
-    // 
+
+    //
     // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc
     // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc
-    // 
-    
-    __device__ 
-    int laser_monitoring_region_EE(uint32_t id) {
+    //
+
+    __device__ int laser_monitoring_region_EE(uint32_t id) {
       using namespace internal::endcap;
-      
+
       // SuperCrysCoord
       uint32_t iX = (ix(id) - 1) / 5 + 1;
       uint32_t iY = (iy(id) - 1) / 5 + 1;
-      
-      // Correct convention 
+
+      // Correct convention
       //   * @param iz iz/zside index: -1 for EE-, +1 for EE+
       //   https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71
       //   zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63
-      //   
+      //
       int iz = positiveZ(id) ? 1 : -1;
-      
+
       int iquad = quadrant(iX, iY);
       int isect = sector(iX, iY);
       if (isect < 0)
         return -1;
-      
+
       int ilmr = 0;
       ilmr = isect - 6;
       if (ilmr <= 0)
@@ -368,14 +315,9 @@ namespace ecal {
         ilmr += 72;
       else
         ilmr += 82;
-      
+
       return ilmr;
-      
     }
-    
-    
-    
-    
+
   }  // namespace reconstruction
 }  // namespace ecal
-
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
index d83f0c1fe2674..f291e85db5a06 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
@@ -431,22 +431,18 @@ namespace ecal {
   }  // namespace multifit
 }  // namespace ecal
 
-
 namespace ecal {
   namespace reconstruction {
-    
+
     __device__ uint32_t hashedIndexEB(uint32_t id);
-    
+
     __device__ uint32_t hashedIndexEE(uint32_t id);
-    
-    
+
     __device__ int laser_monitoring_region_EB(uint32_t id);
-    
+
     __device__ int laser_monitoring_region_EE(uint32_t id);
-    
+
   }  // namespace reconstruction
 }  // namespace ecal
 
-
-#endif // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
-
+#endif  // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
index fc6ae22ff57e0..8c5e5c0c9783d 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
@@ -9,8 +9,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
-
+#include "FWCore/Framework/interface/MakerMacros.h"
 
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 
@@ -20,166 +19,145 @@
 
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
 
-class EcalCPURecHitProducer
-: public edm::stream::EDProducer<edm::ExternalWork>
-{
+class EcalCPURecHitProducer : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
   explicit EcalCPURecHitProducer(edm::ParameterSet const& ps);
   ~EcalCPURecHitProducer() override = default;
   static void fillDescriptions(edm::ConfigurationDescriptions&);
-  
+
 private:
-  void acquire(edm::Event const&, 
-               edm::EventSetup const&,
-               edm::WaitingTaskWithArenaHolder) override;
-               void produce(edm::Event&, edm::EventSetup const&) override;
-               
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
 private:
   edm::EDGetTokenT<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>> recHitsInEBToken_, recHitsInEEToken_;
   edm::EDPutTokenT<ecal::RecHit<ecal::Tag::soa>> recHitsOutEBToken_, recHitsOutEEToken_;
-  
+
   ecal::RecHit<ecal::Tag::soa> recHitsEB_, recHitsEE_;
   bool containsTimingInformation_;
 };
 
-void EcalCPURecHitProducer::fillDescriptions(
-  edm::ConfigurationDescriptions& confDesc) {
+void EcalCPURecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
   edm::ParameterSetDescription desc;
-  
+
   desc.add<edm::InputTag>("recHitsInLabelEB", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEB"});
   desc.add<edm::InputTag>("recHitsInLabelEE", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEE"});
   desc.add<std::string>("recHitsOutLabelEB", "EcalRecHitsEB");
   desc.add<std::string>("recHitsOutLabelEE", "EcalRecHitsEE");
   desc.add<bool>("containsTimingInformation", false);
-  
+
   std::string label = "ecalCPURecHitProducer";
   confDesc.add(label, desc);
-  }
-  
-  EcalCPURecHitProducer::EcalCPURecHitProducer(
-    const edm::ParameterSet& ps) 
-  : recHitsInEBToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("recHitsInLabelEB"))}
-  , recHitsInEEToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("recHitsInLabelEE"))}
-  , recHitsOutEBToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEB"))}
-  , recHitsOutEEToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEE"))}
-  , containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")}
-  {}
-  
-  
-  void EcalCPURecHitProducer::acquire(
-    edm::Event const& event,
-    edm::EventSetup const& setup,
-    edm::WaitingTaskWithArenaHolder taskHolder) 
-  {
-    // retrieve data/ctx
-    auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
-    auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
-    cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
-    auto const& ebRecHits = ctx.get(ebRecHitsProduct);
-    auto const& eeRecHits = ctx.get(eeRecHitsProduct);
-    
-    // resize the output buffers
-    recHitsEB_.resize(ebRecHits.size);
-    recHitsEE_.resize(eeRecHits.size);
-    
-    //     std::cout << " [EcalCPURecHitProducer::acquire] ebRecHits.size = " << ebRecHits.size << std::endl;
-    //     std::cout << " [EcalCPURecHitProducer::acquire] eeRecHits.size = " << eeRecHits.size << std::endl;
-    
-    // enqeue transfers
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(),
-                               ebRecHits.did,
-                               recHitsEB_.did.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(),
-                               eeRecHits.did,
-                               recHitsEE_.did.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    // 
-    //     ./CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h:using StorageScalarType = float;
-    // 
-    
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.energy.data(),
-                               ebRecHits.energy,
-                               recHitsEB_.energy.size() * sizeof(::ecal::reco::StorageScalarType), 
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.energy.data(),
-                               eeRecHits.energy,
-                               recHitsEE_.energy.size() * sizeof(::ecal::reco::StorageScalarType), 
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(),
-                               ebRecHits.chi2,
-                               recHitsEB_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),   
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.chi2.data(),
-                               eeRecHits.chi2,
-                               recHitsEE_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),   
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.extra.data(),
-                               ebRecHits.extra,
-                               recHitsEB_.extra.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.extra.data(),
-                               eeRecHits.extra,
-                               recHitsEE_.extra.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    
-    cudaCheck( cudaMemcpyAsync(recHitsEB_.flagBits.data(),
-                               ebRecHits.flagBits,
-                               recHitsEB_.flagBits.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    cudaCheck( cudaMemcpyAsync(recHitsEE_.flagBits.data(),
-                               eeRecHits.flagBits,
-                               recHitsEE_.flagBits.size() * sizeof(uint32_t),
-                               cudaMemcpyDeviceToHost,
-                               ctx.stream()) );
-    
-    
-    
-    
-    //     for (unsigned int ieb = 0; ieb <  ebRecHits.size ; ieb++) {
-    //       if (recHitsEB_.extra[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb extra = " << recHitsEB_.extra[ieb] << std::endl;
-    //     }
-    
-    //     
-    //     for (unsigned int ieb = 0; ieb <  ebRecHits.size ; ieb++) {
-    //       if (recHitsEB_.energy[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb energy = " << recHitsEB_.energy[ieb] << std::endl;
-    //     }
-    //     
-    //     for (unsigned int iee = 0; iee <  eeRecHits.size ; iee++) {
-    //       if (recHitsEE_.energy[iee] != 0 ) std::cout << " [ " << iee << " :: " << eeRecHits.size << " ] [ " << recHitsEE_.did[iee] << " ] ee energy = " << recHitsEE_.energy[iee] << std::endl;
-    //     }
-    //     
-    
-    
-    
-    
-  }
-  
-  void EcalCPURecHitProducer::produce(
-    edm::Event& event, 
-    edm::EventSetup const& setup) 
-  {
-    // tmp vectors
-    auto recHitsOutEB = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEB_));
-    auto recHitsOutEE = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEE_));
-    
-    // put into event
-    event.put(recHitsOutEBToken_, std::move(recHitsOutEB));
-    event.put(recHitsOutEEToken_, std::move(recHitsOutEE));
-  }
-  
-  DEFINE_FWK_MODULE(EcalCPURecHitProducer);
-  
-  
-  
\ No newline at end of file
+}
+
+EcalCPURecHitProducer::EcalCPURecHitProducer(const edm::ParameterSet& ps)  // tokens and flag are configured from ps
+    : recHitsInEBToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(  // EB input token
+          ps.getParameter<edm::InputTag>("recHitsInLabelEB"))},
+      recHitsInEEToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(  // EE input token
+          ps.getParameter<edm::InputTag>("recHitsInLabelEE"))},
+      recHitsOutEBToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEB"))},
+      recHitsOutEEToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEE"))},
+      containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")} {}
+
+void EcalCPURecHitProducer::acquire(edm::Event const& event,
+                                    edm::EventSetup const& setup,
+                                    edm::WaitingTaskWithArenaHolder taskHolder) {
+  // Fetch the EB and EE device-side products; the CUDA context is built from the EB product and takes over taskHolder.
+  auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
+  auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
+  cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
+  auto const& ebRecHits = ctx.get(ebRecHitsProduct);
+  auto const& eeRecHits = ctx.get(eeRecHitsProduct);
+
+  // size the host-side member buffers to the `size` reported by each device product
+  recHitsEB_.resize(ebRecHits.size);
+  recHitsEE_.resize(eeRecHits.size);
+
+  //     std::cout << " [EcalCPURecHitProducer::acquire] ebRecHits.size = " << ebRecHits.size << std::endl;
+  //     std::cout << " [EcalCPURecHitProducer::acquire] eeRecHits.size = " << eeRecHits.size << std::endl;
+
+  // enqueue the device-to-host copies on the context's stream; every call is wrapped in cudaCheck
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.did.data(),
+                            ebRecHits.did,
+                            recHitsEB_.did.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.did.data(),
+                            eeRecHits.did,
+                            recHitsEE_.did.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  // energy and chi2 are sized with ::ecal::reco::StorageScalarType, declared in
+  //     ./CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h:using StorageScalarType = float;
+  //
+
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.energy.data(),
+                            ebRecHits.energy,
+                            recHitsEB_.energy.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.energy.data(),
+                            eeRecHits.energy,
+                            recHitsEE_.energy.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.chi2.data(),
+                            ebRecHits.chi2,
+                            recHitsEB_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.chi2.data(),
+                            eeRecHits.chi2,
+                            recHitsEE_.chi2.size() * sizeof(::ecal::reco::StorageScalarType),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.extra.data(),
+                            ebRecHits.extra,
+                            recHitsEB_.extra.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.extra.data(),
+                            eeRecHits.extra,
+                            recHitsEE_.extra.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  cudaCheck(cudaMemcpyAsync(recHitsEB_.flagBits.data(),
+                            ebRecHits.flagBits,
+                            recHitsEB_.flagBits.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+  cudaCheck(cudaMemcpyAsync(recHitsEE_.flagBits.data(),
+                            eeRecHits.flagBits,
+                            recHitsEE_.flagBits.size() * sizeof(uint32_t),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));
+
+  //     for (unsigned int ieb = 0; ieb <  ebRecHits.size ; ieb++) {
+  //       if (recHitsEB_.extra[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb extra = " << recHitsEB_.extra[ieb] << std::endl;
+  //     }
+
+  // NOTE(review): these disabled dumps read host buffers straight after the async copies are enqueued; confirm that is safe before re-enabling.
+  //     for (unsigned int ieb = 0; ieb <  ebRecHits.size ; ieb++) {
+  //       if (recHitsEB_.energy[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb energy = " << recHitsEB_.energy[ieb] << std::endl;
+  //     }
+  //
+  //     for (unsigned int iee = 0; iee <  eeRecHits.size ; iee++) {
+  //       if (recHitsEE_.energy[iee] != 0 ) std::cout << " [ " << iee << " :: " << eeRecHits.size << " ] [ " << recHitsEE_.did[iee] << " ] ee energy = " << recHitsEE_.energy[iee] << std::endl;
+  //     }
+  // NOTE(review): did, energy, chi2, extra and flagBits are copied above; no time array is, and containsTimingInformation_ is not read here -- confirm.
+}
+
+void EcalCPURecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+  // move the member host buffers into newly allocated products (recHitsEB_/recHitsEE_ are left moved-from)
+  auto recHitsOutEB = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEB_));
+  auto recHitsOutEE = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEE_));
+
+  // store both products in the event under the EB and EE output tokens
+  event.put(recHitsOutEBToken_, std::move(recHitsOutEB));
+  event.put(recHitsOutEEToken_, std::move(recHitsOutEE));
+}
+
+DEFINE_FWK_MODULE(EcalCPURecHitProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
index 5fded99cf3d0b..c2f6de85ef5a3 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc
@@ -17,7 +17,6 @@
 #include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h"
 #include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h"
 
-
 // for uncalibrechit
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
@@ -35,92 +34,49 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
 
+#include <iostream>
+
+using EcalPedestalsGPUESProducer = EcalESProducerGPU<EcalPedestalsGPU, EcalPedestals, EcalPedestalsRcd>;
 
+using EcalGainRatiosGPUESProducer = EcalESProducerGPU<EcalGainRatiosGPU, EcalGainRatios, EcalGainRatiosRcd>;
 
+using EcalPulseShapesGPUESProducer = EcalESProducerGPU<EcalPulseShapesGPU, EcalPulseShapes, EcalPulseShapesRcd>;
 
+using EcalPulseCovariancesGPUESProducer =
+    EcalESProducerGPU<EcalPulseCovariancesGPU, EcalPulseCovariances, EcalPulseCovariancesRcd>;
 
-#include <iostream>
+using EcalSamplesCorrelationGPUESProducer =
+    EcalESProducerGPU<EcalSamplesCorrelationGPU, EcalSamplesCorrelation, EcalSamplesCorrelationRcd>;
+
+using EcalTimeBiasCorrectionsGPUESProducer =
+    EcalESProducerGPU<EcalTimeBiasCorrectionsGPU, EcalTimeBiasCorrections, EcalTimeBiasCorrectionsRcd>;
+
+using EcalTimeCalibConstantsGPUESProducer =
+    EcalESProducerGPU<EcalTimeCalibConstantsGPU, EcalTimeCalibConstants, EcalTimeCalibConstantsRcd>;
+
+using EcalRechitADCToGeVConstantGPUESProducer =
+    EcalESProducerGPU<EcalRechitADCToGeVConstantGPU, EcalADCToGeVConstant, EcalADCToGeVConstantRcd>;
 
+using EcalIntercalibConstantsGPUESProducer =
+    EcalESProducerGPU<EcalIntercalibConstantsGPU, EcalIntercalibConstants, EcalIntercalibConstantsRcd>;
 
-using EcalPedestalsGPUESProducer = EcalESProducerGPU<EcalPedestalsGPU,
-                                                     EcalPedestals,
-                                                     EcalPedestalsRcd>;
-                                                     
-using EcalGainRatiosGPUESProducer = EcalESProducerGPU<EcalGainRatiosGPU,
-                                                      EcalGainRatios,
-                                                      EcalGainRatiosRcd>;
-                                                      
-using EcalPulseShapesGPUESProducer = EcalESProducerGPU<EcalPulseShapesGPU,
-                                                       EcalPulseShapes,
-                                                       EcalPulseShapesRcd>;
-                                                       
-using EcalPulseCovariancesGPUESProducer = EcalESProducerGPU<EcalPulseCovariancesGPU,
-                                                            EcalPulseCovariances,
-                                                            EcalPulseCovariancesRcd>;
-                                                            
-using EcalSamplesCorrelationGPUESProducer = EcalESProducerGPU<
-                                                              EcalSamplesCorrelationGPU,
-                                                              EcalSamplesCorrelation,
-                                                              EcalSamplesCorrelationRcd
-                                                              >;
-
-using EcalTimeBiasCorrectionsGPUESProducer = EcalESProducerGPU<
-                                                               EcalTimeBiasCorrectionsGPU,
-                                                               EcalTimeBiasCorrections,
-                                                               EcalTimeBiasCorrectionsRcd
-                                                               >;
-
-using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU<
-                                                              EcalTimeCalibConstantsGPU,
-                                                              EcalTimeCalibConstants,
-                                                              EcalTimeCalibConstantsRcd
-                                                              >;
-                                                             
-using EcalRechitADCToGeVConstantGPUESProducer = EcalESProducerGPU<
-                                                            EcalRechitADCToGeVConstantGPU,
-                                                            EcalADCToGeVConstant,
-                                                            EcalADCToGeVConstantRcd
-                                                            >;
-
-using EcalIntercalibConstantsGPUESProducer = EcalESProducerGPU<
-                                                               EcalIntercalibConstantsGPU,
-                                                               EcalIntercalibConstants,
-                                                               EcalIntercalibConstantsRcd
-                                                               >;
-
-using EcalRechitChannelStatusGPUESProducer = EcalESProducerGPU<
-                                                         EcalRechitChannelStatusGPU,
-                                                         EcalChannelStatus,
-                                                         EcalChannelStatusRcd
-                                                         >;
-
-using EcalLaserAPDPNRatiosGPUESProducer = EcalESProducerGPU<
-                                                            EcalLaserAPDPNRatiosGPU,
-                                                            EcalLaserAPDPNRatios,
-                                                            EcalLaserAPDPNRatiosRcd
-                                                            >;
-
-using EcalLaserAPDPNRatiosRefGPUESProducer = EcalESProducerGPU<
-                                                               EcalLaserAPDPNRatiosRefGPU,
-                                                               EcalLaserAPDPNRatiosRef,
-                                                               EcalLaserAPDPNRatiosRefRcd
-                                                               >;
-
-using EcalLaserAlphasGPUESProducer = EcalESProducerGPU<
-                                                       EcalLaserAlphasGPU,
-                                                       EcalLaserAlphas,
-                                                       EcalLaserAlphasRcd
-                                                       >;
-
-using EcalLinearCorrectionsGPUESProducer = EcalESProducerGPU<
-                                                             EcalLinearCorrectionsGPU,
-                                                             EcalLinearCorrections,
-                                                             EcalLinearCorrectionsRcd
-                                                             >;
-
-//    
+using EcalRechitChannelStatusGPUESProducer =
+    EcalESProducerGPU<EcalRechitChannelStatusGPU, EcalChannelStatus, EcalChannelStatusRcd>;
+
+using EcalLaserAPDPNRatiosGPUESProducer =
+    EcalESProducerGPU<EcalLaserAPDPNRatiosGPU, EcalLaserAPDPNRatios, EcalLaserAPDPNRatiosRcd>;
+
+using EcalLaserAPDPNRatiosRefGPUESProducer =
+    EcalESProducerGPU<EcalLaserAPDPNRatiosRefGPU, EcalLaserAPDPNRatiosRef, EcalLaserAPDPNRatiosRefRcd>;
+
+using EcalLaserAlphasGPUESProducer = EcalESProducerGPU<EcalLaserAlphasGPU, EcalLaserAlphas, EcalLaserAlphasRcd>;
+
+using EcalLinearCorrectionsGPUESProducer =
+    EcalESProducerGPU<EcalLinearCorrectionsGPU, EcalLinearCorrections, EcalLinearCorrectionsRcd>;
+
+//
 // This below also creates the .py config files, as described in "EcalESProducerGPU.h"
-//     
+//
 
 DEFINE_FWK_EVENTSETUP_MODULE(EcalPedestalsGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalGainRatiosGPUESProducer);
@@ -137,4 +93,3 @@ DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosRefGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAlphasGPUESProducer);
 DEFINE_FWK_EVENTSETUP_MODULE(EcalLinearCorrectionsGPUESProducer);
-
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
index 54d772efa806b..548bc812ffa2e 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
@@ -3,7 +3,7 @@
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
+#include "FWCore/Framework/interface/MakerMacros.h"
 
 // algorithm specific
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
@@ -14,124 +14,111 @@
 
 #include <iostream>
 
-class EcalRecHitConvertGPU2CPUFormat
-: public edm::stream::EDProducer<>
-{
+class EcalRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
 public:
   explicit EcalRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
   ~EcalRecHitConvertGPU2CPUFormat() override;
   static void fillDescriptions(edm::ConfigurationDescriptions&);
-  
+
 private:
   using GPURecHitType = ecal::RecHit<ecal::Tag::soa>;
   void produce(edm::Event&, edm::EventSetup const&) override;
-  
+
 private:
   const edm::EDGetTokenT<ecal::SoARecHitCollection> recHitsGPUEB_;
   const edm::EDGetTokenT<ecal::SoARecHitCollection> recHitsGPUEE_;
-  
+
   const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_;
 };
 
-void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(
-  edm::ConfigurationDescriptions& confDesc) {
+void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
   edm::ParameterSetDescription desc;
-  
+
   desc.add<edm::InputTag>("recHitsLabelGPUEB", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEB"));
   desc.add<edm::InputTag>("recHitsLabelGPUEE", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEE"));
-  
+
   desc.add<std::string>("recHitsLabelCPUEB", "EcalRecHitsEB");
   desc.add<std::string>("recHitsLabelCPUEE", "EcalRecHitsEE");
-  
+
   std::string label = "ecalRecHitConvertGPU2CPUFormat";
   confDesc.add(label, desc);
-  }
-  
-  EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps) 
-  : recHitsGPUEB_{consumes<ecal::SoARecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))}
-  , recHitsGPUEE_{consumes<ecal::SoARecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))}
-  , recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")}
-  , recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")}
-  {
-    produces<EBRecHitCollection>(recHitsLabelCPUEB_);
-    produces<EERecHitCollection>(recHitsLabelCPUEE_);
-  }
-  
-  EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {}
-  
-  void EcalRecHitConvertGPU2CPUFormat::produce(
-    edm::Event& event, 
-    edm::EventSetup const& setup) 
-  {
-    edm::Handle<ecal::SoARecHitCollection> hRecHitsGPUEB, hRecHitsGPUEE;
-    event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
-    event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
-    
-    auto recHitsCPUEB = std::make_unique<EBRecHitCollection>();
-    auto recHitsCPUEE = std::make_unique<EERecHitCollection>();
-    recHitsCPUEB->reserve(hRecHitsGPUEB->energy.size());
-    recHitsCPUEE->reserve(hRecHitsGPUEE->energy.size());
-    
-    //     
-    //     explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0):
-    //     
-    
-    for (uint32_t i=0; i<hRecHitsGPUEB->energy.size(); ++i) {
-      
-      //
-      // Save only if energy is >= 0 !
-      // This is extremely important because the channels that were supposed 
-      // to be excluded get "-1" as energy
-      //
-      
-      if (hRecHitsGPUEB->energy[i] >=0) {
-        recHitsCPUEB->emplace_back(
-          DetId{hRecHitsGPUEB->did[i]},
-          hRecHitsGPUEB->energy[i],
-          hRecHitsGPUEB->time[i],
-          hRecHitsGPUEB->extra[i],
-          hRecHitsGPUEB->flagBits[i]
-        );
-      }
-      
-      //       std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB->energy.size() << "] = " << hRecHitsGPUEB->extra[i] << std::endl;        
-      
-      //         (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->timeError[i]);
-      //         auto const offset = i * EcalDataFrame::MAXSAMPLES;
-      //         for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample) 
-      //             (*recHitsCPUEB)[i].setOutOfTimeAmplitude(
-      //                 sample, hRecHitsGPUEB->energysAll[offset + sample]);
+}
+
+EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps)
+    : recHitsGPUEB_{consumes<ecal::SoARecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))},
+      recHitsGPUEE_{consumes<ecal::SoARecHitCollection>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))},
+      recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")},
+      recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")} {
+  produces<EBRecHitCollection>(recHitsLabelCPUEB_);
+  produces<EERecHitCollection>(recHitsLabelCPUEE_);
+}
+
+EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {}
+
+void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
+  edm::Handle<ecal::SoARecHitCollection> hRecHitsGPUEB, hRecHitsGPUEE;
+  event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
+  event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
+
+  auto recHitsCPUEB = std::make_unique<EBRecHitCollection>();
+  auto recHitsCPUEE = std::make_unique<EERecHitCollection>();
+  recHitsCPUEB->reserve(hRecHitsGPUEB->energy.size());
+  recHitsCPUEE->reserve(hRecHitsGPUEE->energy.size());
+
+  //
+  //     explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0):
+  //
+
+  for (uint32_t i = 0; i < hRecHitsGPUEB->energy.size(); ++i) {
+    //
+    // Save only if energy is >= 0 !
+    // This is extremely important because the channels that were supposed
+    // to be excluded get "-1" as energy
+    //
+
+    if (hRecHitsGPUEB->energy[i] >= 0) {
+      recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]},
+                                 hRecHitsGPUEB->energy[i],
+                                 hRecHitsGPUEB->time[i],
+                                 hRecHitsGPUEB->extra[i],
+                                 hRecHitsGPUEB->flagBits[i]);
     }
-    
-    for (uint32_t i=0; i<hRecHitsGPUEE->energy.size(); ++i) {
-      //
-      // Save only if energy is >= 0 !
-      // This is extremely important because the channels that were supposed 
-      // to be excluded get "-1" as energy
-      //
-      
-      if (hRecHitsGPUEE->energy[i] >=0) {
-        recHitsCPUEE->emplace_back(
-          DetId{hRecHitsGPUEE->did[i]},
-          hRecHitsGPUEE->energy[i],
-          hRecHitsGPUEE->time[i],
-          hRecHitsGPUEE->extra[i],
-          hRecHitsGPUEE->flagBits[i]
-        );
-      }
-      
-      //       std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE->energy.size() << "] = " << hRecHitsGPUEE->extra[i] << std::endl;        
-      
-      //         (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->timeError[i]);
-      //         auto const offset = i * EcalDataFrame::MAXSAMPLES;
-      //         for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample) 
-      //             (*recHitsCPUEE)[i].setOutOfTimeAmplitude(
-      //                 sample, hRecHitsGPUEE->energysAll[offset + sample]);
+
+    //       std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB->energy.size() << "] = " << hRecHitsGPUEB->extra[i] << std::endl;
+
+    //         (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->timeError[i]);
+    //         auto const offset = i * EcalDataFrame::MAXSAMPLES;
+    //         for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample)
+    //             (*recHitsCPUEB)[i].setOutOfTimeAmplitude(
+    //                 sample, hRecHitsGPUEB->energysAll[offset + sample]);
+  }
+
+  for (uint32_t i = 0; i < hRecHitsGPUEE->energy.size(); ++i) {
+    //
+    // Save only if energy is >= 0 !
+    // This is extremely important because the channels that were supposed
+    // to be excluded get "-1" as energy
+    //
+
+    if (hRecHitsGPUEE->energy[i] >= 0) {
+      recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]},
+                                 hRecHitsGPUEE->energy[i],
+                                 hRecHitsGPUEE->time[i],
+                                 hRecHitsGPUEE->extra[i],
+                                 hRecHitsGPUEE->flagBits[i]);
     }
-    
-    event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
-    event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
+
+    //       std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE->energy.size() << "] = " << hRecHitsGPUEE->extra[i] << std::endl;
+
+    //         (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->timeError[i]);
+    //         auto const offset = i * EcalDataFrame::MAXSAMPLES;
+    //         for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample)
+    //             (*recHitsCPUEE)[i].setOutOfTimeAmplitude(
+    //                 sample, hRecHitsGPUEE->energysAll[offset + sample]);
   }
-  
-  DEFINE_FWK_MODULE(EcalRecHitConvertGPU2CPUFormat);
-  
\ No newline at end of file
+
+  event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
+  event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
+}
+
+DEFINE_FWK_MODULE(EcalRecHitConvertGPU2CPUFormat);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index 7422838471ebc..795a499987a06 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -1,23 +1,21 @@
 // framework
 #include "FWCore/Framework/interface/stream/EDProducer.h"
 
-
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h" 
+#include "FWCore/Framework/interface/MakerMacros.h"
 
-// 
-// 
-// 
+//
+//
+//
 
 // format
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
 
-
 // needed for definition of flags
 #include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
 
@@ -34,7 +32,6 @@
 #include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h"
 #include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h"
 
-
 // conditions gpu
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
@@ -45,108 +42,89 @@
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
 #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
 
-
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 
 // configuration
 #include "CommonTools/Utils/interface/StringToEnumValue.h"
 
-
-class EcalRecHitProducerGPU: public edm::stream::EDProducer<edm::ExternalWork> {
-  
+class EcalRecHitProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
   explicit EcalRecHitProducerGPU(edm::ParameterSet const& ps);
   ~EcalRecHitProducerGPU() override;
   static void fillDescriptions(edm::ConfigurationDescriptions&);
-  
+
 private:
-  
   using RecHitType = ecal::RecHit<ecal::Tag::soa>;
-  void acquire(edm::Event const&, 
-               edm::EventSetup const&,
-               edm::WaitingTaskWithArenaHolder) override;
-               void produce(edm::Event&, edm::EventSetup const&) override;
-               
+  void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+  void produce(edm::Event&, edm::EventSetup const&) override;
+
 private:
-  
   // data
-  uint32_t neb_, nee_; // extremely important, in particular neb_
-  
-  
-  // gpu input  
-  edm::EDGetTokenT<cms::cuda::Product< ecal::UncalibratedRecHit<ecal::Tag::ptr> > > uncalibRecHitsInEBToken_;
-  edm::EDGetTokenT<cms::cuda::Product< ecal::UncalibratedRecHit<ecal::Tag::ptr> > > uncalibRecHitsInEEToken_;
-  
-  
-  
+  uint32_t neb_, nee_;  // extremely important, in particular neb_
+
+  // gpu input
+  edm::EDGetTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>> uncalibRecHitsInEBToken_;
+  edm::EDGetTokenT<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>> uncalibRecHitsInEEToken_;
+
   // event data
   ecal::rechit::EventOutputDataGPU eventOutputDataGPU_;
-  bool shouldTransferToHost_{true};
-  
+
   cms::cuda::ContextState cudaState_;
-  
+
   // gpu output
-  edm::EDPutTokenT<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>  recHitsTokenEB_, recHitsTokenEE_;
-  
-  
+  edm::EDPutTokenT<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>> recHitsTokenEB_, recHitsTokenEE_;
+
   // configuration parameters
   ecal::rechit::ConfigurationParameters configParameters_;
   uint32_t maxNumberHits_;
-  
-  
+
   // conditions handles
   edm::ESHandle<EcalRechitADCToGeVConstantGPU> ADCToGeVConstantHandle_;
-  edm::ESHandle<EcalIntercalibConstantsGPU>    IntercalibConstantsHandle_;
-  edm::ESHandle<EcalRechitChannelStatusGPU>    ChannelStatusHandle_;
-  
-  edm::ESHandle<EcalLaserAPDPNRatiosGPU>    LaserAPDPNRatiosHandle_;
+  edm::ESHandle<EcalIntercalibConstantsGPU> IntercalibConstantsHandle_;
+  edm::ESHandle<EcalRechitChannelStatusGPU> ChannelStatusHandle_;
+
+  edm::ESHandle<EcalLaserAPDPNRatiosGPU> LaserAPDPNRatiosHandle_;
   edm::ESHandle<EcalLaserAPDPNRatiosRefGPU> LaserAPDPNRatiosRefHandle_;
-  edm::ESHandle<EcalLaserAlphasGPU>         LaserAlphasHandle_;
-  edm::ESHandle<EcalLinearCorrectionsGPU>   LinearCorrectionsHandle_;
-  
+  edm::ESHandle<EcalLaserAlphasGPU> LaserAlphasHandle_;
+  edm::ESHandle<EcalLinearCorrectionsGPU> LinearCorrectionsHandle_;
+
   // configuration
   std::vector<int> v_chstatus_;
-  
-  
+
   //
   // https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.h
   //
-  
+
   // Associate reco flagbit ( outer vector) to many db status flags (inner vector)
   //   std::vector<std::vector<uint32_t> > v_DB_reco_flags_;
-  std::vector<int> expanded_v_DB_reco_flags_;              // Transform a map in a vector      // FIXME AM: int or uint32 to be checked
-  std::vector<uint32_t> expanded_Sizes_v_DB_reco_flags_;   // Saving the size for each piece
-  std::vector<uint32_t> expanded_flagbit_v_DB_reco_flags_; // And the "key" for each key
-  
-  
+  std::vector<int>
+      expanded_v_DB_reco_flags_;  // Transform a map in a vector      // FIXME AM: int or uint32 to be checked
+  std::vector<uint32_t> expanded_Sizes_v_DB_reco_flags_;    // Saving the size for each piece
+  std::vector<uint32_t> expanded_flagbit_v_DB_reco_flags_;  // And the "key" for each key
+
   uint32_t flagmask_;  // do not propagate channels with these flags on
-  
-  
 };
 
+void EcalRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+  edm::ParameterSetDescription desc;
 
+  desc.add<edm::InputTag>("uncalibrecHitsInLabelEB",
+                          edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"));
+  desc.add<edm::InputTag>("uncalibrecHitsInLabelEE",
+                          edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"));
 
-void EcalRecHitProducerGPU::fillDescriptions(
-  edm::ConfigurationDescriptions& confDesc) 
-{
-  
-  edm::ParameterSetDescription desc;
-  
-  desc.add<edm::InputTag>("uncalibrecHitsInLabelEB", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"));
-  desc.add<edm::InputTag>("uncalibrecHitsInLabelEE", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"));
-  
   desc.add<std::string>("recHitsLabelEB", "EcalRecHitsGPUEB");
   desc.add<std::string>("recHitsLabelEE", "EcalRecHitsGPUEE");
-  
+
   desc.add<bool>("killDeadChannels", true);
-  
+
   desc.add<double>("EBLaserMIN", 0.01);
   desc.add<double>("EELaserMIN", 0.01);
   desc.add<double>("EBLaserMAX", 30.0);
   desc.add<double>("EELaserMAX", 30.0);
-  
+
   desc.add<uint32_t>("maxNumberHits", 20000);
-  
+
   // ## db statuses to be exluded from reconstruction (some will be recovered)
   edm::ParameterSetDescription desc_ChannelStatusToBeExcluded;
   desc_ChannelStatusToBeExcluded.add<std::string>("kDAC");
@@ -159,122 +137,106 @@ void EcalRecHitProducerGPU::fillDescriptions(
   desc_ChannelStatusToBeExcluded.add<std::string>("kDeadVFE");
   desc_ChannelStatusToBeExcluded.add<std::string>("kDeadFE");
   desc_ChannelStatusToBeExcluded.add<std::string>("kNoDataNoTP");
-  
+
   std::vector<edm::ParameterSet> default_ChannelStatusToBeExcluded(1);
-  
+
   desc.addVPSet("ChannelStatusToBeExcluded", desc_ChannelStatusToBeExcluded, default_ChannelStatusToBeExcluded);
-  
 }
 
-
-EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps)   {
-  
+EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
   //---- input
-  uncalibRecHitsInEBToken_ = consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("uncalibrecHitsInLabelEB"));
-  uncalibRecHitsInEEToken_ = consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("uncalibrecHitsInLabelEE"));
-  
+  uncalibRecHitsInEBToken_ = consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+      ps.getParameter<edm::InputTag>("uncalibrecHitsInLabelEB"));
+  uncalibRecHitsInEEToken_ = consumes<cms::cuda::Product<ecal::UncalibratedRecHit<ecal::Tag::ptr>>>(
+      ps.getParameter<edm::InputTag>("uncalibrecHitsInLabelEE"));
+
   //---- output
-  recHitsTokenEB_ = produces<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>( ps.getParameter<std::string>("recHitsLabelEB") );
-  recHitsTokenEE_ = produces<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>( ps.getParameter<std::string>("recHitsLabelEE") );
-  
-  
+  recHitsTokenEB_ =
+      produces<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<std::string>("recHitsLabelEB"));
+  recHitsTokenEE_ =
+      produces<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<std::string>("recHitsLabelEE"));
+
   //---- db statuses to be exluded from reconstruction
-  v_chstatus_ = StringToEnumValue<EcalChannelStatusCode::Code>( ps.getParameter<std::vector<std::string> >("ChannelStatusToBeExcluded"));
-  
-  
+  v_chstatus_ = StringToEnumValue<EcalChannelStatusCode::Code>(
+      ps.getParameter<std::vector<std::string>>("ChannelStatusToBeExcluded"));
+
   bool killDeadChannels = ps.getParameter<bool>("killDeadChannels");
   configParameters_.killDeadChannels = killDeadChannels;
-  
-  
+
   configParameters_.EBLaserMIN = ps.getParameter<double>("EBLaserMIN");
   configParameters_.EELaserMIN = ps.getParameter<double>("EELaserMIN");
   configParameters_.EBLaserMAX = ps.getParameter<double>("EBLaserMAX");
   configParameters_.EELaserMAX = ps.getParameter<double>("EELaserMAX");
-  
-  
+
   // max number of digis to allocate for
   maxNumberHits_ = ps.getParameter<uint32_t>("maxNumberHits");
-  
+
   // allocate event output data
   eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_);
-  
+
   configParameters_.ChannelStatusToBeExcludedSize = v_chstatus_.size();
-  
-  cudaCheck( cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, 
-                        sizeof(int) * v_chstatus_.size()) 
-  );
-  cudaCheck( cudaMemcpy(configParameters_.ChannelStatusToBeExcluded, 
-                        v_chstatus_.data(),
-                        v_chstatus_.size() * sizeof(int),
-                        cudaMemcpyHostToDevice) );
-  
-  
-  
+
+  cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size()));
+  cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded,
+                       v_chstatus_.data(),
+                       v_chstatus_.size() * sizeof(int),
+                       cudaMemcpyHostToDevice));
+
   //
   //     https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc
-  //   
-  
+  //
+
   // Traslate string representation of flagsMapDBReco into enum values
   const edm::ParameterSet& p = ps.getParameter<edm::ParameterSet>("flagsMapDBReco");
   std::vector<std::string> recoflagbitsStrings = p.getParameterNames();
   //   v_DB_reco_flags_.resize(32);
-  
+
   for (unsigned int i = 0; i != recoflagbitsStrings.size(); ++i) {
     EcalRecHit::Flags recoflagbit = (EcalRecHit::Flags)StringToEnumValue<EcalRecHit::Flags>(recoflagbitsStrings[i]);
-    std::vector<std::string> dbstatus_s = p.getParameter<std::vector<std::string> >(recoflagbitsStrings[i]);
+    std::vector<std::string> dbstatus_s = p.getParameter<std::vector<std::string>>(recoflagbitsStrings[i]);
     //     std::vector<uint32_t> dbstatuses;
     for (unsigned int j = 0; j != dbstatus_s.size(); ++j) {
       EcalChannelStatusCode::Code dbstatus =
-      (EcalChannelStatusCode::Code)StringToEnumValue<EcalChannelStatusCode::Code>(dbstatus_s[j]);
+          (EcalChannelStatusCode::Code)StringToEnumValue<EcalChannelStatusCode::Code>(dbstatus_s[j]);
       //       dbstatuses.push_back(dbstatus);
       expanded_v_DB_reco_flags_.push_back(dbstatus);
     }
-    
-    expanded_Sizes_v_DB_reco_flags_.push_back( dbstatus_s.size() );
-    expanded_flagbit_v_DB_reco_flags_.push_back( recoflagbit );
-    
+
+    expanded_Sizes_v_DB_reco_flags_.push_back(dbstatus_s.size());
+    expanded_flagbit_v_DB_reco_flags_.push_back(recoflagbit);
+
     //     v_DB_reco_flags_[recoflagbit] = dbstatuses;
   }
-  
+
   // actual values
-  cudaCheck( cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, 
-                        sizeof(int) * expanded_v_DB_reco_flags_.size()) 
-  );
-  
-  cudaCheck( cudaMemcpy(configParameters_.expanded_v_DB_reco_flags, 
-                        expanded_v_DB_reco_flags_.data(),
-                        expanded_v_DB_reco_flags_.size() * sizeof(int),
-                        cudaMemcpyHostToDevice) 
-  );
-  
-  
+  cudaCheck(
+      cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size()));
+
+  cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags,
+                       expanded_v_DB_reco_flags_.data(),
+                       expanded_v_DB_reco_flags_.size() * sizeof(int),
+                       cudaMemcpyHostToDevice));
+
   // sizes
-  cudaCheck( cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags, 
-                        sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size() ) 
-  );
-  
-  cudaCheck( cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags, 
-                        expanded_Sizes_v_DB_reco_flags_.data(),
-                        expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t),
-                        cudaMemcpyHostToDevice) 
-  );
-  
+  cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags,
+                       sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size()));
+
+  cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags,
+                       expanded_Sizes_v_DB_reco_flags_.data(),
+                       expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t),
+                       cudaMemcpyHostToDevice));
+
   // keys
-  cudaCheck( cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags, 
-                        sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size() ) 
-  );
-  
-  cudaCheck( cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags, 
-                        expanded_flagbit_v_DB_reco_flags_.data(),
-                        expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t),
-                        cudaMemcpyHostToDevice) 
-  );
-  
-  
+  cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags,
+                       sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size()));
+
+  cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags,
+                       expanded_flagbit_v_DB_reco_flags_.data(),
+                       expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t),
+                       cudaMemcpyHostToDevice));
+
   configParameters_.expanded_v_DB_reco_flagsSize = expanded_flagbit_v_DB_reco_flags_.size();
-  
-  
-  
+
   flagmask_ = 0;
   flagmask_ |= 0x1 << EcalRecHit::kNeighboursRecovered;
   flagmask_ |= 0x1 << EcalRecHit::kTowerRecovered;
@@ -282,45 +244,35 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps)   {
   flagmask_ |= 0x1 << EcalRecHit::kKilled;
   flagmask_ |= 0x1 << EcalRecHit::kTPSaturated;
   flagmask_ |= 0x1 << EcalRecHit::kL1SpikeFlag;
-  
+
   configParameters_.flagmask = flagmask_;
-  
-  
+
   // for recovery and killing
-  
-  configParameters_.recoverEBIsolatedChannels   = ps.getParameter<bool>("recoverEBIsolatedChannels");
-  configParameters_.recoverEEIsolatedChannels   = ps.getParameter<bool>("recoverEEIsolatedChannels");
-  configParameters_.recoverEBVFE                = ps.getParameter<bool>("recoverEBVFE");
-  configParameters_.recoverEEVFE                = ps.getParameter<bool>("recoverEEVFE");
-  configParameters_.recoverEBFE                 = ps.getParameter<bool>("recoverEBFE");
-  configParameters_.recoverEEFE                 = ps.getParameter<bool>("recoverEEFE");
-  
-  
-  
-}
 
+  configParameters_.recoverEBIsolatedChannels = ps.getParameter<bool>("recoverEBIsolatedChannels");
+  configParameters_.recoverEEIsolatedChannels = ps.getParameter<bool>("recoverEEIsolatedChannels");
+  configParameters_.recoverEBVFE = ps.getParameter<bool>("recoverEBVFE");
+  configParameters_.recoverEEVFE = ps.getParameter<bool>("recoverEEVFE");
+  configParameters_.recoverEBFE = ps.getParameter<bool>("recoverEBFE");
+  configParameters_.recoverEEFE = ps.getParameter<bool>("recoverEEFE");
+}
 
 EcalRecHitProducerGPU::~EcalRecHitProducerGPU() {
-  
-  // free event ouput data 
+  // free event output data
   eventOutputDataGPU_.deallocate(configParameters_);
-  
+
   // FIXME AM: do I need to do this?
   //           Or can I do it as part of "deallocate" ?
-  cudaCheck( cudaFree(configParameters_.ChannelStatusToBeExcluded) );
-  
-  cudaCheck( cudaFree(configParameters_.expanded_v_DB_reco_flags) );
-  cudaCheck( cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags) );
-  cudaCheck( cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags) );
-  
-}
+  cudaCheck(cudaFree(configParameters_.ChannelStatusToBeExcluded));
 
+  cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags));
+  cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags));
+  cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags));
+}
 
-void EcalRecHitProducerGPU::acquire(
-  edm::Event const& event,
-  edm::EventSetup const& setup,
-  edm::WaitingTaskWithArenaHolder holder) 
-{
+void EcalRecHitProducerGPU::acquire(edm::Event const& event,
+                                    edm::EventSetup const& setup,
+                                    edm::WaitingTaskWithArenaHolder holder) {
   // cuda products
   auto const& ebUncalibRecHitsProduct = event.get(uncalibRecHitsInEBToken_);
   auto const& eeUncalibRecHitsProduct = event.get(uncalibRecHitsInEEToken_);
@@ -329,114 +281,95 @@ void EcalRecHitProducerGPU::acquire(
   // get actual object
   auto const& ebUncalibRecHits = ctx.get(ebUncalibRecHitsProduct);
   auto const& eeUncalibRecHits = ctx.get(eeUncalibRecHitsProduct);
-  
+
   ecal::rechit::EventInputDataGPU inputDataGPU{ebUncalibRecHits, eeUncalibRecHits};
-  
+
   neb_ = ebUncalibRecHits.size;
   nee_ = eeUncalibRecHits.size;
   //   std::cout << " [EcalRecHitProducerGPU::acquire]  neb_:nee_ = " << neb_ << " : " << nee_ << std::endl;
-  
-  int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE
-  
+
+  int nchannelsEB = ebUncalibRecHits.size;  // --> offsetForInput, first EB and then EE
+
   // conditions
-  // - laser correction 
+  // - laser correction
   // - IC
   // - adt2gev
-  
-  //   
-  setup.get<EcalADCToGeVConstantRcd>()   .get(ADCToGeVConstantHandle_);
+
+  //
+  setup.get<EcalADCToGeVConstantRcd>().get(ADCToGeVConstantHandle_);
   setup.get<EcalIntercalibConstantsRcd>().get(IntercalibConstantsHandle_);
-  setup.get<EcalChannelStatusRcd>()      .get(ChannelStatusHandle_);
-  
-  setup.get<EcalLaserAPDPNRatiosRcd>()     .get(LaserAPDPNRatiosHandle_);
-  setup.get<EcalLaserAPDPNRatiosRefRcd>()  .get(LaserAPDPNRatiosRefHandle_);
-  setup.get<EcalLaserAlphasRcd>()          .get(LaserAlphasHandle_);
-  setup.get<EcalLinearCorrectionsRcd>()    .get(LinearCorrectionsHandle_);
-  
-  //   
-  
-  auto const& ADCToGeVConstantProduct    = ADCToGeVConstantHandle_    -> getProduct(ctx.stream());
-  auto const& IntercalibConstantsProduct = IntercalibConstantsHandle_ -> getProduct(ctx.stream());
-  auto const& ChannelStatusProduct       = ChannelStatusHandle_       -> getProduct(ctx.stream());
-  
-  auto const& LaserAPDPNRatiosProduct     = LaserAPDPNRatiosHandle_     -> getProduct(ctx.stream());
-  auto const& LaserAPDPNRatiosRefProduct  = LaserAPDPNRatiosRefHandle_  -> getProduct(ctx.stream());
-  auto const& LaserAlphasProduct          = LaserAlphasHandle_          -> getProduct(ctx.stream());
-  auto const& LinearCorrectionsProduct    = LinearCorrectionsHandle_    -> getProduct(ctx.stream());
-  
-  
+  setup.get<EcalChannelStatusRcd>().get(ChannelStatusHandle_);
+
+  setup.get<EcalLaserAPDPNRatiosRcd>().get(LaserAPDPNRatiosHandle_);
+  setup.get<EcalLaserAPDPNRatiosRefRcd>().get(LaserAPDPNRatiosRefHandle_);
+  setup.get<EcalLaserAlphasRcd>().get(LaserAlphasHandle_);
+  setup.get<EcalLinearCorrectionsRcd>().get(LinearCorrectionsHandle_);
+
+  //
+
+  auto const& ADCToGeVConstantProduct = ADCToGeVConstantHandle_->getProduct(ctx.stream());
+  auto const& IntercalibConstantsProduct = IntercalibConstantsHandle_->getProduct(ctx.stream());
+  auto const& ChannelStatusProduct = ChannelStatusHandle_->getProduct(ctx.stream());
+
+  auto const& LaserAPDPNRatiosProduct = LaserAPDPNRatiosHandle_->getProduct(ctx.stream());
+  auto const& LaserAPDPNRatiosRefProduct = LaserAPDPNRatiosRefHandle_->getProduct(ctx.stream());
+  auto const& LaserAlphasProduct = LaserAlphasHandle_->getProduct(ctx.stream());
+  auto const& LinearCorrectionsProduct = LinearCorrectionsHandle_->getProduct(ctx.stream());
+
   // bundle up conditions
-  ecal::rechit::ConditionsProducts conditions {
-    ADCToGeVConstantProduct,
-    IntercalibConstantsProduct,
-    ChannelStatusProduct,
-    //       
-    LaserAPDPNRatiosProduct,   
-    LaserAPDPNRatiosRefProduct,
-    LaserAlphasProduct,        
-    LinearCorrectionsProduct,  
-    //       
-    IntercalibConstantsHandle_->getOffset()
-  };
-  
-  
+  ecal::rechit::ConditionsProducts conditions{ADCToGeVConstantProduct,
+                                              IntercalibConstantsProduct,
+                                              ChannelStatusProduct,
+                                              //
+                                              LaserAPDPNRatiosProduct,
+                                              LaserAPDPNRatiosRefProduct,
+                                              LaserAlphasProduct,
+                                              LinearCorrectionsProduct,
+                                              //
+                                              IntercalibConstantsHandle_->getOffset()};
+
   //
   // schedule algorithms
   //
-  
+
   edm::TimeValue_t event_time = event.time().value();
-  
-  
-  ecal::rechit::create_ecal_rehit(
-    inputDataGPU,
-    eventOutputDataGPU_,
-    //     eventDataForScratchGPU_,
-    conditions,  
-    configParameters_,
-    nchannelsEB,
-    event_time,
-    ctx.stream()
-  );
-  
-//   cudaCheck(cudaGetLastError());
-  
-  
+
+  ecal::rechit::create_ecal_rehit(inputDataGPU,
+                                  eventOutputDataGPU_,
+                                  //     eventDataForScratchGPU_,
+                                  conditions,
+                                  configParameters_,
+                                  nchannelsEB,
+                                  event_time,
+                                  ctx.stream());
+
+  //   cudaCheck(cudaGetLastError());
 }
 
-void EcalRecHitProducerGPU::produce(
-  edm::Event& event, 
-  edm::EventSetup const& setup) 
-{
+void EcalRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
   //DurationMeasurer<std::chrono::milliseconds> timer{std::string{"produce duration"}};
   cms::cuda::ScopedContextProduce ctx{cudaState_};
-  
+
   // copy construct output collections
   // note, output collections do not own device memory!
   ecal::RecHit<ecal::Tag::ptr> ebRecHits{eventOutputDataGPU_};
   ecal::RecHit<ecal::Tag::ptr> eeRecHits{eventOutputDataGPU_};
-  
-  
-  
+
   // set the size of eb and ee
   ebRecHits.size = neb_;
   eeRecHits.size = nee_;
-  
+
   // shift ptrs for ee
-  eeRecHits.energy   += neb_;
-  eeRecHits.chi2     += neb_;
-  eeRecHits.did      += neb_;
-  eeRecHits.time     += neb_;
-  eeRecHits.extra    += neb_;
+  eeRecHits.energy += neb_;
+  eeRecHits.chi2 += neb_;
+  eeRecHits.did += neb_;
+  eeRecHits.time += neb_;
+  eeRecHits.extra += neb_;
   eeRecHits.flagBits += neb_;
-  
+
   // put into the event
   ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits));
   ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits));
-  
 }
 
-
-
-
 DEFINE_FWK_MODULE(EcalRecHitProducerGPU);
-

From 55c10ec10564c2cd9a6649e7d8697e2de4c09d9e Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Wed, 27 May 2020 12:23:10 +0200
Subject: [PATCH 18/30] change from CUDAHostAllocator to HostAllocator

---
 .../EcalRecHitSoA/interface/EcalRecHit_soa.h       |  2 +-
 .../interface/EcalIntercalibConstantsGPU.h         |  2 +-
 .../interface/EcalLaserAPDPNRatiosGPU.h            | 14 +++++++-------
 .../interface/EcalLaserAPDPNRatiosRefGPU.h         |  2 +-
 .../EcalRecAlgos/interface/EcalLaserAlphasGPU.h    |  2 +-
 .../interface/EcalLinearCorrectionsGPU.h           | 14 +++++++-------
 .../interface/EcalRechitADCToGeVConstantGPU.h      |  4 ++--
 .../interface/EcalRechitChannelStatusGPU.h         |  4 ++--
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
index 8379dec5c81ad..a5f73d2166f7a 100644
--- a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
+++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h
@@ -5,7 +5,7 @@
 #include <array>
 
 #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 
 // needed for "soa" definition
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
index c59527a6d9f5a..3bbcdbd04e385 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h
@@ -4,7 +4,7 @@
 #include "CondFormats/EcalObjects/interface/EcalIntercalibConstants.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
index 9b87c3228e5c7..633238234e086 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h
@@ -4,7 +4,7 @@
 #include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatios.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
@@ -37,13 +37,13 @@ class EcalLaserAPDPNRatiosGPU {
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
-  std::vector<float, CUDAHostAllocator<float> > p1_;
-  std::vector<float, CUDAHostAllocator<float> > p2_;
-  std::vector<float, CUDAHostAllocator<float> > p3_;
+  std::vector<float, cms::cuda::HostAllocator<float> > p1_;
+  std::vector<float, cms::cuda::HostAllocator<float> > p2_;
+  std::vector<float, cms::cuda::HostAllocator<float> > p3_;
 
-  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t1_;
-  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t2_;
-  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t> > t3_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t> > t1_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t> > t2_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t> > t3_;
 
   cms::cuda::ESProduct<Product> product_;
 
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
index 6e48d50f217f3..08b2a2b5047dc 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h
@@ -4,7 +4,7 @@
 #include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatiosRef.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
index d787c5700cd7e..71af7753933f6 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h
@@ -4,7 +4,7 @@
 #include "CondFormats/EcalObjects/interface/EcalLaserAlphas.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
index f2b395f5660fa..62691e9c4ef8c 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h
@@ -4,7 +4,7 @@
 #include "CondFormats/EcalObjects/interface/EcalLinearCorrections.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
@@ -37,13 +37,13 @@ class EcalLinearCorrectionsGPU {
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
-  std::vector<float, CUDAHostAllocator<float>> p1_;
-  std::vector<float, CUDAHostAllocator<float>> p2_;
-  std::vector<float, CUDAHostAllocator<float>> p3_;
+  std::vector<float, cms::cuda::HostAllocator<float>> p1_;
+  std::vector<float, cms::cuda::HostAllocator<float>> p2_;
+  std::vector<float, cms::cuda::HostAllocator<float>> p3_;
 
-  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t1_;
-  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t2_;
-  std::vector<edm::TimeValue_t, CUDAHostAllocator<edm::TimeValue_t>> t3_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t>> t1_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t>> t2_;
+  std::vector<edm::TimeValue_t, cms::cuda::HostAllocator<edm::TimeValue_t>> t3_;
 
   cms::cuda::ESProduct<Product> product_;
 
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
index 3838a757cc2e1..92441ae4ae703 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h
@@ -4,7 +4,7 @@
 #include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
@@ -32,7 +32,7 @@ class EcalRechitADCToGeVConstantGPU {
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
-  std::vector<float, CUDAHostAllocator<float>> adc2gev_;
+  std::vector<float, cms::cuda::HostAllocator<float>> adc2gev_;
 
   cms::cuda::ESProduct<Product> product_;
 
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
index bf3f0f600224e..f425293a5488d 100644
--- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
+++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h
@@ -4,7 +4,7 @@
 #include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
 
 #ifndef __CUDACC__
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
 #include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #endif
 
@@ -32,7 +32,7 @@ class EcalRechitChannelStatusGPU {
 private:
   // in the future, we need to arrange so to avoid this copy on the host
   // store eb first then ee
-  std::vector<uint16_t, CUDAHostAllocator<uint16_t>> status_;
+  std::vector<uint16_t, cms::cuda::HostAllocator<uint16_t>> status_;
 
   cms::cuda::ESProduct<Product> product_;
 

From 5dd85ff28c3aeec12aff4fe69cd873125f0ce745 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Fri, 29 May 2020 18:05:23 +0200
Subject: [PATCH 19/30] remove message logger not needed

---
 RecoLocalCalo/EcalRecProducers/BuildFile.xml         | 1 -
 RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml | 1 -
 2 files changed, 2 deletions(-)

diff --git a/RecoLocalCalo/EcalRecProducers/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
index abbae509cdab0..aa19516964fd9 100644
--- a/RecoLocalCalo/EcalRecProducers/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/BuildFile.xml
@@ -3,7 +3,6 @@
 <use name="CUDADataFormats/EcalRecHitSoA"/>
 <use name="CondFormats/EcalObjects"/>
 <use name="FWCore/Framework"/>
-<use name="FWCore/MessageLogger"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/CUDAUtilities"/>
 
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
index ca6b19c2ddd23..3b1d2c0cf159d 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
@@ -15,7 +15,6 @@
 <use name="CondFormats/ESObjects"/>
 <use name="CondFormats/DataRecord"/>
 <use name="RecoLocalCalo/EcalRecAlgos"/>
-<use name="FWCore/MessageLogger"/>
 <use name="SimCalorimetry/EcalSimAlgos"/>
 <library file="*.cc" name="RecoLocalCaloEcalRecProducersPlugins">
   <flags EDM_PLUGIN="1"/>

From 34f8084f4a84a118195ed88088db930876fa8e83 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Thu, 4 Jun 2020 17:43:20 +0200
Subject: [PATCH 20/30] suggestions from PR implemented

---
 ...eEcalMultifitResultsGpuValidationPlots.cpp | 52 ++++++++----------
 .../plugins/EcalCPURecHitProducer.cc          | 11 ++--
 .../plugins/EcalRecHitConvertGPU2CPUFormat.cc | 54 +++++++++----------
 3 files changed, 51 insertions(+), 66 deletions(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
index 1cf7c9d706317..8ddc5f9c9c028 100644
--- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
+++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp
@@ -232,7 +232,7 @@ int main(int argc, char *argv[]) {
       if (chi2_cpu > 0)
         hChi2EBGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu);
 
-      if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) {
+      if (std::abs(chi2_gpu / chi2_cpu - 1) > 0.05 || std::abs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) {
         std::cout << " ---- EB  " << std::endl;
         std::cout << " eventid = " << ie << " xtal = " << i << std::endl;
         std::cout << " chi2_gpu    = " << chi2_gpu << " chi2_cpu =    " << chi2_cpu << std::endl;
@@ -296,7 +296,7 @@ int main(int argc, char *argv[]) {
       if (chi2_cpu > 0)
         hChi2EEGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu);
 
-      if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) {
+      if (std::abs(chi2_gpu / chi2_cpu - 1) > 0.05 || std::abs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) {
         std::cout << " ---- EE  " << std::endl;
         std::cout << " eventid = " << ie << " xtal = " << i << std::endl;
         std::cout << " chi2_gpu    = " << chi2_gpu << " chi2_cpu =    " << chi2_cpu << std::endl;
@@ -332,12 +332,9 @@ int main(int argc, char *argv[]) {
   }
 
   {
-    //       TCanvas c("plots", "plots", 4200, 6200);
     TCanvas c("plots", "plots", 1750, 860);
-    //       c.Divide(2, 3);
     c.Divide(3, 2);
 
-    //       c.cd(1);
     c.cd(1);
     {
       gPad->SetLogy();
@@ -354,7 +351,7 @@ int main(int argc, char *argv[]) {
       stats->SetY2NDC(y1);
       stats->SetY1NDC(y1 - (y2 - y1));
     }
-    //       c.cd(2);
+
     c.cd(4);
     {
       gPad->SetLogy();
@@ -371,21 +368,21 @@ int main(int argc, char *argv[]) {
       stats->SetY2NDC(y1);
       stats->SetY1NDC(y1 - (y2 - y1));
     }
-    //       c.cd(3);
+    
     c.cd(2);
     gPad->SetGrid();
     hSOIAmplitudesEBGPUvsCPU->Draw("COLZ");
-    //       c.cd(4);
+    
     c.cd(5);
     gPad->SetGrid();
     hSOIAmplitudesEEGPUvsCPU->Draw("COLZ");
-    //       c.cd(5);
+    
     c.cd(3);
-    //       hSOIAmplitudesEBdeltavsCPU->Draw("COLZ");
+    
     hSOIAmplitudesEBGPUCPUratio->Draw("");
-    //       c.cd(6);
+    
     c.cd(6);
-    //       hSOIAmplitudesEEdeltavsCPU->Draw("COLZ");
+    
     hSOIAmplitudesEEGPUCPUratio->Draw("");
 
     c.SaveAs("ecal-amplitudes.root");
@@ -393,7 +390,6 @@ int main(int argc, char *argv[]) {
 
     // chi2
 
-    //       c.cd(1);
     c.cd(1);
     {
       gPad->SetLogy();
@@ -410,7 +406,7 @@ int main(int argc, char *argv[]) {
       stats->SetY2NDC(y1);
       stats->SetY1NDC(y1 - (y2 - y1));
     }
-    //       c.cd(2);
+    
     c.cd(4);
     {
       gPad->SetLogy();
@@ -427,21 +423,21 @@ int main(int argc, char *argv[]) {
       stats->SetY2NDC(y1);
       stats->SetY1NDC(y1 - (y2 - y1));
     }
-    //       c.cd(3);
+    
     c.cd(2);
     gPad->SetGrid();
     hChi2EBGPUvsCPU->Draw("COLZ");
-    //       c.cd(4);
+    
     c.cd(5);
     gPad->SetGrid();
     hChi2EEGPUvsCPU->Draw("COLZ");
-    //       c.cd(5);
+    
     c.cd(3);
-    //       hChi2EBdeltavsCPU->Draw("COLZ");
+    
     hChi2EBGPUCPUratio->Draw("");
-    //       c.cd(6);
+    
     c.cd(6);
-    //       hChi2EEdeltavsCPU->Draw("COLZ");
+    
     hChi2EEGPUCPUratio->Draw("");
 
     c.SaveAs("ecal-chi2.root");
@@ -449,7 +445,7 @@ int main(int argc, char *argv[]) {
 
     // flags
 
-    //       c.cd(1);
+    
     c.cd(1);
     {
       gPad->SetLogy();
@@ -466,7 +462,7 @@ int main(int argc, char *argv[]) {
       stats->SetY2NDC(y1);
       stats->SetY1NDC(y1 - (y2 - y1));
     }
-    //       c.cd(2);
+    
     c.cd(4);
     {
       gPad->SetLogy();
@@ -483,22 +479,20 @@ int main(int argc, char *argv[]) {
       stats->SetY2NDC(y1);
       stats->SetY1NDC(y1 - (y2 - y1));
     }
-    //       c.cd(3);
+    
     c.cd(2);
     gPad->SetGrid();
     hFlagsEBGPUvsCPU->Draw("COLZ");
-    //       c.cd(4);
+    
     c.cd(5);
     gPad->SetGrid();
     hFlagsEEGPUvsCPU->Draw("COLZ");
-    //       c.cd(5);
+    
     c.cd(3);
-    //       hFlagsEBdeltavsCPU->Draw("COLZ");
     hFlagsEBGPUCPUratio->Draw("");
 
-    //       c.cd(6);
+    
     c.cd(6);
-    //       hFlagsEEdeltavsCPU->Draw("COLZ");
     hFlagsEEGPUCPUratio->Draw("");
 
     c.SaveAs("ecal-flags.root");
@@ -547,13 +541,11 @@ int main(int argc, char *argv[]) {
     cRechits.cd(3);
     {
       gPad->SetLogy();
-      //hRechitsEBdeltavsCPU->Draw("COLZ");
       hRechitsEBGPUCPUratio->Draw("");
     }
     cRechits.cd(6);
     {
       gPad->SetLogy();
-      //hRechitsEEdeltavsCPU->Draw("COLZ");
       hRechitsEEGPUCPUratio->Draw("");
     }
     cRechits.SaveAs("ecal-rechits.root");
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
index 8c5e5c0c9783d..8e1b4d399e0c7 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
@@ -46,8 +46,7 @@ void EcalCPURecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& con
   desc.add<std::string>("recHitsOutLabelEE", "EcalRecHitsEE");
   desc.add<bool>("containsTimingInformation", false);
 
-  std::string label = "ecalCPURecHitProducer";
-  confDesc.add(label, desc);
+  confDesc.addWithDefaultLabel(desc);
 }
 
 EcalCPURecHitProducer::EcalCPURecHitProducer(const edm::ParameterSet& ps)
@@ -151,13 +150,9 @@ void EcalCPURecHitProducer::acquire(edm::Event const& event,
 }
 
 void EcalCPURecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
-  // tmp vectors
-  auto recHitsOutEB = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEB_));
-  auto recHitsOutEE = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEE_));
-
   // put into event
-  event.put(recHitsOutEBToken_, std::move(recHitsOutEB));
-  event.put(recHitsOutEEToken_, std::move(recHitsOutEE));
+  event.emplace(recHitsOutEBToken_, std::move(recHitsEB_));
+  event.emplace(recHitsOutEEToken_, std::move(recHitsEE_));
 }
 
 DEFINE_FWK_MODULE(EcalCPURecHitProducer);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
index 548bc812ffa2e..151762c6b63d3 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
@@ -40,8 +40,7 @@ void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescript
   desc.add<std::string>("recHitsLabelCPUEB", "EcalRecHitsEB");
   desc.add<std::string>("recHitsLabelCPUEE", "EcalRecHitsEE");
 
-  std::string label = "ecalRecHitConvertGPU2CPUFormat";
-  confDesc.add(label, desc);
+  confDesc.addWithDefaultLabel(desc);
 }
 
 EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps)
@@ -56,65 +55,64 @@ EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::Parame
 EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {}
 
 void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
-  edm::Handle<ecal::SoARecHitCollection> hRecHitsGPUEB, hRecHitsGPUEE;
-  event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
-  event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
-
+  auto const& hRecHitsGPUEB = event.get(recHitsGPUEB_);
+  auto const& hRecHitsGPUEE = event.get(recHitsGPUEE_);
+  
   auto recHitsCPUEB = std::make_unique<EBRecHitCollection>();
   auto recHitsCPUEE = std::make_unique<EERecHitCollection>();
-  recHitsCPUEB->reserve(hRecHitsGPUEB->energy.size());
-  recHitsCPUEE->reserve(hRecHitsGPUEE->energy.size());
+  recHitsCPUEB->reserve(hRecHitsGPUEB.energy.size());
+  recHitsCPUEE->reserve(hRecHitsGPUEE.energy.size());
 
   //
   //     explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0):
   //
 
-  for (uint32_t i = 0; i < hRecHitsGPUEB->energy.size(); ++i) {
+  for (uint32_t i = 0; i < hRecHitsGPUEB.energy.size(); ++i) {
     //
     // Save only if energy is >= 0 !
     // This is extremely important because the channels that were supposed
     // to be excluded get "-1" as energy
     //
 
-    if (hRecHitsGPUEB->energy[i] >= 0) {
-      recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]},
-                                 hRecHitsGPUEB->energy[i],
-                                 hRecHitsGPUEB->time[i],
-                                 hRecHitsGPUEB->extra[i],
-                                 hRecHitsGPUEB->flagBits[i]);
+    if (hRecHitsGPUEB.energy[i] >= 0) {
+      recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB.did[i]},
+                                 hRecHitsGPUEB.energy[i],
+                                 hRecHitsGPUEB.time[i],
+                                 hRecHitsGPUEB.extra[i],
+                                 hRecHitsGPUEB.flagBits[i]);
     }
 
-    //       std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB->energy.size() << "] = " << hRecHitsGPUEB->extra[i] << std::endl;
+    //       std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB.energy.size() << "] = " << hRecHitsGPUEB.extra[i] << std::endl;
 
-    //         (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->timeError[i]);
+    //         (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB.timeError[i]);
     //         auto const offset = i * EcalDataFrame::MAXSAMPLES;
     //         for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample)
     //             (*recHitsCPUEB)[i].setOutOfTimeAmplitude(
-    //                 sample, hRecHitsGPUEB->energysAll[offset + sample]);
+    //                 sample, hRecHitsGPUEB.energysAll[offset + sample]);
   }
 
-  for (uint32_t i = 0; i < hRecHitsGPUEE->energy.size(); ++i) {
+  for (uint32_t i = 0; i < hRecHitsGPUEE.energy.size(); ++i) {
     //
     // Save only if energy is >= 0 !
     // This is extremely important because the channels that were supposed
     // to be excluded get "-1" as energy
     //
 
-    if (hRecHitsGPUEE->energy[i] >= 0) {
-      recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]},
-                                 hRecHitsGPUEE->energy[i],
-                                 hRecHitsGPUEE->time[i],
-                                 hRecHitsGPUEE->extra[i],
-                                 hRecHitsGPUEE->flagBits[i]);
+    if (hRecHitsGPUEE.energy[i] >= 0) {
+      recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE.did[i]},
+                                 hRecHitsGPUEE.energy[i],
+                                 hRecHitsGPUEE.time[i],
+                                 hRecHitsGPUEE.extra[i],
+                                 hRecHitsGPUEE.flagBits[i]);
     }
 
-    //       std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE->energy.size() << "] = " << hRecHitsGPUEE->extra[i] << std::endl;
+    //       std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE.energy.size() << "] = " << hRecHitsGPUEE.extra[i] << std::endl;
 
-    //         (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->timeError[i]);
+    //         (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE.timeError[i]);
     //         auto const offset = i * EcalDataFrame::MAXSAMPLES;
     //         for (uint32_t sample=0; sample<EcalDataFrame::MAXSAMPLES; ++sample)
     //             (*recHitsCPUEE)[i].setOutOfTimeAmplitude(
-    //                 sample, hRecHitsGPUEE->energysAll[offset + sample]);
+    //                 sample, hRecHitsGPUEE.energysAll[offset + sample]);
   }
 
   event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);

From 02c4d1059dc9bc41611048a12591e0794bacf0aa Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Thu, 4 Jun 2020 19:11:01 +0200
Subject: [PATCH 21/30] add cuda protection

---
 .../plugins/EcalRecHitProducerGPU.cc          | 104 ++++++++++--------
 1 file changed, 60 insertions(+), 44 deletions(-)

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index 795a499987a06..1c80d648f1eed 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -44,6 +44,9 @@
 
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
 
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
 // configuration
 #include "CommonTools/Utils/interface/StringToEnumValue.h"
 
@@ -176,12 +179,18 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
 
   configParameters_.ChannelStatusToBeExcludedSize = v_chstatus_.size();
 
-  cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size()));
-  cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded,
-                       v_chstatus_.data(),
-                       v_chstatus_.size() * sizeof(int),
-                       cudaMemcpyHostToDevice));
-
+  
+  // call CUDA API functions only if CUDA is available
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    
+    cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size()));
+    cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded,
+                         v_chstatus_.data(),
+                         v_chstatus_.size() * sizeof(int),
+                         cudaMemcpyHostToDevice));
+  }
+  
   //
   //     https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc
   //
@@ -208,33 +217,36 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
     //     v_DB_reco_flags_[recoflagbit] = dbstatuses;
   }
 
-  // actual values
-  cudaCheck(
-      cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size()));
-
-  cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags,
-                       expanded_v_DB_reco_flags_.data(),
-                       expanded_v_DB_reco_flags_.size() * sizeof(int),
-                       cudaMemcpyHostToDevice));
-
-  // sizes
-  cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags,
-                       sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size()));
-
-  cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags,
-                       expanded_Sizes_v_DB_reco_flags_.data(),
-                       expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t),
-                       cudaMemcpyHostToDevice));
-
-  // keys
-  cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags,
-                       sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size()));
-
-  cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags,
-                       expanded_flagbit_v_DB_reco_flags_.data(),
-                       expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t),
-                       cudaMemcpyHostToDevice));
-
+  // call CUDA API functions only if CUDA is available
+  if (cs and cs->enabled()) {
+    // actual values
+    cudaCheck(
+        cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size()));
+    
+    cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags,
+                         expanded_v_DB_reco_flags_.data(),
+                         expanded_v_DB_reco_flags_.size() * sizeof(int),
+                         cudaMemcpyHostToDevice));
+    
+    // sizes
+    cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags,
+                         sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size()));
+    
+    cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags,
+                         expanded_Sizes_v_DB_reco_flags_.data(),
+                         expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t),
+                         cudaMemcpyHostToDevice));
+    
+    // keys
+    cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags,
+                         sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size()));
+    
+    cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags,
+                         expanded_flagbit_v_DB_reco_flags_.data(),
+                         expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t),
+                         cudaMemcpyHostToDevice));
+  }
+  
   configParameters_.expanded_v_DB_reco_flagsSize = expanded_flagbit_v_DB_reco_flags_.size();
 
   flagmask_ = 0;
@@ -258,16 +270,20 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
 }
 
 EcalRecHitProducerGPU::~EcalRecHitProducerGPU() {
-  // free event ouput data
-  eventOutputDataGPU_.deallocate(configParameters_);
-
-  // FIXME AM: do I need to do this?
-  //           Or can I do it as part of "deallocate" ?
-  cudaCheck(cudaFree(configParameters_.ChannelStatusToBeExcluded));
-
-  cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags));
-  cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags));
-  cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags));
+  
+  edm::Service<CUDAService> cs;
+  if (cs and cs->enabled()) {
+    // free event output data
+    eventOutputDataGPU_.deallocate(configParameters_);
+
+    // FIXME AM: do I need to do this?
+    //           Or can I do it as part of "deallocate" ?
+    cudaCheck(cudaFree(configParameters_.ChannelStatusToBeExcluded));
+
+    cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags));
+    cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags));
+    cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags));
+  }
 }
 
 void EcalRecHitProducerGPU::acquire(edm::Event const& event,
@@ -343,7 +359,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
                                   event_time,
                                   ctx.stream());
 
-  //   cudaCheck(cudaGetLastError());
+  cudaCheck(cudaGetLastError());
 }
 
 void EcalRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {

From aed840061df8a4da6b037ff93fbbc7fff4bd608b Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 5 Jun 2020 10:58:46 +0200
Subject: [PATCH 22/30] Update sequences for ECAL local reconstruction running
 on GPU

---
 .../EcalRawToDigi/python/ecalDigis_cff.py     | 10 ++++----
 .../python/ecalMultiFitUncalibRecHit_cff.py   | 23 +++++++++++++++----
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py
index 9f79d3e0dbcb4..f6b873704dcd8 100644
--- a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py
+++ b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py
@@ -18,11 +18,11 @@
 
 # copy the digi from the GPU to the CPU and convert to legacy format
 from EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi import ecalCPUDigisProducer as _ecalCPUDigisProducer
-_gpu_ecalDigis = _ecalCPUDigisProducer.clone(
-  digisInLabelEB = 'ecalDigisGPU:ebDigisGPU',
-  digisInLabelEE = 'ecalDigisGPU:eeDigisGPU',
-  produceDummyIntegrityCollections = True,
+_ecalDigis_gpu = _ecalCPUDigisProducer.clone(
+  digisInLabelEB = cms.InputTag('ecalDigisGPU', 'ebDigisGPU'),
+  digisInLabelEE = cms.InputTag('ecalDigisGPU', 'eeDigisGPU'),
+  produceDummyIntegrityCollections = True
 )
-gpu.toReplaceWith(ecalDigis, _gpu_ecalDigis)
+gpu.toReplaceWith(ecalDigis, _ecalDigis_gpu)
 
 gpu.toReplaceWith(ecalDigisTask, cms.Task(ecalElectronicsMappingGPUESProducer, ecalDigisGPU, ecalDigis))
diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py
index 829c1b1c9468e..cbf220323df78 100644
--- a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py
+++ b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py
@@ -29,12 +29,27 @@
   recHitsInLabelEE = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEE'),
 )
 
-# convert the uncalibrated rechits legacy format
+# convert the uncalibrated rechits from SoA to legacy format
 from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitConvertGPU2CPUFormat_cfi import ecalUncalibRecHitConvertGPU2CPUFormat as _ecalUncalibRecHitConvertGPU2CPUFormat
-_gpu_ecalMultiFitUncalibRecHit = _ecalUncalibRecHitConvertGPU2CPUFormat.clone(
+_ecalMultiFitUncalibRecHit_gpu = _ecalUncalibRecHitConvertGPU2CPUFormat.clone(
   recHitsLabelGPUEB = cms.InputTag('ecalMultiFitUncalibRecHitSoA', 'EcalUncalibRecHitsEB'),
   recHitsLabelGPUEE = cms.InputTag('ecalMultiFitUncalibRecHitSoA', 'EcalUncalibRecHitsEE'),
 )
-gpu.toReplaceWith(ecalMultiFitUncalibRecHit, _gpu_ecalMultiFitUncalibRecHit)
+gpu.toReplaceWith(ecalMultiFitUncalibRecHit, _ecalMultiFitUncalibRecHit_gpu)
 
-gpu.toReplaceWith(ecalMultiFitUncalibRecHitTask, cms.Task(ecalMultiFitUncalibRecHitGPU, ecalMultiFitUncalibRecHitSoA, ecalMultiFitUncalibRecHit))
+gpu.toReplaceWith(ecalMultiFitUncalibRecHitTask, cms.Task(
+  # ECAL conditions used by the multifit running on GPU
+  ecalPedestalsGPUESProducer,
+  ecalGainRatiosGPUESProducer,
+  ecalPulseShapesGPUESProducer,
+  ecalPulseCovariancesGPUESProducer,
+  ecalSamplesCorrelationGPUESProducer,
+  ecalTimeBiasCorrectionsGPUESProducer,
+  ecalTimeCalibConstantsGPUESProducer,
+  # ECAL multifit running on GPU
+  ecalMultiFitUncalibRecHitGPU,
+  # copy the uncalibrated rechits from GPU to CPU
+  ecalMultiFitUncalibRecHitSoA,
+  # convert the uncalibrated rechits from SoA to legacy format
+  ecalMultiFitUncalibRecHit,
+))

From 92afb3e9dfc0a9977785c736604cb33b6628dd14 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 5 Jun 2020 10:59:26 +0200
Subject: [PATCH 23/30] Reconstruct ECAL rechits on GPUs

---
 .../python/ecalLocalRecoSequence_cff.py       | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py b/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py
index 06fecf4787baf..5895f78eccd55 100644
--- a/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py
+++ b/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from Configuration.ProcessModifiers.gpu_cff import gpu
 
 # TPG condition needed by ecalRecHit producer if TT recovery is ON
 from RecoLocalCalo.EcalRecProducers.ecalRecHitTPGConditions_cff import *
@@ -43,6 +44,57 @@
 
 ecalOnlyLocalRecoSequence = cms.Sequence(ecalOnlyLocalRecoTask)
 
+# ECAL rechit calibrations on GPU
+from RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi import ecalRechitADCToGeVConstantGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi import ecalRechitChannelStatusGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi import ecalIntercalibConstantsGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi import ecalLaserAPDPNRatiosGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi import ecalLaserAPDPNRatiosRefGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi import ecalLaserAlphasGPUESProducer
+from RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi import ecalLinearCorrectionsGPUESProducer
+
+# ECAL rechits running on GPU
+from RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi import ecalRecHitGPU as _ecalRecHitGPU
+ecalRecHitGPU = _ecalRecHitGPU.clone(
+    uncalibrecHitsInLabelEB = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEB'),
+    uncalibrecHitsInLabelEE = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEE')
+)
+
+# copy the rechits from GPU to CPU
+from RecoLocalCalo.EcalRecProducers.ecalCPURecHitProducer_cfi import ecalCPURecHitProducer as _ecalCPURecHitProducer
+ecalRecHitSoA = _ecalCPURecHitProducer.clone(
+    recHitsInLabelEB = cms.InputTag('ecalRecHitGPU', 'EcalRecHitsEB'),
+    recHitsInLabelEE = cms.InputTag('ecalRecHitGPU', 'EcalRecHitsEE')
+)
+
+# convert the rechits from SoA to legacy format
+from RecoLocalCalo.EcalRecProducers.ecalRecHitConvertGPU2CPUFormat_cfi import ecalRecHitConvertGPU2CPUFormat as _ecalRecHitConvertGPU2CPUFormat
+_ecalRecHit_gpu = _ecalRecHitConvertGPU2CPUFormat.clone(
+    recHitsLabelGPUEB = cms.InputTag('ecalRecHitSoA', 'EcalRecHitsEB'),
+    recHitsLabelGPUEE = cms.InputTag('ecalRecHitSoA', 'EcalRecHitsEE')
+)
+gpu.toReplaceWith(ecalRecHit, _ecalRecHit_gpu)
+
+# ECAL reconstruction on GPU
+gpu.toReplaceWith(ecalRecHitNoTPTask, cms.Task(
+  # ECAL rechit calibrations on GPU
+  ecalRechitADCToGeVConstantGPUESProducer,
+  ecalRechitChannelStatusGPUESProducer,
+  ecalIntercalibConstantsGPUESProducer,
+  ecalLaserAPDPNRatiosGPUESProducer,
+  ecalLaserAPDPNRatiosRefGPUESProducer,
+  ecalLaserAlphasGPUESProducer,
+  ecalLinearCorrectionsGPUESProducer,
+  # ECAL rechits running on GPU
+  ecalRecHitGPU,
+  # copy the rechits from GPU to CPU
+  ecalRecHitSoA,
+  # convert the rechits from SoA to legacy format
+  ecalRecHit,
+  # ECAL preshower rechit legacy module
+  ecalPreshowerRecHit
+))
+
 # Phase 2 modifications
 from RecoLocalCalo.EcalRecProducers.ecalDetailedTimeRecHit_cfi import *
 _phase2_timing_ecalRecHitTask = cms.Task( ecalRecHitTask.copy() , ecalDetailedTimeRecHit )

From e94bc013f7ee861c6d27e9e639363eda65501f97 Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Mon, 8 Jun 2020 14:59:46 +0200
Subject: [PATCH 24/30] fixes of the previous PR after central validation

---
 .../EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc     |  4 ++++
 .../EcalRecAlgos/src/EcalRecHitBuilderKernels.cu     | 12 +++++++++---
 .../plugins/EcalRecHitProducerGPU.cc                 |  4 ++--
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
index 20946028aba90..0af2a9044ab65 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc
@@ -38,6 +38,10 @@ EcalLinearCorrectionsGPU::Product::~Product() {
   // deallocation
   cudaCheck(cudaFree(p1));
   cudaCheck(cudaFree(p2));
+  cudaCheck(cudaFree(p3));
+  cudaCheck(cudaFree(t1));
+  cudaCheck(cudaFree(t2));
+  cudaCheck(cudaFree(t3));
 }
 
 EcalLinearCorrectionsGPU::Product const& EcalLinearCorrectionsGPU::getProduct(cudaStream_t cudaStream) const {
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index 54c376214c4c6..904c751de460a 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -286,7 +286,10 @@ namespace ecal {
         // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat"
         //
         energy[ch] = -1;  //---- AM: default, un-physical, ok
-
+        chi2[ch] = chi2_in[inputCh];
+        extra[ch] = 0;
+        
+        bool skip_this_channel = false;
         //
         static const int chStatusMask = 0x1F;
         // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same
@@ -294,10 +297,12 @@ namespace ecal {
         if (ChannelStatusToBeExcludedSize != 0) {
           for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) {
             if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) {
-              return;
+              skip_this_channel = true;
             }
           }
         }
+        
+        if (skip_this_channel) continue;
 
         // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word
 
@@ -336,7 +341,8 @@ namespace ecal {
         }
 
         if ((flagmask & temporary_flagBits) && killDeadChannels) {
-          return;
+          continue;
+          // skip this channel
         }
 
         //
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index 1c80d648f1eed..70c0af0e01821 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -283,7 +283,7 @@ EcalRecHitProducerGPU::~EcalRecHitProducerGPU() {
     cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags));
     cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags));
     cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags));
-  }
+  } 
 }
 
 void EcalRecHitProducerGPU::acquire(edm::Event const& event,
@@ -302,7 +302,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
 
   neb_ = ebUncalibRecHits.size;
   nee_ = eeUncalibRecHits.size;
-  //   std::cout << " [EcalRecHitProducerGPU::acquire]  neb_:nee_ = " << neb_ << " : " << nee_ << std::endl;
+  std::cout << " [EcalRecHitProducerGPU::acquire]  neb_:nee_ = " << neb_ << " : " << nee_ << std::endl;
 
   int nchannelsEB = ebUncalibRecHits.size;  // --> offsetForInput, first EB and then EE
 

From 907a3bb265c30d332432084a72b1748f2b51971c Mon Sep 17 00:00:00 2001
From: amassiro <massironi.andrea@gmail.com>
Date: Mon, 8 Jun 2020 15:04:15 +0200
Subject: [PATCH 25/30] oops, a cout slipped through

---
 RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index 70c0af0e01821..ac09cb484288b 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -302,7 +302,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
 
   neb_ = ebUncalibRecHits.size;
   nee_ = eeUncalibRecHits.size;
-  std::cout << " [EcalRecHitProducerGPU::acquire]  neb_:nee_ = " << neb_ << " : " << nee_ << std::endl;
+  // std::cout << " [EcalRecHitProducerGPU::acquire]  neb_:nee_ = " << neb_ << " : " << nee_ << std::endl;
 
   int nchannelsEB = ebUncalibRecHits.size;  // --> offsetForInput, first EB and then EE
 

From ff30a39c4e0a33a06c838d771983fe10645e4f85 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 8 Jun 2020 16:21:04 +0200
Subject: [PATCH 26/30] Remove extra whitespace

---
 .../EcalRecProducers/plugins/EcalRecHitProducerGPU.cc     | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index ac09cb484288b..0a1260dffefd2 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -7,10 +7,6 @@
 #include "FWCore/Framework/interface/EventSetup.h"
 #include "FWCore/Framework/interface/MakerMacros.h"
 
-//
-//
-//
-
 // format
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
 #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
@@ -283,7 +279,7 @@ EcalRecHitProducerGPU::~EcalRecHitProducerGPU() {
     cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags));
     cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags));
     cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags));
-  } 
+  }
 }
 
 void EcalRecHitProducerGPU::acquire(edm::Event const& event,
@@ -321,8 +317,6 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
   setup.get<EcalLaserAlphasRcd>().get(LaserAlphasHandle_);
   setup.get<EcalLinearCorrectionsRcd>().get(LinearCorrectionsHandle_);
 
-  //
-
   auto const& ADCToGeVConstantProduct = ADCToGeVConstantHandle_->getProduct(ctx.stream());
   auto const& IntercalibConstantsProduct = IntercalibConstantsHandle_->getProduct(ctx.stream());
   auto const& ChannelStatusProduct = ChannelStatusHandle_->getProduct(ctx.stream());

From 134ab3ae2bd8d4bcd9433cdd25f7a211ea4a275e Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 8 Jun 2020 16:24:52 +0200
Subject: [PATCH 27/30] Move skip_this_channel inside the if block

---
 RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index 904c751de460a..31ba3c0d3b5e4 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -289,21 +289,20 @@ namespace ecal {
         chi2[ch] = chi2_in[inputCh];
         extra[ch] = 0;
         
-        bool skip_this_channel = false;
-        //
         static const int chStatusMask = 0x1F;
         // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same
         int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask);
         if (ChannelStatusToBeExcludedSize != 0) {
+          bool skip_this_channel = false;
           for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) {
             if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) {
               skip_this_channel = true;
+              break;
             }
           }
+          if (skip_this_channel) continue;
         }
         
-        if (skip_this_channel) continue;
-
         // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word
 
         //

From a6a074f3ece52dfdec3cd5fb4b97c92256188910 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 8 Jun 2020 16:38:08 +0200
Subject: [PATCH 28/30] Avoid some repeated assignments

---
 .../src/EcalRecHitBuilderKernels.cu           | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index 31ba3c0d3b5e4..5ab7f6226d1b2 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -286,10 +286,17 @@ namespace ecal {
         // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat"
         //
         energy[ch] = -1;  //---- AM: default, un-physical, ok
-        chi2[ch] = chi2_in[inputCh];
+
+        // truncate the chi2
+        if (chi2_in[inputCh] > 64)
+          chi2[ch] = 64;
+        else
+          chi2[ch] = chi2_in[inputCh];
+
+        // default value for the "extra flags"
         extra[ch] = 0;
-        
-        static const int chStatusMask = 0x1F;
+
+        static const int chStatusMask = 0x1f;
         // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same
         int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask);
         if (ChannelStatusToBeExcludedSize != 0) {
@@ -300,9 +307,12 @@ namespace ecal {
               break;
             }
           }
-          if (skip_this_channel) continue;
+          if (skip_this_channel) {
+            // skip this channel
+            continue;
+          }
         }
-        
+
         // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word
 
         //
@@ -340,8 +350,8 @@ namespace ecal {
         }
 
         if ((flagmask & temporary_flagBits) && killDeadChannels) {
-          continue;
           // skip this channel
+          continue;
         }
 
         //
@@ -357,13 +367,7 @@ namespace ecal {
         // Time is not saved so far, FIXME
         //         time[ch] = time_in[inputCh];
 
-        if (chi2_in[inputCh] > 64)
-          chi2[ch] = 64;
-        else
-          chi2[ch] = chi2_in[inputCh];
-
         // NB: calculate the "flagBits extra"  --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ...
-        extra[ch] = 0;
 
         //
         // extra packing ...

From 9da15f2d5a0720d6a0e65cf8780ed51d303d8414 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 10 Jun 2020 10:09:17 +0200
Subject: [PATCH 29/30] Silence warning about unused variable

---
 RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index 5ab7f6226d1b2..cc44b27ab224a 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -157,7 +157,7 @@ namespace ecal {
 
         ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb;
 
-        ::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb;
+        //::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb;
 
         ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? chi2_ee : chi2_eb;
 

From dacaf2ead322bfe3b7c38aaea18a16b0ed79a07f Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 10 Jun 2020 10:14:52 +0200
Subject: [PATCH 30/30] Set flagBits for all channels

---
 .../EcalRecAlgos/src/EcalRecHitBuilderKernels.cu          | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
index cc44b27ab224a..114c56e8907f2 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -293,7 +293,8 @@ namespace ecal {
         else
           chi2[ch] = chi2_in[inputCh];
 
-        // default value for the "extra flags"
+        // default values for the flags
+        flagBits[ch] = 0;
         extra[ch] = 0;
 
         static const int chStatusMask = 0x1f;
@@ -349,14 +350,13 @@ namespace ecal {
           flagbit_counter += 1;
         }
 
+        flagBits[ch] = temporary_flagBits;
+
         if ((flagmask & temporary_flagBits) && killDeadChannels) {
           // skip this channel
           continue;
         }
 
-        //
-        flagBits[ch] = temporary_flagBits;
-
         //
         // multiply the adc counts with factors to get GeV
         //