diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml
index da28405324833..31e115bdc7880 100644
--- a/EventFilter/EcalRawToDigi/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/BuildFile.xml
@@ -8,7 +8,6 @@
-
@@ -18,8 +17,6 @@
-
-
diff --git a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
index fa6e9f5d5a161..38a9ebdf18cb8 100644
--- a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
+++ b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h
@@ -1,14 +1,11 @@
-#ifndef EcalRegionCabling_H
-#define EcalRegionCabling_H
+#ifndef EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
+#define EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
-#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h"
-#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h"
-
-#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
-
#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h"
+#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h"
class EcalRegionCabling {
public:
@@ -73,4 +70,4 @@ class EcalRegionCabling {
const ESElectronicsMapper* es_mapping_;
};
-#endif
+#endif // EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h
diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
index 3bd25122ff029..df29d960b57e9 100644
--- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml
@@ -3,7 +3,6 @@
-
@@ -14,10 +13,6 @@
-
-
-
-
-
+
diff --git a/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..9eab30a33fb3c
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h
@@ -0,0 +1,108 @@
+#ifndef EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
+#define EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
+
+#include
+
+#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h"
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+namespace ecal {
+ namespace raw {
+
+ constexpr auto empty_event_size = EMPTYEVENTSIZE;
+ constexpr uint32_t nfeds_max = 54;
+ constexpr uint32_t nbytes_per_fed_max = 10 * 1024;
+
+ struct InputDataCPU {
+ std::vector> data;
+ std::vector> offsets;
+ std::vector> feds;
+
+ void allocate() {
+ // 2KB per FED resize
+ data.resize(nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max);
+ offsets.resize(nfeds_max, 0);
+ feds.resize(nfeds_max, 0);
+ }
+ };
+
+ struct ConfigurationParameters {
+ uint32_t maxChannels;
+ };
+
+ struct OutputDataCPU {
+ // [0] - eb, [1] - ee
+ std::vector> nchannels;
+
+ void allocate() { nchannels.resize(2); }
+ };
+
+ struct OutputDataGPU {
+ uint16_t *samplesEB = nullptr, *samplesEE = nullptr;
+ uint32_t *idsEB = nullptr, *idsEE = nullptr;
+
+ // FIXME: we should separate max channels parameter for eb and ee
+ // FIXME: replace hardcoded values
+ void allocate(ConfigurationParameters const &config) {
+ cudaCheck(cudaMalloc((void **)&samplesEB, config.maxChannels * sizeof(uint16_t) * 10));
+ cudaCheck(cudaMalloc((void **)&samplesEE, config.maxChannels * sizeof(uint16_t) * 10));
+ cudaCheck(cudaMalloc((void **)&idsEB, config.maxChannels * sizeof(uint32_t)));
+ cudaCheck(cudaMalloc((void **)&idsEE, config.maxChannels * sizeof(uint32_t)));
+ }
+
+ void deallocate(ConfigurationParameters const &config) {
+ if (samplesEB) {
+ cudaCheck(cudaFree(samplesEB));
+ cudaCheck(cudaFree(samplesEE));
+ cudaCheck(cudaFree(idsEB));
+ cudaCheck(cudaFree(idsEE));
+ }
+ }
+ };
+
+ struct ScratchDataGPU {
+ // [0] = EB
+ // [1] = EE
+ uint32_t *pChannelsCounter = nullptr;
+
+ void allocate(ConfigurationParameters const &config) {
+ cudaCheck(cudaMalloc((void **)&pChannelsCounter, sizeof(uint32_t) * 2));
+ }
+
+ void deallocate(ConfigurationParameters const &config) {
+ if (pChannelsCounter) {
+ cudaCheck(cudaFree(pChannelsCounter));
+ }
+ }
+ };
+
+ struct InputDataGPU {
+ unsigned char *data = nullptr;
+ uint32_t *offsets = nullptr;
+ int *feds = nullptr;
+
+ void allocate() {
+ cudaCheck(cudaMalloc((void **)&data, sizeof(unsigned char) * nbytes_per_fed_max * nfeds_max));
+ cudaCheck(cudaMalloc((void **)&offsets, sizeof(uint32_t) * nfeds_max));
+ cudaCheck(cudaMalloc((void **)&feds, sizeof(int) * nfeds_max));
+ }
+
+ void deallocate() {
+ if (data) {
+ cudaCheck(cudaFree(data));
+ cudaCheck(cudaFree(offsets));
+ cudaCheck(cudaFree(feds));
+ }
+ }
+ };
+
+ struct ConditionsProducts {
+ ElectronicsMappingGPU::Product const &eMappingProduct;
+ };
+
+ } // namespace raw
+} // namespace ecal
+
+#endif // EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
index a046ce92fe336..fa53efe6286f1 100644
--- a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
+++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc
@@ -6,9 +6,7 @@
#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
-#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
#include "FWCore/Framework/interface/Event.h"
#include "FWCore/Framework/interface/EventSetup.h"
#include "FWCore/Framework/interface/MakerMacros.h"
@@ -18,6 +16,9 @@
#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "DeclsForKernels.h"
+#include "UnpackGPU.h"
+
class EcalCPUDigisProducer : public edm::stream::EDProducer {
public:
explicit EcalCPUDigisProducer(edm::ParameterSet const& ps);
diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
index 18dc2307e9bfc..e2a127e950252 100644
--- a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
+++ b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc
@@ -1,29 +1,20 @@
#include
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-// algorithm specific
-
-#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
-
#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h"
-
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h"
-#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h"
+#include "DeclsForKernels.h"
+#include "UnpackGPU.h"
class EcalRawToDigiGPU : public edm::stream::EDProducer {
public:
diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu
new file mode 100644
index 0000000000000..79f916d510e72
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu
@@ -0,0 +1,337 @@
+#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h"
+
+#include "UnpackGPU.h"
+
+namespace ecal {
+ namespace raw {
+
+ __forceinline__ __device__ void print_raw_buffer(uint8_t const* const buffer,
+ uint32_t const nbytes,
+ uint32_t const nbytes_per_row = 20) {
+ for (uint32_t i = 0; i < nbytes; i++) {
+ if (i % nbytes_per_row == 0 && i > 0)
+ printf("\n");
+ printf("%02X ", buffer[i]);
+ }
+ }
+
+ __forceinline__ __device__ void print_first3bits(uint64_t const* buffer, uint32_t size) {
+ for (uint32_t i = 0; i < size; ++i) {
+ uint8_t const b61 = (buffer[i] >> 61) & 0x1;
+ uint8_t const b62 = (buffer[i] >> 62) & 0x1;
+ uint8_t const b63 = (buffer[i] >> 63) & 0x1;
+ printf("[word: %u] %u%u%u\n", i, b63, b62, b61);
+ }
+ }
+
+ __forceinline__ __device__ bool is_barrel(uint8_t dccid) {
+ return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && dccid <= ElectronicsIdGPU::MAX_DCCID_EBP;
+ }
+
+ __forceinline__ __device__ uint8_t fed2dcc(int fed) { return static_cast(fed - 600); }
+
+ __forceinline__ __device__ int zside_for_eb(ElectronicsIdGPU const& eid) {
+ int dcc = eid.dccId();
+ return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) ? -1 : 1;
+ }
+
+ __forceinline__ __device__ bool is_synced_towerblock(uint16_t const dccbx,
+ uint16_t const bx,
+ uint16_t const dccl1,
+ uint16_t const l1) {
+ bool const bxsync = (bx == 0 && dccbx == 3564) || (bx == dccbx && dccbx != 3564);
+ bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff));
+ return bxsync && l1sync;
+ }
+
+ __forceinline__ __device__ bool right_tower_for_eb(int tower) {
+ // for EB, two types of tower (LVRB top/bottom)
+ if ((tower > 12 && tower < 21) || (tower > 28 && tower < 37) || (tower > 44 && tower < 53) ||
+ (tower > 60 && tower < 69))
+ return true;
+ else
+ return false;
+ }
+
+ __forceinline__ __device__ uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) {
+ // as in Geometry/EcalMaping/.../EcalElectronicsMapping
+ auto const dcc = eid.dccId();
+ auto const tower = eid.towerId();
+ auto const strip = eid.stripId();
+ auto const xtal = eid.xtalId();
+
+ int smid = 0;
+ int iphi = 0;
+ bool EBPlus = (zside_for_eb(eid) > 0);
+ bool EBMinus = !EBPlus;
+
+ if (zside_for_eb(eid) < 0) {
+ smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM;
+ iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi;
+ iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi);
+ } else {
+ smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP;
+ iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi;
+ iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1);
+ }
+
+ bool RightTower = right_tower_for_eb(tower);
+ int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1;
+ if (RightTower) {
+ ieta += (strip - 1);
+ if (strip % 2 == 1) {
+ if (EBMinus)
+ iphi += (xtal - 1) + 1;
+ else
+ iphi += (4 - (xtal - 1)) + 1;
+ } else {
+ if (EBMinus)
+ iphi += (4 - (xtal - 1)) + 1;
+ else
+ iphi += (xtal - 1) + 1;
+ }
+ } else {
+ ieta += 4 - (strip - 1);
+ if (strip % 2 == 1) {
+ if (EBMinus)
+ iphi += (4 - (xtal - 1)) + 1;
+ else
+ iphi += (xtal - 1) + 1;
+ } else {
+ if (EBMinus)
+ iphi += (xtal - 1) + 1;
+ else
+ iphi += (4 - (xtal - 1)) + 1;
+ }
+ }
+
+ if (zside_for_eb(eid) < 0)
+ ieta = -ieta;
+
+ DetId did{DetId::Ecal, EcalBarrel};
+ return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF);
+ }
+
+ __forceinline__ __device__ int adc(uint16_t sample) { return sample & 0xfff; }
+
+ __forceinline__ __device__ int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }
+
+ template
+ __global__ void kernel_unpack_test(unsigned char const* __restrict__ data,
+ uint32_t const* __restrict__ offsets,
+ int const* __restrict__ feds,
+ uint16_t* samplesEB,
+ uint16_t* samplesEE,
+ uint32_t* idsEB,
+ uint32_t* idsEE,
+ uint32_t* pChannelsCounterEBEE,
+ uint32_t const* eid2did,
+ uint32_t const nbytesTotal) {
+ // indices
+ auto const ifed = blockIdx.x;
+
+ // FIXME: use only the very first fed
+ //if (ifed!=10) return;
+
+ // offset in bytes
+ auto const offset = offsets[ifed];
+ // fed id
+ auto const fed = feds[ifed];
+ auto const isBarrel = is_barrel(static_cast(fed - 600));
+ // size
+ auto const size = ifed == gridDim.x - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset;
+ auto* samples = isBarrel ? samplesEB : samplesEE;
+ auto* ids = isBarrel ? idsEB : idsEE;
+ auto* pChannelsCounter = isBarrel ? &pChannelsCounterEBEE[0] : &pChannelsCounterEBEE[1];
+
+ // FIXME: debugging
+ //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
+
+ // offset to the right raw buffer
+ uint64_t const* buffer = reinterpret_cast(data + offset);
+
+ // dump first 3 bits for each 64-bit word
+ //print_first3bits(buffer, size / 8);
+
+ //
+ // fed header
+ //
+ auto const fed_header = buffer[0];
+ uint32_t bx = (fed_header >> 20) & 0xfff;
+ uint32_t lv1 = (fed_header >> 32) & 0xffffff;
+
+ // 9 for fed + dcc header
+ // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block
+ // 6 for SR block size
+
+ // dcc header w2
+ auto const w2 = buffer[2];
+ uint8_t const fov = (w2 >> 48) & 0xf;
+
+ //
+ // print Tower block headers
+ //
+ uint8_t ntccblockwords = isBarrel ? 18 : 36;
+ auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6;
+ auto const* trailer = buffer + (size / 8 - 1);
+ auto const* current_tower_block = tower_blocks_start;
+ while (current_tower_block != trailer) {
+ auto const w = *current_tower_block;
+ uint8_t ttid = w & 0xff;
+ uint16_t bxlocal = (w >> 16) & 0xfff;
+ uint16_t lv1local = (w >> 32) & 0xfff;
+ uint16_t block_length = (w >> 48) & 0x1ff;
+
+ uint16_t const dccbx = bx & 0xfff;
+ uint16_t const dccl1 = lv1 & 0xfff;
+ // fov>=1 is required to support simulated data for which bx==bxlocal==0
+ if (fov >= 1 && !is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) {
+ current_tower_block += block_length;
+ continue;
+ }
+
+ // go through all the channels
+ // get the next channel coordinates
+ uint32_t nchannels = (block_length - 1) / 3;
+
+ // 1 threads per channel in this block
+ for (uint32_t ich = 0; ich < nchannels; ich += NTHREADS) {
+ auto const i_to_access = ich + threadIdx.x;
+ // threads outside of the range -> leave the loop
+ if (i_to_access >= nchannels)
+ break;
+
+ // inc the channel's counter and get the pos where to store
+ auto const wdata = current_tower_block[1 + i_to_access * 3];
+ uint8_t const stripid = wdata & 0x7;
+ uint8_t const xtalid = (wdata >> 4) & 0x7;
+ ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid};
+ auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()];
+ // FIXME: what kind of channels are these guys
+ if (didraw == 0)
+ continue;
+
+ // get samples
+ uint16_t sampleValues[10];
+ sampleValues[0] = (wdata >> 16) & 0x3fff;
+ sampleValues[1] = (wdata >> 32) & 0x3fff;
+ sampleValues[2] = (wdata >> 48) & 0x3fff;
+ auto const wdata1 = current_tower_block[2 + i_to_access * 3];
+ sampleValues[3] = wdata1 & 0x3fff;
+ sampleValues[4] = (wdata1 >> 16) & 0x3fff;
+ sampleValues[5] = (wdata1 >> 32) & 0x3fff;
+ sampleValues[6] = (wdata1 >> 48) & 0x3fff;
+ auto const wdata2 = current_tower_block[3 + i_to_access * 3];
+ sampleValues[7] = wdata2 & 0x3fff;
+ sampleValues[8] = (wdata2 >> 16) & 0x3fff;
+ sampleValues[9] = (wdata2 >> 32) & 0x3fff;
+ //printf("stripid = %u xtalid = %u\n", stripid, xtalid);
+
+ // check gain
+ bool isSaturation = true;
+ short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1};
+ for (uint32_t si = 0; si < 10; si++) {
+ if (gainId(sampleValues[si]) == 0) {
+ firstGainZeroSampID = si;
+ firstGainZeroSampADC = adc(sampleValues[si]);
+ break;
+ }
+ }
+ if (firstGainZeroSampID != -1) {
+ unsigned int plateauEnd = std::min(10u, (unsigned int)(firstGainZeroSampID + 5));
+ for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) {
+ if (gainId(sampleValues[s]) == 0 && adc(sampleValues[s]) == firstGainZeroSampADC) {
+ ;
+ } else {
+ isSaturation = false;
+ break;
+ } //it's not saturation
+ }
+ // get rid of channels which are stuck in gain0
+ if (firstGainZeroSampID < 3) {
+ isSaturation = false;
+ }
+ if (!isSaturation)
+ continue;
+ } else { // there is no zero gainId sample
+ // gain switch check
+ short numGain = 1;
+ bool gainSwitchError = false;
+ for (unsigned int si = 1; si < 10; si++) {
+ if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5)
+ gainSwitchError = true;
+ if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si]))
+ numGain++;
+ else
+ numGain = 1;
+ }
+ if (gainSwitchError)
+ continue;
+ }
+
+ auto const pos = atomicAdd(pChannelsCounter, 1);
+
+ // store to global
+ ids[pos] = didraw;
+ samples[pos * 10] = sampleValues[0];
+ samples[pos * 10 + 1] = sampleValues[1];
+ samples[pos * 10 + 2] = sampleValues[2];
+ samples[pos * 10 + 3] = sampleValues[3];
+ samples[pos * 10 + 4] = sampleValues[4];
+ samples[pos * 10 + 5] = sampleValues[5];
+ samples[pos * 10 + 6] = sampleValues[6];
+ samples[pos * 10 + 7] = sampleValues[7];
+ samples[pos * 10 + 8] = sampleValues[8];
+ samples[pos * 10 + 9] = sampleValues[9];
+ }
+
+ current_tower_block += block_length;
+ }
+ }
+
+ void entryPoint(InputDataCPU const& inputCPU,
+ InputDataGPU& inputGPU,
+ OutputDataGPU& outputGPU,
+ ScratchDataGPU& scratchGPU,
+ OutputDataCPU& outputCPU,
+ ConditionsProducts const& conditions,
+ cudaStream_t cudaStream,
+ uint32_t const nfedsWithData,
+ uint32_t const nbytesTotal) {
+ // transfer
+ cudaCheck(cudaMemcpyAsync(
+ inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream));
+ cudaCheck(cudaMemcpyAsync(inputGPU.offsets,
+ inputCPU.offsets.data(),
+ nfedsWithData * sizeof(uint32_t),
+ cudaMemcpyHostToDevice,
+ cudaStream));
+ cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounter,
+ 0,
+ sizeof(uint32_t) * 2, // EB + EE
+ cudaStream));
+ cudaCheck(cudaMemcpyAsync(
+ inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+
+ kernel_unpack_test<32><<>>(inputGPU.data,
+ inputGPU.offsets,
+ inputGPU.feds,
+ outputGPU.samplesEB,
+ outputGPU.samplesEE,
+ outputGPU.idsEB,
+ outputGPU.idsEE,
+ scratchGPU.pChannelsCounter,
+ conditions.eMappingProduct.eid2did,
+ nbytesTotal);
+ cudaCheck(cudaGetLastError());
+
+ // transfer the counters for how many eb and ee channels we got
+ cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(),
+ scratchGPU.pChannelsCounter,
+ sizeof(uint32_t) * 2,
+ cudaMemcpyDeviceToHost,
+ cudaStream));
+ }
+
+ } // namespace raw
+} // namespace ecal
diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h
new file mode 100644
index 0000000000000..d98906e7e24a7
--- /dev/null
+++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h
@@ -0,0 +1,23 @@
+#ifndef EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
+#define EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
+
+#include "DeclsForKernels.h"
+
+namespace ecal {
+ namespace raw {
+
+ // FIXME: bundle up uint32_t values
+ void entryPoint(InputDataCPU const&,
+ InputDataGPU&,
+ OutputDataGPU&,
+ ScratchDataGPU&,
+ OutputDataCPU&,
+ ConditionsProducts const&,
+ cudaStream_t,
+ uint32_t const,
+ uint32_t const);
+
+ } // namespace raw
+} // namespace ecal
+
+#endif // EventFilter_EcalRawToDigi_plugins_UnpackGPU_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu
new file mode 100644
index 0000000000000..36601a0fb9230
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu
@@ -0,0 +1,505 @@
+#include
+#include
+
+#include
+
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "AmplitudeComputationCommonKernels.h"
+#include "KernelHelpers.h"
+
+namespace ecal {
+ namespace multifit {
+
+ ///
+ /// assume kernel launch configuration is
+ /// (MAXSAMPLES * nchannels, blocks)
+ ///
+ __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+ uint16_t const* digis_in_eb,
+ uint32_t const* dids_eb,
+ uint16_t const* digis_in_ee,
+ uint32_t const* dids_ee,
+ SampleVector* amplitudes,
+ SampleVector* amplitudesForMinimization,
+ SampleGainVector* gainsNoise,
+ float const* mean_x1,
+ float const* mean_x12,
+ float const* rms_x12,
+ float const* mean_x6,
+ float const* gain6Over1,
+ float const* gain12Over6,
+ bool* hasSwitchToGain6,
+ bool* hasSwitchToGain1,
+ bool* isSaturated,
+ ::ecal::reco::StorageScalarType* energies,
+ ::ecal::reco::StorageScalarType* chi2,
+ ::ecal::reco::StorageScalarType* g_pedestal,
+ uint32_t* dids_out,
+ uint32_t* flags,
+ char* acState,
+ BXVectorType* bxs,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs,
+ bool const gainSwitchUseMaxSampleEB,
+ bool const gainSwitchUseMaxSampleEE,
+ int const nchannels) {
+ constexpr bool dynamicPedestal = false; //---- default to false, ok
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+ constexpr int sample_max = 5;
+ constexpr int full_pulse_max = 9;
+ int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+ int const nchannels_per_block = blockDim.x / nsamples;
+ int const ch = tx / nsamples;
+ // for accessing input arrays
+ int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+ int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx;
+ // eb is first and then ee
+ auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb;
+ auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+ int const sample = threadIdx.x % nsamples;
+
+ if (ch < nchannels) {
+ // array of 10 x channels per block
+ // TODO: any other way of doing simple reduction
+ // assume bool is 1 byte, should be quite safe
+ extern __shared__ char shared_mem[];
+ bool* shr_hasSwitchToGain6 = reinterpret_cast(shared_mem);
+ bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples;
+ bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples;
+ bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples;
+ bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples;
+ char* shr_counts = reinterpret_cast(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples;
+
+ //
+ // indices
+ //
+ auto const did = DetId{dids[inputCh]};
+ auto const isBarrel = did.subdetId() == EcalBarrel;
+ // TODO offset for ee, 0 for eb
+ auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+ : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+ //
+ // pulse shape template
+ /*
+ for (int isample=sample; isample first 2 threads of each channel will be used here
+ // => 0 -> will compare 3 and 4 and put into 0
+ // => 1 -> will compare 4 and 5 and put into 1
+ shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4];
+ }
+ __syncthreads();
+
+ bool check_hasSwitchToGain0 = false;
+
+ if (sample == 0) {
+ shr_hasSwitchToGain6[threadIdx.x] =
+ shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1];
+ shr_hasSwitchToGain1[threadIdx.x] =
+ shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1];
+ shr_hasSwitchToGain0_tmp[threadIdx.x] =
+ shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1];
+
+ hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x];
+ hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x];
+
+ // set only for the threadIdx.x corresponding to sample==0
+ check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x];
+
+ shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1];
+ isSaturated[ch] = shr_isSaturated[threadIdx.x + 3];
+ }
+
+ // TODO: w/o this sync, there is a race
+ // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample
+ // check if we can remove it
+ __syncthreads();
+
+ // TODO: divergent branch
+ if (gainId == 0 || gainId == 3) {
+ pedestal = mean_x1[hashedId];
+ gainratio = gain6Over1[hashedId] * gain12Over6[hashedId];
+ gainsNoise[ch](sample) = 2;
+ } else if (gainId == 1) {
+ pedestal = mean_x12[hashedId];
+ gainratio = 1.;
+ gainsNoise[ch](sample) = 0;
+ } else if (gainId == 2) {
+ pedestal = mean_x6[hashedId];
+ gainratio = gain12Over6[hashedId];
+ gainsNoise[ch](sample) = 1;
+ }
+
+ // TODO: compile time constant -> branch should be non-divergent
+ if (dynamicPedestal)
+ amplitude = static_cast(adc) * gainratio;
+ else
+ amplitude = (static_cast(adc) - pedestal) * gainratio;
+ amplitudes[ch][sample] = amplitude;
+
+#ifdef ECAL_RECO_CUDA_DEBUG
+ printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio);
+ if (adc == 0)
+ printf("adc is zero\n");
+#endif
+
+ //
+ // initialization
+ //
+ amplitudesForMinimization[ch](sample) = 0;
+ bxs[ch](sample) = sample - 5;
+
+ // select the thread for the max sample
+ //---> hardcoded above to be 5th sample, ok
+ if (sample == sample_max) {
+ //
+ // initialization
+ //
+ acState[ch] = static_cast(MinimizationState::NotFinished);
+ energies[ch] = 0;
+ chi2[ch] = 0;
+ g_pedestal[ch] = 0;
+ uint32_t flag = 0;
+ dids_out[ch] = did.rawId();
+
+ // start of this channel in shared mem
+ int const chStart = threadIdx.x - sample_max;
+ // thread for the max sample in shared mem
+ int const threadMax = threadIdx.x;
+ auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
+
+ // this flag setting is applied to all of the cases
+ if (shr_hasSwitchToGain6[chStart])
+ flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
+ if (shr_hasSwitchToGain1[chStart])
+ flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
+
+ // this corresponds to cpu branching on lastSampleBeforeSaturation
+ // likely false
+ if (check_hasSwitchToGain0) {
+ // assign for the case some sample having gainId == 0
+ //energies[ch] = amplitudes[ch][sample_max];
+ energies[ch] = amplitude;
+
+ // check if samples before sample_max have true
+ bool saturated_before_max = false;
+#pragma unroll
+ for (char ii = 0; ii < 5; ii++)
+ saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
+
+ // if saturation is in the max sample and not in the first 5
+ if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
+ energies[ch] = 49140; // 4095 * 12
+ //---- AM FIXME : no pedestal subtraction???
+ //It should be "(4095. - pedestal) * gainratio"
+
+ // set state flag to terminate further processing of this channel
+ acState[ch] = static_cast(MinimizationState::Precomputed);
+ flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
+ flags[ch] = flag;
+ return;
+ }
+
+ // according to cpu version
+ // auto max_amplitude = amplitudes[ch][sample_max];
+ auto const max_amplitude = amplitude;
+ // according to cpu version
+ auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7];
+ // note, no syncing as the same thread will be accessing here
+ bool hasGainSwitch =
+ shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
+
+ // pedestal is final unconditionally
+ g_pedestal[ch] = pedestal;
+ if (hasGainSwitch && gainSwitchUseMaxSample) {
+ // thread for sample=0 will access the right guys
+ energies[ch] = max_amplitude / shape_value;
+ acState[ch] = static_cast(MinimizationState::Precomputed);
+ flags[ch] = flag;
+ return;
+ }
+
+ // this happens cause sometimes rms_x12 is 0...
+ // needs to be checkec why this is the case
+ // general case here is that noisecov is a Zero matrix
+ if (rmsForChecking == 0) {
+ acState[ch] = static_cast(MinimizationState::Precomputed);
+ flags[ch] = flag;
+ return;
+ }
+
+ // for the case when no shortcuts were taken
+ flags[ch] = flag;
+ }
+ }
+ }
+
+ ///
+ /// assume kernel launch configuration is
+ /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
+ ///
+ __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
+ uint32_t const* dids_eb,
+ uint32_t const* dids_ee,
+ float const* rms_x12,
+ float const* rms_x6,
+ float const* rms_x1,
+ float const* gain12Over6,
+ float const* gain6Over1,
+ double const* G12SamplesCorrelationEB,
+ double const* G6SamplesCorrelationEB,
+ double const* G1SamplesCorrelationEB,
+ double const* G12SamplesCorrelationEE,
+ double const* G6SamplesCorrelationEE,
+ double const* G1SamplesCorrelationEE,
+ SampleMatrix* noisecov,
+ PulseMatrixType* pulse_matrix,
+ EcalPulseShape const* pulse_shape,
+ bool const* hasSwitchToGain6,
+ bool const* hasSwitchToGain1,
+ bool const* isSaturated,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs) {
+ int const ch = blockIdx.x;
+ int const tx = threadIdx.x;
+ int const ty = threadIdx.y;
+ constexpr float addPedestalUncertainty = 0.f;
+ constexpr bool dynamicPedestal = false;
+ constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true
+
+ // to access input arrays (ids and digis only)
+ int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+ auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+ bool tmp0 = hasSwitchToGain6[ch];
+ bool tmp1 = hasSwitchToGain1[ch];
+ auto const did = DetId{dids[inputCh]};
+ auto const isBarrel = did.subdetId() == EcalBarrel;
+ auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+ : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+ auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE;
+ auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE;
+ auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE;
+ bool tmp2 = isSaturated[ch];
+ bool hasGainSwitch = tmp0 || tmp1 || tmp2;
+ auto const vidx = ecal::abs(ty - tx);
+
+ // non-divergent branch for all threads per block
+ if (hasGainSwitch) {
+ // TODO: did not include simplified noise model
+ float noise_value = 0;
+
+ // non-divergent branch - all threads per block
+ // TODO: all of these constants indicate that
+ // that these parts could be splitted into completely different
+ // kernels and run one of them only depending on the config
+ if (simplifiedNoiseModelForGainSwitch) {
+ int isample_max = 5; // according to cpu defs
+ int gainidx = gainNoise[ch][isample_max];
+
+ // non-divergent branches
+ if (gainidx == 0)
+ //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx);
+ noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx];
+ if (gainidx == 1)
+ // noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch]
+ // *noisecorrs[1](ty, tx);
+ noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+ G6SamplesCorrelation[vidx];
+ if (gainidx == 2)
+ // noise_value = gain12Over6[ch]*gain12Over6[ch]
+ // * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch]
+ // * noisecorrs[2](ty, tx);
+ noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] *
+ rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx];
+ if (!dynamicPedestal && addPedestalUncertainty > 0.f)
+ noise_value += addPedestalUncertainty * addPedestalUncertainty;
+ } else {
+ int gainidx = 0;
+ char mask = gainidx;
+ int pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+ // noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch]
+ // *pedestal*noisecorrs[0](ty, tx);
+ noise_value +=
+ /* gainratio is 1*/ rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
+ // non-divergent branch
+ if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+ noise_value += /* gainratio is 1 */
+ addPedestalUncertainty * addPedestalUncertainty * pedestal;
+ }
+
+ //
+ gainidx = 1;
+ mask = gainidx;
+ pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+ // noise_value += gain12Over6[ch]*gain12Over6[ch]
+ // *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx);
+ noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] *
+ pedestal * G6SamplesCorrelation[vidx];
+ // non-divergent branch
+ if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+ noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty *
+ addPedestalUncertainty * pedestal;
+ }
+
+ //
+ gainidx = 2;
+ mask = gainidx;
+ pedestal = gainNoise[ch][ty] == mask ? 1 : 0;
+ float tmp = gain6Over1[hashedId] * gain12Over6[hashedId];
+ // noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch]
+ // *pedestal*noisecorrs[2](ty, tx);
+ noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx];
+ // non-divergent branch
+ if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+ noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal;
+ }
+ }
+
+ noisecov[ch](ty, tx) = noise_value;
+ } else {
+ auto rms = rms_x12[hashedId];
+ float noise_value = rms * rms * G12SamplesCorrelation[vidx];
+ if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
+ //---- add fully correlated component to noise covariance to inflate pedestal uncertainty
+ noise_value += addPedestalUncertainty * addPedestalUncertainty;
+ }
+ noisecov[ch](ty, tx) = noise_value;
+ }
+
+ // pulse matrix
+ // int const bx = tx - 5; // -5 -4 -3 ... 3 4
+ // int bx = (*bxs)(tx);
+ // int const offset = 7 - 3 - bx;
+ int const posToAccess = 9 - tx + ty; // see cpu for reference
+ float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0;
+ pulse_matrix[ch](ty, tx) = value;
+ }
+
+ __global__ void kernel_permute_results(SampleVector* amplitudes,
+ BXVectorType const* activeBXs,
+ ::ecal::reco::StorageScalarType* energies,
+ char const* acState,
+ int const nchannels) {
+ // constants
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ int const tx = threadIdx.x + blockIdx.x * blockDim.x;
+ int const ch = tx / nsamples;
+ int const iii = tx % nsamples; // this is to address activeBXs
+
+ if (ch >= nchannels)
+ return;
+
+ // channels that have amplitude precomputed do not need results to be permuted
+ auto const state = static_cast(acState[ch]);
+ if (state == MinimizationState::Precomputed)
+ return;
+
+ // configure shared memory and cp into it
+ extern __shared__ char smem[];
+ SampleVector::Scalar* values = reinterpret_cast(smem);
+ values[threadIdx.x] = amplitudes[ch](iii);
+ __syncthreads();
+
+ // get the sample for this bx
+ auto const sample = static_cast(activeBXs[ch](iii)) + 5;
+
+ // store back to global
+ amplitudes[ch](sample) = values[threadIdx.x];
+
+ // store sample 5 separately
+ // only for the case when minimization was performed
+ // not for cases with precomputed amplitudes
+ if (sample == 5)
+ energies[ch] = values[threadIdx.x];
+ }
+
+///
+/// Build an Ecal RecHit.
+/// TODO: Use SoA data structures on the host directly
+/// the reason for removing this from minimize kernel is to isolate the minimize +
+/// again, building an aos rec hit involves strides... -> bad memory access pattern
+///
+#ifdef RUN_BUILD_AOS_RECHIT
+ __global__ void kernel_build_rechit(
+ float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels) {
+ int idx = threadIdx.x + blockDim.x * blockIdx.x;
+ if (idx < nchannels) {
+ rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], 0, 0, chi2s[idx], 0};
+ }
+ }
+#endif
+
+ } // namespace multifit
+} // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h
new file mode 100644
index 0000000000000..3e813b529a2ad
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h
@@ -0,0 +1,98 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+class EcalPulseShape;
+// this flag setting is applied to all of the cases
+class EcalPulseCovariance;
+class EcalUncalibratedRecHit;
+
+namespace ecal {
+ namespace multifit {
+
+ ///
+ /// assume kernel launch configuration is
+ /// (MAXSAMPLES * nchannels, blocks)
+ /// TODO: is there a point to split this kernel further to separate reductions
+ ///
+ __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in,
+ uint16_t const* digis_in_eb,
+ uint32_t const* dids_eb,
+ uint16_t const* digis_in_ee,
+ uint32_t const* dids_ee,
+ SampleVector* amplitudes,
+ SampleVector* amplitudesForMinimization,
+ SampleGainVector* gainsNoise,
+ float const* mean_x1,
+ float const* mean_x12,
+ float const* rms_x12,
+ float const* mean_x6,
+ float const* gain6Over1,
+ float const* gain12Over6,
+ bool* hasSwitchToGain6,
+ bool* hasSwitchToGain1,
+ bool* isSaturated,
+ ::ecal::reco::StorageScalarType* energies,
+ ::ecal::reco::StorageScalarType* chi2,
+ ::ecal::reco::StorageScalarType* pedestal,
+ uint32_t* dids_out,
+ uint32_t* flags,
+ char* acState,
+ BXVectorType* bxs,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs,
+ bool const gainSwitchUseMaxSampleEB,
+ bool const gainSwitchUseMaxSampleEE,
+ int const nchannels);
+
+ ///
+ /// assume kernel launch configuration is
+ /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
+ ///
+ __global__ void kernel_prep_2d(SampleGainVector const* gainNoise,
+ uint32_t const* dids_eb,
+ uint32_t const* dids_ee,
+ float const* rms_x12,
+ float const* rms_x6,
+ float const* rms_x1,
+ float const* gain12Over6,
+ float const* gain6Over1,
+ double const* G12SamplesCorrelationEB,
+ double const* G6SamplesCorrelationEB,
+ double const* G1SamplesCorrelationEB,
+ double const* G12SamplesCorrelationEE,
+ double const* G6SamplesCorrelationEE,
+ double const* G1SamplesCorrelationEE,
+ SampleMatrix* noisecov,
+ PulseMatrixType* pulse_matrix,
+ EcalPulseShape const* pulse_shape,
+ bool const* hasSwitchToGain6,
+ bool const* hasSwitchToGain1,
+ bool const* isSaturated,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs);
+
+ __global__ void kernel_permute_results(SampleVector* amplitudes,
+ BXVectorType const* activeBXs,
+ ::ecal::reco::StorageScalarType* energies,
+ char const* acState,
+ int const nchannels);
+
+///
+/// Build an Ecal RecHit.
+/// TODO: Use SoA data structures on the host directly
+/// the reason for removing this from minimize kernel is to isolate the minimize +
+/// again, building an aos rec hit involves strides... -> bad memory access pattern
+///
+#ifdef RUN_BUILD_AOS_RECHIT
+ __global__ void kernel_build_rechit(
+ float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels);
+#endif
+
+ } // namespace multifit
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
new file mode 100644
index 0000000000000..42c0b8aab85db
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
@@ -0,0 +1,400 @@
+#include
+#include
+
+#include
+
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "KernelHelpers.h"
+#include "AmplitudeComputationKernels.h"
+#include "AmplitudeComputationCommonKernels.h"
+
+namespace ecal {
+ namespace multifit {
+
+ void eigen_solve_submatrix(SampleMatrix& mat, SampleVector& invec, SampleVector& outvec, unsigned NP) {
+ using namespace Eigen;
+ switch (NP) { // pulse matrix is always square.
+ case 10: {
+ Matrix temp = mat.topLeftCorner<10, 10>();
+ outvec.head<10>() = temp.ldlt().solve(invec.head<10>());
+ break;
+ }
+ case 9: {
+ Matrix temp = mat.topLeftCorner<9, 9>();
+ outvec.head<9>() = temp.ldlt().solve(invec.head<9>());
+ break;
+ }
+ case 8: {
+ Matrix temp = mat.topLeftCorner<8, 8>();
+ outvec.head<8>() = temp.ldlt().solve(invec.head<8>());
+ break;
+ }
+ case 7: {
+ Matrix temp = mat.topLeftCorner<7, 7>();
+ outvec.head<7>() = temp.ldlt().solve(invec.head<7>());
+ break;
+ }
+ case 6: {
+ Matrix temp = mat.topLeftCorner<6, 6>();
+ outvec.head<6>() = temp.ldlt().solve(invec.head<6>());
+ break;
+ }
+ case 5: {
+ Matrix temp = mat.topLeftCorner<5, 5>();
+ outvec.head<5>() = temp.ldlt().solve(invec.head<5>());
+ break;
+ }
+ case 4: {
+ Matrix temp = mat.topLeftCorner<4, 4>();
+ outvec.head<4>() = temp.ldlt().solve(invec.head<4>());
+ break;
+ }
+ case 3: {
+ Matrix temp = mat.topLeftCorner<3, 3>();
+ outvec.head<3>() = temp.ldlt().solve(invec.head<3>());
+ break;
+ }
+ case 2: {
+ Matrix temp = mat.topLeftCorner<2, 2>();
+ outvec.head<2>() = temp.ldlt().solve(invec.head<2>());
+ break;
+ }
+ case 1: {
+ Matrix temp = mat.topLeftCorner<1, 1>();
+ outvec.head<1>() = temp.ldlt().solve(invec.head<1>());
+ break;
+ }
+ default:
+ return;
+ }
+ }
+
+ template
+ __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance,
+ MatrixType& inverse_cov,
+ SampleVector const& amplitudes) {
+ constexpr int nsamples = SampleVector::RowsAtCompileTime;
+ constexpr int npulses = BXVectorType::RowsAtCompileTime;
+
+#pragma unroll
+ for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {
+ auto const amplitude = amplitudes.coeff(ipulse);
+ if (amplitude == 0)
+ continue;
+
+ // FIXME: ipulse - 5 -> ipulse - firstOffset
+ int bx = ipulse - 5;
+ int first_sample_t = std::max(0, bx + 3);
+ int offset = -3 - bx;
+
+ auto const value_sq = amplitude * amplitude;
+
+ for (int col = first_sample_t; col < nsamples; col++) {
+ for (int row = col; row < nsamples; row++) {
+ inverse_cov(row, col) += value_sq * __ldg(&pulse_covariance.covval[row + offset][col + offset]);
+ }
+ }
+ }
+
+ return true;
+ }
+
+ ///
+ /// launch ctx parameters are (nchannels / block, blocks)
+ /// TODO: trivial impl for now, there must be a way to improve
+ ///
+ /// Conventions:
+ /// - amplitudes -> solution vector, what we are fitting for
+ /// - samples -> raw detector responses
+ /// - passive constraint - satisfied constraint
+ /// - active constraint - unsatisfied (yet) constraint
+ ///
+ __global__ void kernel_minimize(uint32_t const* dids_eb,
+ uint32_t const* dids_ee,
+ SampleMatrix const* __restrict__ noisecov,
+ EcalPulseCovariance const* __restrict__ pulse_covariance,
+ BXVectorType* bxs,
+ SampleVector const* __restrict__ samples,
+ SampleVector* amplitudes,
+ PulseMatrixType const* __restrict__ pulse_matrix,
+ ::ecal::reco::StorageScalarType* chi2s,
+ ::ecal::reco::StorageScalarType* energies,
+ char* acState,
+ int nchannels,
+ int max_iterations,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs) {
+ // FIXME: ecal has 10 samples and 10 pulses....
+ // but this needs to be properly treated and renamed everywhere
+ constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime;
+ constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime;
+ static_assert(NSAMPLES == NPULSES);
+
+ using DataType = SampleVector::Scalar;
+
+ extern __shared__ char shrmem[];
+ DataType* shrMatrixLForFnnlsStorage =
+ reinterpret_cast(shrmem) + MapSymM::total * threadIdx.x;
+ DataType* shrAtAStorage =
+ reinterpret_cast(shrmem) + MapSymM::total * (threadIdx.x + blockDim.x);
+
+ // FIXME: remove eitehr idx or ch -> they are teh same thing
+ int idx = threadIdx.x + blockDim.x * blockIdx.x;
+ auto const ch = idx;
+ if (idx < nchannels) {
+ if (static_cast(acState[idx]) == MinimizationState::Precomputed)
+ return;
+
+ // get the hash
+ int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+ auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+ auto const did = DetId{dids[inputCh]};
+ auto const isBarrel = did.subdetId() == EcalBarrel;
+ auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+ : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+ // inits
+ int iter = 0;
+ int npassive = 0;
+
+ ColumnVector pulseOffsets;
+#pragma unroll
+ for (int i = 0; i < NPULSES; ++i)
+ pulseOffsets(i) = i;
+
+ ColumnVector resultAmplitudes;
+#pragma unroll
+ for (int counter = 0; counter < NPULSES; counter++)
+ resultAmplitudes(counter) = 0;
+
+ // inits
+ //SampleDecompLLT covariance_decomposition;
+ //SampleMatrix inverse_cov;
+ SampleVector::Scalar chi2 = 0, chi2_now = 0;
+
+ // loop until ocnverge
+ while (true) {
+ if (iter >= max_iterations)
+ break;
+
+ //inverse_cov = noisecov[idx];
+ //DataType covMatrixStorage[MapSymM::total];
+ DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
+ MapSymM covMatrix{covMatrixStorage};
+ int counter = 0;
+#pragma unroll
+ for (int col = 0; col < NSAMPLES; col++)
+#pragma unroll
+ for (int row = col; row < NSAMPLES; row++)
+ covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col));
+
+ update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
+
+ // compute actual covariance decomposition
+ //covariance_decomposition.compute(inverse_cov);
+ //auto const& matrixL = covariance_decomposition.matrixL();
+ DataType matrixLStorage[MapSymM::total];
+ MapSymM matrixL{matrixLStorage};
+ compute_decomposition_unrolled(matrixL, covMatrix);
+
+ // L * A = P
+ ColMajorMatrix A;
+ solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
+
+ // L b = s
+ float reg_b[NSAMPLES];
+ solve_forward_subst_vector(reg_b, samples[idx], matrixL);
+
+ // FIXME: shared mem
+ //DataType AtAStorage[MapSymM::total];
+ MapSymM AtA{shrAtAStorage};
+ //SampleMatrix AtA;
+ SampleVector Atb;
+#pragma unroll
+ for (int icol = 0; icol < NPULSES; icol++) {
+ float reg_ai[NSAMPLES];
+
+// load column icol
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ reg_ai[counter] = A(counter, icol);
+
+ // compute diagoanl
+ float sum = 0.f;
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ sum += reg_ai[counter] * reg_ai[counter];
+
+ // store
+ AtA(icol, icol) = sum;
+
+// go thru the other columns
+#pragma unroll
+ for (int j = icol + 1; j < NPULSES; j++) {
+ // load column j
+ float reg_aj[NSAMPLES];
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ reg_aj[counter] = A(counter, j);
+
+ // accum
+ float sum = 0.f;
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ sum += reg_aj[counter] * reg_ai[counter];
+
+ // store
+ //AtA(icol, j) = sum;
+ AtA(j, icol) = sum;
+ }
+
+ // Atb accum
+ float sum_atb = 0.f;
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ sum_atb += reg_ai[counter] * reg_b[counter];
+
+ // store atb
+ Atb(icol) = sum_atb;
+ }
+
+ // FIXME: shared mem
+ //DataType matrixLForFnnlsStorage[MapSymM::total];
+ MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage};
+
+ fnnls(AtA,
+ Atb,
+ //amplitudes[idx],
+ resultAmplitudes,
+ npassive,
+ pulseOffsets,
+ matrixLForFnnls,
+ 1e-11,
+ 500);
+
+ {
+ DataType accum[NSAMPLES];
+// load accum
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ accum[counter] = -samples[idx](counter);
+
+ // iterate
+ for (int icol = 0; icol < NPULSES; icol++) {
+ DataType pm_col[NSAMPLES];
+
+// preload a column of pulse matrix
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ pm_col[counter] = __ldg(&pulse_matrix[idx].coeffRef(counter, icol));
+
+// accum
+#pragma unroll
+ for (int counter = 0; counter < NSAMPLES; counter++)
+ accum[counter] += resultAmplitudes[icol] * pm_col[counter];
+ }
+
+ DataType reg_L[NSAMPLES];
+ DataType accumSum = 0;
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+ for (int i = 0; i < NSAMPLES; i++)
+ reg_L[i] = matrixL(i, 0);
+
+ // compute x0 and store it
+ auto x_prev = accum[0] / reg_L[0];
+ accumSum += x_prev * x_prev;
+
+// iterate
+#pragma unroll
+ for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+ for (int counter = iL; counter < NSAMPLES; counter++)
+ accum[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+ for (int counter = iL; counter < NSAMPLES; counter++)
+ reg_L[counter] = matrixL(counter, iL);
+
+ // compute the next x for M(iL, icol)
+ x_prev = accum[iL] / reg_L[iL];
+
+ // store teh result value
+ accumSum += x_prev * x_prev;
+ }
+
+ chi2_now = accumSum;
+ }
+
+ auto deltachi2 = chi2_now - chi2;
+ chi2 = chi2_now;
+
+ if (ecal::abs(deltachi2) < 1e-3)
+ break;
+
+ //---- AM: TEST
+ //---- it was 3 lines above, now here as in the CPU version
+ ++iter;
+ }
+
+ // store to global output values
+ // FIXME: amplitudes are used in global directly
+ chi2s[idx] = chi2;
+ energies[idx] = resultAmplitudes(5);
+
+#pragma unroll
+ for (int counter = 0; counter < NPULSES; counter++)
+ amplitudes[idx](counter) = resultAmplitudes(counter);
+ }
+ }
+
+ namespace v1 {
+
+ void minimization_procedure(EventInputDataGPU const& eventInputGPU,
+ EventOutputDataGPU& eventOutputGPU,
+ EventDataForScratchGPU& scratch,
+ ConditionsProducts const& conditions,
+ ConfigurationParameters const& configParameters,
+ cudaStream_t cudaStream) {
+ using DataType = SampleVector::Scalar;
+ unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis;
+ // unsigned int threads_min = conf.threads.x;
+ // TODO: configure from python
+ unsigned int threads_min = configParameters.kernelMinimizeThreads[0];
+ unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
+ uint32_t const offsetForHashes = conditions.offsetForHashes;
+ uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
+ auto const nbytesShared =
+ 2 * threads_min * MapSymM::total * sizeof(DataType);
+ kernel_minimize<<>>(
+ eventInputGPU.ebDigis.ids,
+ eventInputGPU.eeDigis.ids,
+ scratch.noisecov,
+ conditions.pulseCovariances.values,
+ scratch.activeBXs,
+ scratch.samples,
+ (SampleVector*)eventOutputGPU.amplitudesAll,
+ scratch.pulse_matrix,
+ eventOutputGPU.chi2,
+ eventOutputGPU.amplitude,
+ scratch.acState,
+ totalChannels,
+ 50,
+ offsetForHashes,
+ offsetForInputs);
+ cudaCheck(cudaGetLastError());
+ }
+
+ } // namespace v1
+
+ } // namespace multifit
+} // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h
new file mode 100644
index 0000000000000..b8202f75b653b
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h
@@ -0,0 +1,29 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+class EcalPulseShape;
+class EcalPulseCovariance;
+class EcalUncalibratedRecHit;
+
+namespace ecal {
+ namespace multifit {
+
+ namespace v1 {
+
+ void minimization_procedure(EventInputDataGPU const& eventInputGPU,
+ EventOutputDataGPU& eventOutputGPU,
+ EventDataForScratchGPU& scratch,
+ ConditionsProducts const& conditions,
+ ConfigurationParameters const& configParameters,
+ cudaStream_t cudaStream);
+
+ }
+
+ } // namespace multifit
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
index 3b1d2c0cf159d..61eed4689fd20 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
+++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml
@@ -1,21 +1,22 @@
-
-
-
-
-
-
-
-
+
+
+
+
-
-
-
+
+
+
+
+
+
+
-
+
+
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/Common.h b/RecoLocalCalo/EcalRecProducers/plugins/Common.h
new file mode 100644
index 0000000000000..4e7afbd0819db
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/Common.h
@@ -0,0 +1,46 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_Common_h
+#define RecoLocalCalo_EcalRecProducers_plugins_Common_h
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+// a workaround for std::abs not being a constexpr function
+namespace ecal {
+
+ template
+ constexpr T abs(T const& value) {
+ return ::std::max(value, -value);
+ }
+
+ // temporary
+ namespace mgpa {
+
+ constexpr int adc(uint16_t sample) { return sample & 0xfff; }
+ constexpr int gainId(uint16_t sample) { return (sample >> 12) & 0x3; }
+
+ } // namespace mgpa
+
+} // namespace ecal
+
+template
+struct DurationMeasurer {
+ DurationMeasurer(std::string const& msg) : msg_{msg}, start_{std::chrono::high_resolution_clock::now()} {}
+
+ ~DurationMeasurer() {
+ auto end = std::chrono::high_resolution_clock::now();
+ auto duration = std::chrono::duration_cast(end - start_).count();
+ std::cout << msg_ << "\nduration = " << duration << std::endl;
+ }
+
+private:
+ std::string msg_;
+ std::chrono::high_resolution_clock::time_point start_;
+};
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_Common_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..1ea0fcc430882
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h
@@ -0,0 +1,333 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
+
+#include
+
+#include
+#include
+
+#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h"
+#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h"
+#include "CondFormats/EcalObjects/interface/EcalGainRatios.h"
+#include "CondFormats/EcalObjects/interface/EcalPedestals.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
+#include "CondFormats/EcalObjects/interface/EcalWeightSet.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
+
+#include "EigenMatrixTypes_gpu.h"
+
+struct EcalPulseShape;
+class EcalSampleMask;
+class EcalTimeBiasCorrections;
+struct EcalPulseCovariance;
+class EcalDigiCollection;
+class EcalXtalGroupId;
+class EcalSamplesCorrelation;
+class EBDigiCollection;
+class EEDigiCollection;
+
+namespace ecal {
+ namespace multifit {
+
+ enum class TimeComputationState : char { NotFinished = 0, Finished = 1 };
+ enum class MinimizationState : char {
+ NotFinished = 0,
+ Finished = 1,
+ Precomputed = 2,
+ };
+
+ //
+ struct EventInputDataGPU {
+ ecal::DigisCollection const& ebDigis;
+ ecal::DigisCollection const& eeDigis;
+ };
+
+ // parameters have a fixed type
+ // Can we go by with single precision
+ struct ConfigurationParameters {
+ using type = double;
+ // device ptrs
+ type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr;
+
+ uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE;
+ // device ptrs
+ type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr;
+
+ type timeFitLimitsFirstEB, timeFitLimitsFirstEE;
+ type timeFitLimitsSecondEB, timeFitLimitsSecondEE;
+
+ type timeConstantTermEB, timeConstantTermEE;
+
+ type timeNconstEB, timeNconstEE;
+
+ type amplitudeThreshEE, amplitudeThreshEB;
+
+ type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB;
+ type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE;
+ type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE;
+ type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB;
+
+ std::array kernelMinimizeThreads;
+
+ bool shouldRunTimingComputation;
+ };
+
+ struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> {
+ void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+ cudaCheck(cudaMalloc((void**)&litudesAll, size * sizeof(SampleVector)));
+ cudaCheck(cudaMalloc((void**)&litude, size * sizeof(::ecal::reco::StorageScalarType)));
+ cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType)));
+ cudaCheck(cudaMalloc((void**)&pedestal, size * sizeof(::ecal::reco::StorageScalarType)));
+
+ if (configParameters.shouldRunTimingComputation) {
+ cudaCheck(cudaMalloc((void**)&jitter, size * sizeof(::ecal::reco::StorageScalarType)));
+ cudaCheck(cudaMalloc((void**)&jitterError, size * sizeof(::ecal::reco::StorageScalarType)));
+ }
+
+ cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t)));
+ cudaCheck(cudaMalloc((void**)&flags, size * sizeof(uint32_t)));
+ }
+
+ void deallocate(ConfigurationParameters const& configParameters) {
+ cudaCheck(cudaFree(amplitudesAll));
+ cudaCheck(cudaFree(amplitude));
+ cudaCheck(cudaFree(chi2));
+ cudaCheck(cudaFree(pedestal));
+ if (configParameters.shouldRunTimingComputation) {
+ cudaCheck(cudaFree(jitter));
+ cudaCheck(cudaFree(jitterError));
+ }
+ cudaCheck(cudaFree(did));
+ cudaCheck(cudaFree(flags));
+ }
+ };
+
+ struct EventDataForScratchGPU {
+ SampleVector* samples = nullptr;
+ SampleGainVector* gainsNoise = nullptr;
+
+ SampleMatrix* noisecov = nullptr;
+ PulseMatrixType* pulse_matrix = nullptr;
+ BXVectorType* activeBXs = nullptr;
+ char* acState = nullptr;
+
+ bool *hasSwitchToGain6 = nullptr, *hasSwitchToGain1 = nullptr, *isSaturated = nullptr;
+
+ SampleVector::Scalar *sample_values, *sample_value_errors;
+ bool* useless_sample_values;
+ SampleVector::Scalar* chi2sNullHypot;
+ SampleVector::Scalar* sum0sNullHypot;
+ SampleVector::Scalar* sumAAsNullHypot;
+ char* pedestal_nums;
+ SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas;
+ SampleVector::Scalar *accTimeMax, *accTimeWgt;
+ SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError;
+ SampleVector::Scalar *timeMax, *timeError;
+ TimeComputationState* tcState;
+
+ void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+ cudaCheck(cudaMalloc((void**)&samples, size * sizeof(SampleVector)));
+ cudaCheck(cudaMalloc((void**)&gainsNoise, size * sizeof(SampleGainVector)));
+
+ cudaCheck(cudaMalloc((void**)&noisecov, size * sizeof(SampleMatrix)));
+ cudaCheck(cudaMalloc((void**)&pulse_matrix, size * sizeof(PulseMatrixType)));
+ cudaCheck(cudaMalloc((void**)&activeBXs, size * sizeof(BXVectorType)));
+ cudaCheck(cudaMalloc((void**)&acState, size * sizeof(char)));
+
+ cudaCheck(cudaMalloc((void**)&hasSwitchToGain6, size * sizeof(bool)));
+ cudaCheck(cudaMalloc((void**)&hasSwitchToGain1, size * sizeof(bool)));
+ cudaCheck(cudaMalloc((void**)&isSaturated, size * sizeof(bool)));
+
+ if (configParameters.shouldRunTimingComputation) {
+ cudaCheck(cudaMalloc((void**)&sample_values, size * sizeof(SampleVector)));
+ cudaCheck(cudaMalloc((void**)&sample_value_errors, size * sizeof(SampleVector)));
+ cudaCheck(cudaMalloc((void**)&useless_sample_values, size * sizeof(bool) * EcalDataFrame::MAXSAMPLES));
+ cudaCheck(cudaMalloc((void**)&chi2sNullHypot, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&sum0sNullHypot, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&sumAAsNullHypot, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&pedestal_nums, size * sizeof(char)));
+
+ cudaCheck(cudaMalloc((void**)&tMaxAlphaBetas, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&tMaxErrorAlphaBetas, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&accTimeMax, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&accTimeWgt, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&MaxAlphaBeta, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&MaxError, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&timeMax, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&timeError, size * sizeof(SampleVector::Scalar)));
+ cudaCheck(cudaMalloc((void**)&tcState, size * sizeof(TimeComputationState)));
+ }
+ }
+
+ void deallocate(ConfigurationParameters const& configParameters) {
+ cudaCheck(cudaFree(samples));
+ cudaCheck(cudaFree(gainsNoise));
+
+ cudaCheck(cudaFree(noisecov));
+ cudaCheck(cudaFree(pulse_matrix));
+ cudaCheck(cudaFree(activeBXs));
+ cudaCheck(cudaFree(acState));
+
+ cudaCheck(cudaFree(hasSwitchToGain6));
+ cudaCheck(cudaFree(hasSwitchToGain1));
+ cudaCheck(cudaFree(isSaturated));
+
+ if (configParameters.shouldRunTimingComputation) {
+ cudaCheck(cudaFree(sample_values));
+ cudaCheck(cudaFree(sample_value_errors));
+ cudaCheck(cudaFree(useless_sample_values));
+ cudaCheck(cudaFree(chi2sNullHypot));
+ cudaCheck(cudaFree(sum0sNullHypot));
+ cudaCheck(cudaFree(sumAAsNullHypot));
+ cudaCheck(cudaFree(pedestal_nums));
+
+ cudaCheck(cudaFree(tMaxAlphaBetas));
+ cudaCheck(cudaFree(tMaxErrorAlphaBetas));
+ cudaCheck(cudaFree(accTimeMax));
+ cudaCheck(cudaFree(accTimeWgt));
+ cudaCheck(cudaFree(ampMaxAlphaBeta));
+ cudaCheck(cudaFree(ampMaxError));
+ cudaCheck(cudaFree(timeMax));
+ cudaCheck(cudaFree(timeError));
+ cudaCheck(cudaFree(tcState));
+ }
+ }
+ };
+
+ // const refs products to conditions
+ struct ConditionsProducts {
+ EcalPedestalsGPU::Product const& pedestals;
+ EcalGainRatiosGPU::Product const& gainRatios;
+ EcalPulseShapesGPU::Product const& pulseShapes;
+ EcalPulseCovariancesGPU::Product const& pulseCovariances;
+ EcalSamplesCorrelationGPU::Product const& samplesCorrelation;
+ EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections;
+ EcalTimeCalibConstantsGPU::Product const& timeCalibConstants;
+ EcalSampleMask const& sampleMask;
+ EcalTimeOffsetConstant const& timeOffsetConstant;
+ uint32_t offsetForHashes;
+ };
+
+ //*/
+
+ struct xyz {
+ int x, y, z;
+ };
+
+ struct conf_data {
+ xyz threads;
+ bool runV1;
+ cudaStream_t cuStream;
+ };
+
+ } // namespace multifit
+} // namespace ecal
+
+//
+// ECAL Rechit producer
+//
+
+namespace ecal {
+ namespace rechit {
+
+ // parameters that are read in the configuration file for rechit producer
+ struct ConfigurationParameters {
+ // device ptrs
+ int* ChannelStatusToBeExcluded = nullptr;
+ uint32_t ChannelStatusToBeExcludedSize;
+
+ bool killDeadChannels;
+
+ bool recoverEBIsolatedChannels;
+ bool recoverEEIsolatedChannels;
+ bool recoverEBVFE;
+ bool recoverEEVFE;
+ bool recoverEBFE;
+ bool recoverEEFE;
+
+ float EBLaserMIN;
+ float EELaserMIN;
+ float EBLaserMAX;
+ float EELaserMAX;
+
+ // std::vector > v_DB_reco_flags;
+ int* expanded_v_DB_reco_flags;
+ uint32_t* expanded_Sizes_v_DB_reco_flags;
+ uint32_t* expanded_flagbit_v_DB_reco_flags;
+ uint32_t expanded_v_DB_reco_flagsSize;
+
+ uint32_t flagmask;
+
+ //
+ // bool shouldRunTimingComputation;
+ };
+
+ struct EventOutputDataGPU final : public ::ecal::RecHit<::ecal::Tag::ptr> {
+ void allocate(ConfigurationParameters const& configParameters, uint32_t size) {
+ // void allocate(uint32_t size) {
+ //---- configParameters -> needed only to decide if to save the timing information or not
+
+ cudaCheck(cudaMalloc((void**)&energy, size * sizeof(::ecal::reco::StorageScalarType)));
+ cudaCheck(cudaMalloc((void**)&time, size * sizeof(::ecal::reco::StorageScalarType)));
+ cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType)));
+ cudaCheck(cudaMalloc((void**)&flagBits, size * sizeof(uint32_t)));
+ cudaCheck(cudaMalloc((void**)&extra, size * sizeof(uint32_t)));
+ cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t)));
+ }
+
+ void deallocate(ConfigurationParameters const& configParameters) {
+ // void deallocate() {
+ //---- configParameters -> needed only to decide if to save the timing information or not
+
+ cudaCheck(cudaFree(energy));
+ cudaCheck(cudaFree(time));
+ cudaCheck(cudaFree(chi2));
+ cudaCheck(cudaFree(flagBits));
+ cudaCheck(cudaFree(extra));
+ cudaCheck(cudaFree(did));
+ }
+ };
+
+ struct EventInputDataGPU {
+ ecal::UncalibratedRecHit const& ebUncalibRecHits;
+ ecal::UncalibratedRecHit const& eeUncalibRecHits;
+ };
+
+ // const refs products to conditions
+ struct ConditionsProducts {
+ EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV;
+ EcalIntercalibConstantsGPU::Product const& Intercalib;
+ EcalRechitChannelStatusGPU::Product const& ChannelStatus;
+ //
+ EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios;
+ EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef;
+ EcalLaserAlphasGPU::Product const& LaserAlphas;
+ EcalLinearCorrectionsGPU::Product const& LinearCorrections;
+ //
+ //
+ uint32_t offsetForHashes;
+ };
+
+ } // namespace rechit
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu
new file mode 100644
index 0000000000000..d816414aa71a0
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu
@@ -0,0 +1,673 @@
+#include
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+
+#include "EcalRecHitBuilderKernels.h"
+#include "KernelHelpers.h"
+
+namespace ecal {
+ namespace rechit {
+
+ // uncalibrecHit flags
+ enum UncalibRecHitFlags {
+ kGood = -1, // channel is good (mutually exclusive with other states) setFlagBit(kGood) reset flags_ to zero
+ kPoorReco, // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.)
+ kSaturated, // saturated channel
+ kOutOfTime, // channel out of time
+ kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation
+ kHasSwitchToGain6, // at least one data frame is in G6
+ kHasSwitchToGain1 // at least one data frame is in G1
+ };
+
+ // recHit flags
+ enum RecHitFlags {
+ RecHitFlags_kGood = 0, // channel ok, the energy and time measurement are reliable
+ RecHitFlags_kPoorReco, // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2)
+ RecHitFlags_kOutOfTime, // the energy is available from the UncalibRecHit (sync reco), but the event is out of time
+ RecHitFlags_kFaultyHardware, // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy)
+ RecHitFlags_kNoisy, // the channel is very noisy
+ RecHitFlags_kPoorCalib, // the energy is available from the UncalibRecHit, but the calibration of the channel is poor
+ RecHitFlags_kSaturated, // saturated channel (recovery not tried)
+ RecHitFlags_kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation
+ RecHitFlags_kNeighboursRecovered, // saturated/isolated dead: energy estimated from neighbours
+ RecHitFlags_kTowerRecovered, // channel in TT with no data link, info retrieved from Trigger Primitive
+ RecHitFlags_kDead, // channel is dead and any recovery fails
+ RecHitFlags_kKilled, // MC only flag: the channel is killed in the real detector
+ RecHitFlags_kTPSaturated, // the channel is in a region with saturated TP
+ RecHitFlags_kL1SpikeFlag, // the channel is in a region with TP with sFGVB = 0
+ RecHitFlags_kWeird, // the signal is believed to originate from an anomalous deposit (spike)
+ RecHitFlags_kDiWeird, // the signal is anomalous, and neighbors another anomalous signal
+ RecHitFlags_kHasSwitchToGain6, // at least one data frame is in G6
+ RecHitFlags_kHasSwitchToGain1, // at least one data frame is in G1
+ //
+ RecHitFlags_kUnknown // to ease the interface with functions returning flags.
+ };
+
+ // status code
+ enum EcalChannelStatusCode_Code {
+ kOk = 0,
+ kDAC,
+ kNoLaser,
+ kNoisy,
+ kNNoisy,
+ kNNNoisy,
+ kNNNNoisy,
+ kNNNNNoisy,
+ kFixedG6,
+ kFixedG1,
+ kFixedG0,
+ kNonRespondingIsolated,
+ kDeadVFE,
+ kDeadFE,
+ kNoDataNoTP
+ };
+
+ __global__ void kernel_create_ecal_rehit(
+ // configuration
+ int const* ChannelStatusToBeExcluded,
+ uint32_t ChannelStatusToBeExcludedSize,
+ bool const killDeadChannels,
+ bool const recoverEBIsolatedChannels,
+ bool const recoverEEIsolatedChannels,
+ bool const recoverEBVFE,
+ bool const recoverEEVFE,
+ bool const recoverEBFE,
+ bool const recoverEEFE,
+ float const EBLaserMIN,
+ float const EELaserMIN,
+ float const EBLaserMAX,
+ float const EELaserMAX,
+ // for flags setting
+ int const* expanded_v_DB_reco_flags, // FIXME AM: to be checked
+ uint32_t const* expanded_Sizes_v_DB_reco_flags,
+ uint32_t const* expanded_flagbit_v_DB_reco_flags,
+ uint32_t expanded_v_DB_reco_flagsSize,
+ uint32_t flagmask,
+ // conditions
+ float const* adc2gev,
+ float const* intercalib,
+ uint16_t const* status,
+ float const* apdpnrefs,
+ float const* alphas,
+ // input for transparency corrections
+ float const* p1,
+ float const* p2,
+ float const* p3,
+ edm::TimeValue_t const* t1,
+ edm::TimeValue_t const* t2,
+ edm::TimeValue_t const* t3,
+ // input for linear corrections
+ float const* lp1,
+ float const* lp2,
+ float const* lp3,
+ edm::TimeValue_t const* lt1,
+ edm::TimeValue_t const* lt2,
+ edm::TimeValue_t const* lt3,
+ // time, used for time dependent corrections
+ edm::TimeValue_t const event_time,
+ // input
+ uint32_t const* did_eb,
+ uint32_t const* did_ee,
+ ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts
+ ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts
+ ::ecal::reco::StorageScalarType const* time_eb,
+ ::ecal::reco::StorageScalarType const* time_ee,
+ ::ecal::reco::StorageScalarType const* chi2_eb,
+ ::ecal::reco::StorageScalarType const* chi2_ee,
+ uint32_t const* flags_eb,
+ uint32_t const* flags_ee,
+ // output
+ uint32_t* did,
+ ::ecal::reco::StorageScalarType* energy, // in energy [GeV]
+ ::ecal::reco::StorageScalarType* time,
+ ::ecal::reco::StorageScalarType* chi2,
+ uint32_t* flagBits,
+ uint32_t* extra,
+ // other
+ int const nchannels,
+ uint32_t const nChannelsBarrel,
+ uint32_t const offsetForHashes) {
+ //
+ // NB: energy "type_wrapper::type" most likely std::vector
+ //
+
+ for (int ch = threadIdx.x + blockDim.x * blockIdx.x; ch < nchannels; ch += blockDim.x * gridDim.x) {
+ // int ch = threadIdx.x + blockDim.x*blockIdx.x;
+
+ // if (ch < nchannels) {
+
+ bool isEndcap = (ch >= nChannelsBarrel);
+
+ int const inputCh = isEndcap ? ch - nChannelsBarrel : ch;
+
+ uint32_t const* didCh = isEndcap ? did_ee : did_eb;
+
+ // only two values, EB or EE
+ // AM : FIXME : why not using "isBarrel" ? isBarrel ? adc2gev[0] : adc2gev[1]
+ float adc2gev_to_use = isEndcap ? adc2gev[1] // ee
+ : adc2gev[0]; // eb
+
+ // first EB and then EE
+
+ ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb;
+
+ //::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb;
+
+ ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? chi2_ee : chi2_eb;
+
+ uint32_t const* flags_in = isEndcap ? flags_ee : flags_eb;
+
+ // simple copy
+ did[ch] = didCh[inputCh];
+
+ auto const did_to_use = DetId{didCh[inputCh]};
+
+ auto const isBarrel = did_to_use.subdetId() == EcalBarrel;
+ auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId())
+ : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId());
+
+ float const intercalib_to_use = intercalib[hashedId];
+
+ // get laser coefficient
+ float lasercalib = 1.;
+
+ //
+ // AM: ideas
+ //
+ // One possibility is to create the map of laser corrections once on CPU
+ // for all crystals and push them on GPU.
+ // Then only if the LS is different, update the laser correction
+ // The variation within a LS is not worth pursuing (<< 0.1% !!)
+ // and below the precision we can claim on the laser corrections (right?).
+ // This will save quite some time (also for the CPU version?)
+ //
+
+ int iLM = 1;
+
+ if (isBarrel) {
+ iLM = ecal::reconstruction::laser_monitoring_region_EB(did_to_use.rawId());
+ } else {
+ iLM = ecal::reconstruction::laser_monitoring_region_EE(did_to_use.rawId());
+ }
+
+ long long t_i = 0, t_f = 0;
+ float p_i = 0, p_f = 0;
+ long long lt_i = 0, lt_f = 0;
+ float lp_i = 0, lp_f = 0;
+
+ // laser
+ if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) {
+ t_i = t1[iLM - 1];
+ t_f = t2[iLM - 1];
+ p_i = p1[hashedId];
+ p_f = p2[hashedId];
+ } else if (event_time >= t2[iLM - 1] && event_time <= t3[iLM - 1]) {
+ t_i = t2[iLM - 1];
+ t_f = t3[iLM - 1];
+ p_i = p2[hashedId];
+ p_f = p3[hashedId];
+ } else if (event_time < t1[iLM - 1]) {
+ t_i = t1[iLM - 1];
+ t_f = t2[iLM - 1];
+ p_i = p1[hashedId];
+ p_f = p2[hashedId];
+
+ } else if (event_time > t3[iLM - 1]) {
+ t_i = t2[iLM - 1];
+ t_f = t3[iLM - 1];
+ p_i = p2[hashedId];
+ p_f = p3[hashedId];
+ }
+
+ // linear corrections
+ if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) {
+ lt_i = lt1[iLM - 1];
+ lt_f = lt2[iLM - 1];
+ lp_i = lp1[hashedId];
+ lp_f = lp2[hashedId];
+ } else if (event_time >= lt2[iLM - 1] && event_time <= lt3[iLM - 1]) {
+ lt_i = lt2[iLM - 1];
+ lt_f = lt3[iLM - 1];
+ lp_i = lp2[hashedId];
+ lp_f = lp3[hashedId];
+ } else if (event_time < lt1[iLM - 1]) {
+ lt_i = lt1[iLM - 1];
+ lt_f = lt2[iLM - 1];
+ lp_i = lp1[hashedId];
+ lp_f = lp2[hashedId];
+
+ } else if (event_time > lt3[iLM - 1]) {
+ lt_i = lt2[iLM - 1];
+ lt_f = lt3[iLM - 1];
+ lp_i = lp2[hashedId];
+ lp_f = lp3[hashedId];
+ }
+
+ // apdpnref and alpha
+ float apdpnref = apdpnrefs[hashedId];
+ float alpha = alphas[hashedId];
+
+ // now calculate transparency correction
+ if (apdpnref != 0 && (t_i - t_f) != 0 && (lt_i - lt_f) != 0) {
+ long long tt = event_time; // never subtract two unsigned!
+ float interpolatedLaserResponse =
+ p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i));
+
+ float interpolatedLinearResponse =
+ lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i)); // FIXED BY FC
+
+ if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) {
+ interpolatedLinearResponse = 1.f;
+ }
+ if (interpolatedLaserResponse <= 0.) {
+ // AM : how the heck is it possible?
+ // interpolatedLaserResponse = 0.0001;
+ lasercalib = 1.;
+
+ } else {
+ float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse;
+
+ // ... and now this:
+ lasercalib = 1.f / (std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse);
+ }
+ }
+
+ //
+ // Check for channels to be excluded from reconstruction
+ //
+ //
+ // Default energy? Not to be updated if "ChannelStatusToBeExcluded"
+ // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat"
+ //
+ energy[ch] = -1; //---- AM: default, un-physical, ok
+
+ // truncate the chi2
+ if (chi2_in[inputCh] > 64)
+ chi2[ch] = 64;
+ else
+ chi2[ch] = chi2_in[inputCh];
+
+ // default values for the flags
+ flagBits[ch] = 0;
+ extra[ch] = 0;
+
+ static const int chStatusMask = 0x1f;
+ // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same
+ int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask);
+ if (ChannelStatusToBeExcludedSize != 0) {
+ bool skip_this_channel = false;
+ for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) {
+ if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) {
+ skip_this_channel = true;
+ break;
+ }
+ }
+ if (skip_this_channel) {
+ // skip this channel
+ continue;
+ }
+ }
+
+ // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word
+
+ //
+ // AM: get the smaller "flagbit_counter" with match
+ //
+
+ uint32_t temporary_flagBits = 0;
+
+ int iterator_flags = 0;
+ bool need_to_exit = false;
+ int flagbit_counter = 0;
+ while (!need_to_exit) {
+ iterator_flags = 0;
+ for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) {
+ // check the correct "flagbit"
+ if (expanded_flagbit_v_DB_reco_flags[i] == flagbit_counter) {
+ for (unsigned int j = 0; j < expanded_Sizes_v_DB_reco_flags[i]; j++) {
+ if (expanded_v_DB_reco_flags[iterator_flags] == dbstatus) {
+ temporary_flagBits = 0x1 << expanded_flagbit_v_DB_reco_flags[i];
+ need_to_exit = true;
+ break; // also from the big loop!!!
+ }
+ iterator_flags++;
+ }
+ } else {
+ // if not, got to the next bunch directly
+ iterator_flags += expanded_Sizes_v_DB_reco_flags[i];
+ }
+
+ if (need_to_exit) {
+ break;
+ }
+ }
+ flagbit_counter += 1;
+ }
+
+ flagBits[ch] = temporary_flagBits;
+
+ if ((flagmask & temporary_flagBits) && killDeadChannels) {
+ // skip this channel
+ continue;
+ }
+
+ //
+ // multiply the adc counts with factors to get GeV
+ //
+
+ // energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ;
+ energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib;
+
+ // Time is not saved so far, FIXME
+ // time[ch] = time_in[inputCh];
+
+ // NB: calculate the "flagBits extra" --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ...
+
+ //
+ // extra packing ...
+ //
+
+ uint32_t offset;
+ uint32_t width;
+ uint32_t value;
+
+ float chi2_temp = chi2[ch];
+ if (chi2_temp > 64)
+ chi2_temp = 64;
+ // use 7 bits
+ uint32_t rawChi2 = lround(chi2_temp / 64. * ((1 << 7) - 1));
+
+ offset = 0;
+ width = 7;
+ value = 0;
+
+ uint32_t mask = ((1 << width) - 1) << offset;
+ value &= ~mask;
+ value |= (rawChi2 & ((1U << width) - 1)) << offset;
+
+ // extra[ch] = value;
+ //
+
+ // rawEnergy is actually "error" !!!
+ uint32_t rawEnergy = 0;
+
+ // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA
+ // if you want to store this in "extra", we need first to add it to the uncalibrecHit results
+ // then it will be something like the following
+ // amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib
+ //
+ //
+
+ float amplitudeError_ch = 0.; // amplitudeError[ch];
+
+ if (amplitudeError_ch > 0.001) {
+ // uint16_t exponent = getPower10(amplitudeError_ch);
+
+ static constexpr float p10[] = {1.e-2f, 1.e-1f, 1.f, 1.e1f, 1.e2f, 1.e3f, 1.e4f, 1.e5f, 1.e6f};
+ int b = amplitudeError_ch < p10[4] ? 0 : 5;
+ for (; b < 9; ++b)
+ if (amplitudeError_ch < p10[b])
+ break;
+
+ uint16_t exponent = b;
+
+ static constexpr float ip10[] = {1.e5f, 1.e4f, 1.e3f, 1.e2f, 1.e1f, 1.e0f, 1.e-1f, 1.e-2f, 1.e-3f, 1.e-4};
+ uint16_t significand = lround(amplitudeError_ch * ip10[exponent]);
+ // use 13 bits (3 exponent, 10 significand)
+ rawEnergy = exponent << 10 | significand;
+ }
+
+ offset = 8;
+ width = 13;
+ // value from last change, ok
+
+ mask = ((1 << width) - 1) << offset;
+ value &= ~mask;
+ value |= (rawEnergy & ((1U << width) - 1)) << offset;
+
+ uint32_t jitterErrorBits = 0;
+ jitterErrorBits = jitterErrorBits & 0xFF;
+
+ offset = 24;
+ width = 8;
+ // value from last change, ok
+
+ mask = ((1 << width) - 1) << offset;
+ value &= ~mask;
+ value |= (jitterErrorBits & ((1U << width) - 1)) << offset;
+
+ //
+ // now finally set "extra[ch]"
+ //
+ extra[ch] = value;
+
+ //
+ // additional flags setting
+ //
+ // using correctly the flags as calculated at the UncalibRecHit stage
+ //
+ // Now fill flags
+
+ bool good = true;
+
+ if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kLeadingEdgeRecovered))) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kLeadingEdgeRecovered));
+ good = false;
+ }
+
+ if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) {
+ // leading edge recovery failed - still keep the information
+ // about the saturation and do not flag as dead
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated));
+ good = false;
+ }
+
+ //
+ // AM: why do we have two tests one after the other checking almost the same thing???
+ // Please clean up the code, ... also the original one!
+ //
+ // uncalibRH.isSaturated() --->
+ //
+ // bool EcalUncalibratedRecHit::isSaturated() const {
+ // return EcalUncalibratedRecHit::checkFlag(kSaturated);
+ // }
+ //
+ //
+
+ if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated));
+ good = false;
+ }
+
+ if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kOutOfTime))) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime));
+ good = false;
+ }
+ if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kPoorReco))) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco));
+ good = false;
+ }
+ if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain6))) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6));
+ }
+ if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain1))) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1));
+ }
+
+ if (good) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kGood));
+ }
+
+ if (isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib));
+ }
+ if (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX)) {
+ flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib));
+ }
+
+ // recover, killing, and other stuff
+
+ //
+ // Structure:
+ // EB
+ // EE
+ //
+ //
+ // - single MVA
+ // - democratic sharing
+ // - kill all the other cases
+ //
+
+ bool is_Single = false;
+ bool is_FE = false;
+ bool is_VFE = false;
+
+ bool is_recoverable = false; // DetIdToBeRecovered
+
+ if (dbstatus == 10 || dbstatus == 11 || dbstatus == 12) {
+ is_recoverable = true;
+ }
+
+ if (is_recoverable) {
+ if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
+ is_VFE = true;
+ } else if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) {
+ is_FE = true;
+ } else {
+ is_Single = true;
+ }
+
+ // EB
+ if (isBarrel) {
+ if (is_Single || is_FE || is_VFE) {
+ // single MVA
+ if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) {
+ }
+ // decmocratic sharing
+ else if (is_FE && (recoverEBFE || !killDeadChannels)) {
+ }
+ // kill all the other cases
+ else {
+ energy[ch] = 0.; // Need to set also the flags ...
+ }
+ }
+ }
+ // EE
+ else {
+ if (is_Single || is_FE || is_VFE) {
+ // single MVA
+ if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) {
+ }
+ // decmocratic sharing
+ else if (is_FE && (recoverEBFE || !killDeadChannels)) {
+ //
+ // Code is definitely too long ...
+ //
+
+ }
+ // kill all the other cases
+ else {
+ energy[ch] = 0.; // Need to set also the flags ...
+ }
+ }
+ }
+ }
+
+ } // end channel
+ }
+
+ // host version, to be called by the plugin
+ void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+ EventOutputDataGPU& eventOutputGPU,
+ // eventDataForScratchGPU_,
+ ConditionsProducts const& conditions,
+ ConfigurationParameters const& configParameters,
+ uint32_t const nChannelsBarrel,
+ edm::TimeValue_t const event_time,
+ cudaStream_t cudaStream) {
+ int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size;
+
+ // unsigned int nchannels_per_block = 32;
+ unsigned int nchannels_per_block = 16;
+ unsigned int threads_min = nchannels_per_block;
+ unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min; // TEST : to be optimized (AM)
+
+ //
+ // kernel create rechit
+ //
+
+ // auto const nbytesShared = 2 * threads_min * MapSymM::total * sizeof(DataType);
+
+ kernel_create_ecal_rehit<<>>(
+ // kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> (
+ // kernel_create_ecal_rehit <<< blocks_min, threads_min >>> (
+ // configuration
+ configParameters.ChannelStatusToBeExcluded,
+ configParameters.ChannelStatusToBeExcludedSize,
+ configParameters.killDeadChannels,
+ configParameters.recoverEBIsolatedChannels,
+ configParameters.recoverEEIsolatedChannels,
+ configParameters.recoverEBVFE,
+ configParameters.recoverEEVFE,
+ configParameters.recoverEBFE,
+ configParameters.recoverEEFE,
+ configParameters.EBLaserMIN,
+ configParameters.EELaserMIN,
+ configParameters.EBLaserMAX,
+ configParameters.EELaserMAX,
+ // for flags setting
+ configParameters.expanded_v_DB_reco_flags,
+ configParameters.expanded_Sizes_v_DB_reco_flags,
+ configParameters.expanded_flagbit_v_DB_reco_flags,
+ configParameters.expanded_v_DB_reco_flagsSize,
+ configParameters.flagmask,
+ // conditions
+ conditions.ADCToGeV.adc2gev,
+ conditions.Intercalib.values,
+ conditions.ChannelStatus.status,
+ conditions.LaserAPDPNRatiosRef.values,
+ conditions.LaserAlphas.values,
+ // input for transparency corrections
+ conditions.LaserAPDPNRatios.p1,
+ conditions.LaserAPDPNRatios.p2,
+ conditions.LaserAPDPNRatios.p3,
+ conditions.LaserAPDPNRatios.t1,
+ conditions.LaserAPDPNRatios.t2,
+ conditions.LaserAPDPNRatios.t3,
+ // input for linear corrections
+ conditions.LinearCorrections.p1,
+ conditions.LinearCorrections.p2,
+ conditions.LinearCorrections.p3,
+ conditions.LinearCorrections.t1,
+ conditions.LinearCorrections.t2,
+ conditions.LinearCorrections.t3,
+ // time, used for time dependent corrections
+ event_time,
+ // input
+ eventInputGPU.ebUncalibRecHits.did,
+ eventInputGPU.eeUncalibRecHits.did,
+ eventInputGPU.ebUncalibRecHits.amplitude,
+ eventInputGPU.eeUncalibRecHits.amplitude,
+ eventInputGPU.ebUncalibRecHits.jitter,
+ eventInputGPU.eeUncalibRecHits.jitter,
+ eventInputGPU.ebUncalibRecHits.chi2,
+ eventInputGPU.eeUncalibRecHits.chi2,
+ eventInputGPU.ebUncalibRecHits.flags,
+ eventInputGPU.eeUncalibRecHits.flags,
+ // output
+ eventOutputGPU.did,
+ eventOutputGPU.energy,
+ eventOutputGPU.time,
+ eventOutputGPU.chi2,
+ eventOutputGPU.flagBits,
+ eventOutputGPU.extra,
+ // other
+ nchannels,
+ nChannelsBarrel,
+ conditions.offsetForHashes);
+ }
+
+ } // namespace rechit
+
+} // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h
new file mode 100644
index 0000000000000..831c602a24e61
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h
@@ -0,0 +1,94 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
+
+//
+// Builder of ECAL RecHits on GPU
+//
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "DataFormats/Provenance/interface/Timestamp.h"
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+
+namespace ecal {
+ namespace rechit {
+
+ __global__ void kernel_create_ecal_rehit(
+ // configuration
+ int const* ChannelStatusToBeExcluded,
+ uint32_t ChannelStatusToBeExcludedSize,
+ bool killDeadChannels,
+ bool const recoverEBIsolatedChannels,
+ bool const recoverEEIsolatedChannels,
+ bool const recoverEBVFE,
+ bool const recoverEEVFE,
+ bool const recoverEBFE,
+ bool const recoverEEFE,
+ // for flags setting
+ int const* expanded_v_DB_reco_flags,
+ uint32_t const* expanded_Sizes_v_DB_reco_flags,
+ uint32_t const* expanded_flagbit_v_DB_reco_flags,
+ uint32_t expanded_v_DB_reco_flagsSize,
+ uint32_t flagmask,
+ // conditions
+ float const* adc2gev,
+ float const* intercalib,
+ uint16_t const* status,
+ float const* apdpnrefs,
+ float const* alphas,
+ // input for transparency corrections
+ float const* p1,
+ float const* p2,
+ float const* p3,
+ edm::TimeValue_t const* t1,
+ edm::TimeValue_t const* t2,
+ edm::TimeValue_t const* t3,
+ // input for linear corrections
+ float const* lp1,
+ float const* lp2,
+ float const* lp3,
+ edm::TimeValue_t const* lt1,
+ edm::TimeValue_t const* lt2,
+ edm::TimeValue_t const* lt3,
+ // time, used for time dependent corrections
+ edm::TimeValue_t const event_time,
+ // input
+ uint32_t const* did_eb,
+ uint32_t const* did_ee,
+ ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts
+ ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts
+ ::ecal::reco::StorageScalarType const* time_eb,
+ ::ecal::reco::StorageScalarType const* time_ee,
+ ::ecal::reco::StorageScalarType const* chi2_eb,
+ ::ecal::reco::StorageScalarType const* chi2_ee,
+ uint32_t const* flags_eb,
+ uint32_t const* flags_ee,
+ // output
+ uint32_t* did,
+ ::ecal::reco::StorageScalarType* energy, // in energy [GeV]
+ ::ecal::reco::StorageScalarType* time,
+ ::ecal::reco::StorageScalarType* chi2,
+ uint32_t* flagBits,
+ uint32_t* extra,
+ int const nchannels,
+ uint32_t const nChannelsBarrel,
+ uint32_t const offsetForHashes);
+
+ // host version, to be called by the plugin
+
+ void create_ecal_rehit(EventInputDataGPU const& eventInputGPU,
+ EventOutputDataGPU& eventOutputGPU,
+ // eventDataForScratchGPU_,
+ ConditionsProducts const& conditions,
+ ConfigurationParameters const& configParameters,
+ uint32_t const nChannelsBarrel,
+ edm::TimeValue_t const event_time,
+ cudaStream_t cudaStream);
+
+ } // namespace rechit
+
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
index 151762c6b63d3..5766fa0ffcb85 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
@@ -1,18 +1,16 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
+#include
-// algorithm specific
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include
+#include "Common.h"
class EcalRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
public:
@@ -57,7 +55,7 @@ EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {}
void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) {
auto const& hRecHitsGPUEB = event.get(recHitsGPUEB_);
auto const& hRecHitsGPUEE = event.get(recHitsGPUEE_);
-
+
auto recHitsCPUEB = std::make_unique();
auto recHitsCPUEE = std::make_unique();
recHitsCPUEB->reserve(hRecHitsGPUEB.energy.size());
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
index 248248a4cca88..c13d00a530feb 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
@@ -1,50 +1,33 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-// format
-#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
-
-// needed for definition of flags
-#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
-
-// the kernels
-#include "RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h"
-
-// conditions cpu
+#include "CommonTools/Utils/interface/StringToEnumValue.h"
#include "CondFormats/DataRecord/interface/EcalADCToGeVConstantRcd.h"
-#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h"
#include "CondFormats/DataRecord/interface/EcalChannelStatusRcd.h"
-
+#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h"
#include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRcd.h"
#include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRefRcd.h"
#include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h"
#include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h"
-
-// conditions gpu
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
-
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-
-// configuration
-#include "CommonTools/Utils/interface/StringToEnumValue.h"
+#include "EcalRecHitBuilderKernels.h"
class EcalRecHitProducerGPU : public edm::stream::EDProducer {
public:
@@ -175,18 +158,16 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
configParameters_.ChannelStatusToBeExcludedSize = v_chstatus_.size();
-
// call CUDA API functions only if CUDA is available
edm::Service cs;
if (cs and cs->enabled()) {
-
cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size()));
cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded,
v_chstatus_.data(),
v_chstatus_.size() * sizeof(int),
cudaMemcpyHostToDevice));
}
-
+
//
// https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc
//
@@ -216,33 +197,33 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
// call CUDA API functions only if CUDA is available
if (cs and cs->enabled()) {
// actual values
- cudaCheck(
- cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size()));
-
+ cudaCheck(cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags,
+ sizeof(int) * expanded_v_DB_reco_flags_.size()));
+
cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags,
expanded_v_DB_reco_flags_.data(),
expanded_v_DB_reco_flags_.size() * sizeof(int),
cudaMemcpyHostToDevice));
-
+
// sizes
cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags,
sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size()));
-
+
cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags,
expanded_Sizes_v_DB_reco_flags_.data(),
expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t),
cudaMemcpyHostToDevice));
-
+
// keys
cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags,
sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size()));
-
+
cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags,
expanded_flagbit_v_DB_reco_flags_.data(),
expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t),
cudaMemcpyHostToDevice));
}
-
+
configParameters_.expanded_v_DB_reco_flagsSize = expanded_flagbit_v_DB_reco_flags_.size();
flagmask_ = 0;
@@ -266,7 +247,6 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
}
EcalRecHitProducerGPU::~EcalRecHitProducerGPU() {
-
edm::Service cs;
if (cs and cs->enabled()) {
// free event ouput data
@@ -303,7 +283,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
if ((neb_ + nee_) > maxNumberHits_) {
edm::LogError("EcalRecHitProducerGPU") << "max number of channels exceeded. See options 'maxNumberHits' ";
}
-
+
int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE
// conditions
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
index 20f51ea5245df..69c6b7a67831c 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc
@@ -1,18 +1,16 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
+#include
-// algorithm specific
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
-#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include
+#include "Common.h"
class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> {
public:
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
new file mode 100644
index 0000000000000..e79a5dff85d9f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu
@@ -0,0 +1,312 @@
+#include
+#include
+
+#include
+
+#include "CondFormats/EcalObjects/interface/EcalMGPAGainRatio.h"
+#include "CondFormats/EcalObjects/interface/EcalPedestals.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
+#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
+#include "CondFormats/EcalObjects/interface/EcalSampleMask.h"
+#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
+#include "CondFormats/EcalObjects/interface/EcalXtalGroupId.h"
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+
+#include "AmplitudeComputationCommonKernels.h"
+#include "AmplitudeComputationKernels.h"
+#include "Common.h"
+#include "EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
+#include "TimeComputationKernels.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+ namespace multifit {
+
+ void entryPoint(EventInputDataGPU const& eventInputGPU,
+ EventOutputDataGPU& eventOutputGPU,
+ EventDataForScratchGPU& scratch,
+ ConditionsProducts const& conditions,
+ ConfigurationParameters const& configParameters,
+ cudaStream_t cudaStream) {
+ using digis_type = std::vector;
+ using dids_type = std::vector;
+ // accodring to the cpu setup //----> hardcoded
+ bool const gainSwitchUseMaxSampleEB = true;
+ // accodring to the cpu setup //----> hardcoded
+ bool const gainSwitchUseMaxSampleEE = false;
+
+ uint32_t const offsetForHashes = conditions.offsetForHashes;
+ uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis;
+ unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis;
+
+ //
+ // 1d preparation kernel
+ //
+ unsigned int nchannels_per_block = 32;
+ unsigned int threads_1d = 10 * nchannels_per_block;
+ unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 1 : (totalChannels * 10 + threads_1d - 1) / threads_1d;
+ int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES *
+ (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool));
+ kernel_prep_1d_and_initialize<<>>(
+ conditions.pulseShapes.values,
+ eventInputGPU.ebDigis.data,
+ eventInputGPU.ebDigis.ids,
+ eventInputGPU.eeDigis.data,
+ eventInputGPU.eeDigis.ids,
+ scratch.samples,
+ (SampleVector*)eventOutputGPU.amplitudesAll,
+ scratch.gainsNoise,
+ conditions.pedestals.mean_x1,
+ conditions.pedestals.mean_x12,
+ conditions.pedestals.rms_x12,
+ conditions.pedestals.mean_x6,
+ conditions.gainRatios.gain6Over1,
+ conditions.gainRatios.gain12Over6,
+ scratch.hasSwitchToGain6,
+ scratch.hasSwitchToGain1,
+ scratch.isSaturated,
+ eventOutputGPU.amplitude,
+ eventOutputGPU.chi2,
+ eventOutputGPU.pedestal,
+ eventOutputGPU.did,
+ eventOutputGPU.flags,
+ scratch.acState,
+ scratch.activeBXs,
+ offsetForHashes,
+ offsetForInputs,
+ gainSwitchUseMaxSampleEB,
+ gainSwitchUseMaxSampleEE,
+ totalChannels);
+ cudaCheck(cudaGetLastError());
+
+ //
+ // 2d preparation kernel
+ //
+ int blocks_2d = totalChannels;
+ dim3 threads_2d{10, 10};
+ kernel_prep_2d<<>>(scratch.gainsNoise,
+ eventInputGPU.ebDigis.ids,
+ eventInputGPU.eeDigis.ids,
+ conditions.pedestals.rms_x12,
+ conditions.pedestals.rms_x6,
+ conditions.pedestals.rms_x1,
+ conditions.gainRatios.gain12Over6,
+ conditions.gainRatios.gain6Over1,
+ conditions.samplesCorrelation.EBG12SamplesCorrelation,
+ conditions.samplesCorrelation.EBG6SamplesCorrelation,
+ conditions.samplesCorrelation.EBG1SamplesCorrelation,
+ conditions.samplesCorrelation.EEG12SamplesCorrelation,
+ conditions.samplesCorrelation.EEG6SamplesCorrelation,
+ conditions.samplesCorrelation.EEG1SamplesCorrelation,
+ scratch.noisecov,
+ scratch.pulse_matrix,
+ conditions.pulseShapes.values,
+ scratch.hasSwitchToGain6,
+ scratch.hasSwitchToGain1,
+ scratch.isSaturated,
+ offsetForHashes,
+ offsetForInputs);
+ cudaCheck(cudaGetLastError());
+
+ // run minimization kernels
+ v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
+
+ if (configParameters.shouldRunTimingComputation) {
+ //
+ // TODO: this guy can run concurrently with other kernels,
+ // there is no dependence on the order of execution
+ //
+ unsigned int threads_time_init = threads_1d;
+ unsigned int blocks_time_init = blocks_1d;
+ int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar);
+ kernel_time_computation_init<<>>(
+ eventInputGPU.ebDigis.data,
+ eventInputGPU.ebDigis.ids,
+ eventInputGPU.eeDigis.data,
+ eventInputGPU.eeDigis.ids,
+ conditions.pedestals.rms_x12,
+ conditions.pedestals.rms_x6,
+ conditions.pedestals.rms_x1,
+ conditions.pedestals.mean_x12,
+ conditions.pedestals.mean_x6,
+ conditions.pedestals.mean_x1,
+ conditions.gainRatios.gain12Over6,
+ conditions.gainRatios.gain6Over1,
+ scratch.sample_values,
+ scratch.sample_value_errors,
+ scratch.ampMaxError,
+ scratch.useless_sample_values,
+ scratch.pedestal_nums,
+ offsetForHashes,
+ offsetForInputs,
+ conditions.sampleMask.getEcalSampleMaskRecordEB(),
+ conditions.sampleMask.getEcalSampleMaskRecordEE(),
+ totalChannels);
+ cudaCheck(cudaGetLastError());
+
+ //
+ // TODO: small kernel only for EB. It needs to be checked if
+ /// fusing such small kernels is beneficial in here
+ //
+ // we are running only over EB digis
+ // therefore we need to create threads/blocks only for that
+ unsigned int const threadsFixMGPA = threads_1d;
+ unsigned int const blocksFixMGPA =
+ threadsFixMGPA > 10 * eventInputGPU.ebDigis.ndigis
+ ? 1
+ : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) / threadsFixMGPA;
+ kernel_time_compute_fixMGPAslew<<>>(
+ eventInputGPU.ebDigis.data,
+ eventInputGPU.eeDigis.data,
+ scratch.sample_values,
+ scratch.sample_value_errors,
+ scratch.useless_sample_values,
+ conditions.sampleMask.getEcalSampleMaskRecordEB(),
+ totalChannels,
+ offsetForInputs);
+ cudaCheck(cudaGetLastError());
+
+ //
+ //
+ //
+ int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar);
+ auto const threads_nullhypot = threads_1d;
+ auto const blocks_nullhypot = blocks_1d;
+ kernel_time_compute_nullhypot<<>>(
+ scratch.sample_values,
+ scratch.sample_value_errors,
+ scratch.useless_sample_values,
+ scratch.chi2sNullHypot,
+ scratch.sum0sNullHypot,
+ scratch.sumAAsNullHypot,
+ totalChannels);
+ cudaCheck(cudaGetLastError());
+
+ unsigned int nchannels_per_block_makeratio = 10;
+ unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
+ unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels
+ ? 1
+ : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
+ int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar);
+ kernel_time_compute_makeratio<<>>(
+ scratch.sample_values,
+ scratch.sample_value_errors,
+ eventInputGPU.ebDigis.ids,
+ eventInputGPU.eeDigis.ids,
+ scratch.useless_sample_values,
+ scratch.pedestal_nums,
+ configParameters.amplitudeFitParametersEB,
+ configParameters.amplitudeFitParametersEE,
+ configParameters.timeFitParametersEB,
+ configParameters.timeFitParametersEE,
+ scratch.sumAAsNullHypot,
+ scratch.sum0sNullHypot,
+ scratch.tMaxAlphaBetas,
+ scratch.tMaxErrorAlphaBetas,
+ scratch.accTimeMax,
+ scratch.accTimeWgt,
+ scratch.tcState,
+ configParameters.timeFitParametersSizeEB,
+ configParameters.timeFitParametersSizeEE,
+ configParameters.timeFitLimitsFirstEB,
+ configParameters.timeFitLimitsFirstEE,
+ configParameters.timeFitLimitsSecondEB,
+ configParameters.timeFitLimitsSecondEE,
+ totalChannels,
+ offsetForInputs);
+ cudaCheck(cudaGetLastError());
+
+ //
+ //
+ //
+ auto const threads_findamplchi2 = threads_1d;
+ auto const blocks_findamplchi2 = blocks_1d;
+ int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar);
+ kernel_time_compute_findamplchi2_and_finish<<>>(scratch.sample_values,
+ scratch.sample_value_errors,
+ eventInputGPU.ebDigis.ids,
+ eventInputGPU.eeDigis.ids,
+ scratch.useless_sample_values,
+ scratch.tMaxAlphaBetas,
+ scratch.tMaxErrorAlphaBetas,
+ scratch.accTimeMax,
+ scratch.accTimeWgt,
+ configParameters.amplitudeFitParametersEB,
+ configParameters.amplitudeFitParametersEE,
+ scratch.sumAAsNullHypot,
+ scratch.sum0sNullHypot,
+ scratch.chi2sNullHypot,
+ scratch.tcState,
+ scratch.ampMaxAlphaBeta,
+ scratch.ampMaxError,
+ scratch.timeMax,
+ scratch.timeError,
+ totalChannels,
+ offsetForInputs);
+ cudaCheck(cudaGetLastError());
+
+ //
+ //
+ //
+ auto const threads_timecorr = 32;
+ auto const blocks_timecorr =
+ threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr;
+ kernel_time_correction_and_finalize<<>>(
+ eventOutputGPU.amplitude,
+ eventInputGPU.ebDigis.data,
+ eventInputGPU.ebDigis.ids,
+ eventInputGPU.eeDigis.data,
+ eventInputGPU.eeDigis.ids,
+ conditions.timeBiasCorrections.EBTimeCorrAmplitudeBins,
+ conditions.timeBiasCorrections.EETimeCorrAmplitudeBins,
+ conditions.timeBiasCorrections.EBTimeCorrShiftBins,
+ conditions.timeBiasCorrections.EETimeCorrShiftBins,
+ scratch.timeMax,
+ scratch.timeError,
+ conditions.pedestals.rms_x12,
+ conditions.timeCalibConstants.values,
+ eventOutputGPU.jitter,
+ eventOutputGPU.jitterError,
+ eventOutputGPU.flags,
+ conditions.timeBiasCorrections.EBTimeCorrAmplitudeBinsSize,
+ conditions.timeBiasCorrections.EETimeCorrAmplitudeBinsSize,
+ configParameters.timeConstantTermEB,
+ configParameters.timeConstantTermEE,
+ conditions.timeOffsetConstant.getEBValue(),
+ conditions.timeOffsetConstant.getEEValue(),
+ configParameters.timeNconstEB,
+ configParameters.timeNconstEE,
+ configParameters.amplitudeThreshEB,
+ configParameters.amplitudeThreshEE,
+ configParameters.outOfTimeThreshG12pEB,
+ configParameters.outOfTimeThreshG12pEE,
+ configParameters.outOfTimeThreshG12mEB,
+ configParameters.outOfTimeThreshG12mEE,
+ configParameters.outOfTimeThreshG61pEB,
+ configParameters.outOfTimeThreshG61pEE,
+ configParameters.outOfTimeThreshG61mEB,
+ configParameters.outOfTimeThreshG61mEE,
+ offsetForHashes,
+ offsetForInputs,
+ totalChannels);
+ cudaCheck(cudaGetLastError());
+ }
+
+ /*
+ cudaEventRecord(end_event, 0);
+ cudaEventSynchronize(end_event);
+ float ms;
+ cudaEventElapsedTime(&ms, start_event, end_event);
+ std::cout << "elapsed time = " << ms << std::endl;
+ */
+ }
+
+ } // namespace multifit
+} // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h
new file mode 100644
index 0000000000000..93923e63be632
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h
@@ -0,0 +1,23 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h
+
+#include
+
+#include
+
+#include "DeclsForKernels.h"
+
+namespace ecal {
+ namespace multifit {
+
+ void entryPoint(EventInputDataGPU const&,
+ EventOutputDataGPU&,
+ EventDataForScratchGPU&,
+ ConditionsProducts const&,
+ ConfigurationParameters const&,
+ cudaStream_t);
+
+ } // namespace multifit
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
index cdf21e3240954..5bd54787ee155 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc
@@ -1,44 +1,35 @@
-// framework
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
-//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-// algorithm specific
-#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
-#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
-
#include
-#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
-
-#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
#include "CondFormats/DataRecord/interface/EcalGainRatiosRcd.h"
-#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h"
#include "CondFormats/DataRecord/interface/EcalPulseCovariancesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h"
+#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h"
#include "CondFormats/DataRecord/interface/EcalSamplesCorrelationRcd.h"
#include "CondFormats/DataRecord/interface/EcalTimeBiasCorrectionsRcd.h"
#include "CondFormats/DataRecord/interface/EcalTimeCalibConstantsRcd.h"
#include "CondFormats/DataRecord/interface/EcalTimeOffsetConstantRcd.h"
-#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h"
-
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
+#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h"
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h"
#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EcalUncalibRecHitMultiFitAlgo_gpu_new.h"
class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer {
public:
@@ -290,7 +281,7 @@ void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event,
if ((neb_ + nee_) > maxNumberHits_) {
edm::LogError("EcalUncalibRecHitProducerGPU") << "max number of channels exceeded. See options 'maxNumberHits' ";
}
-
+
// conditions
setup.get().get(pedestalsHandle_);
setup.get().get(gainRatiosHandle_);
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h
new file mode 100644
index 0000000000000..bbf9cb0dbb5c9
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h
@@ -0,0 +1,49 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
+#define RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
+
+#include
+
+#include
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h"
+
+namespace ecal {
+ namespace multifit {
+
+ constexpr int SampleVectorSize = 10;
+ constexpr int FullSampleVectorSize = 19;
+ constexpr int PulseVectorSize = 12;
+ constexpr int NGains = 3;
+
+ using data_type = ::ecal::reco::ComputationScalarType;
+
+ typedef Eigen::Matrix PulseMatrixType;
+ typedef Eigen::Matrix BXVectorType;
+ using SampleMatrixD = Eigen::Matrix;
+
+ typedef Eigen::Matrix SampleVector;
+ typedef Eigen::Matrix FullSampleVector;
+ typedef Eigen::Matrix PulseVector;
+ typedef Eigen::Matrix BXVector;
+ typedef Eigen::Matrix SampleGainVector;
+ typedef Eigen::Matrix SampleMatrix;
+ typedef Eigen::Matrix FullSampleMatrix;
+ typedef Eigen::Matrix PulseMatrix;
+ typedef Eigen::Matrix
+ SamplePulseMatrix;
+ typedef Eigen::LLT SampleDecompLLT;
+ typedef Eigen::LLT SampleDecompLLTD;
+ typedef Eigen::LLT PulseDecompLLT;
+ typedef Eigen::LDLT PulseDecompLDLT;
+
+ typedef Eigen::Matrix SingleMatrix;
+ typedef Eigen::Matrix SingleVector;
+
+ typedef std::array SampleMatrixGainArray;
+
+ using PermutationMatrix = Eigen::PermutationMatrix;
+
+ } // namespace multifit
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu
new file mode 100644
index 0000000000000..00c61e04e1596
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu
@@ -0,0 +1,323 @@
+#include "DataFormats/EcalDetId/interface/EBDetId.h"
+#include "DataFormats/EcalDetId/interface/EEDetId.h"
+
+#include "KernelHelpers.h"
+
+namespace ecal {
+ namespace reconstruction {
+
+ namespace internal {
+
+ namespace barrel {
+
+ __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; }
+
+ __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; }
+
+ __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; }
+
+ __device__ int dccFromSm(int ism) {
+ int iz = 1;
+ if (ism > 18)
+ iz = -1;
+ if (iz == -1)
+ ism -= 18;
+ int idcc = 9 + ism;
+ if (iz == +1)
+ idcc += 18;
+ return idcc;
+ }
+
+ __device__ int sm(int ieta, int iphi) {
+ int iz = 1;
+ if (ieta < 0)
+ iz = -1;
+ ieta *= iz;
+ int iphi_ = iphi;
+ if (iphi_ > 360)
+ iphi_ -= 360;
+ int ism = (iphi_ - 1) / 20 + 1;
+ if (iz == -1)
+ ism += 18;
+ return ism;
+ }
+
+ __device__ int dcc(int ieta, int iphi) {
+ int ism = sm(ieta, iphi);
+ return dccFromSm(ism);
+ }
+
+ //
+ // ---- why on hell things are so complex and not simple ???
+ //
+
+ __device__ int lm_channel(int iX, int iY) {
+ static const int idx_[] = {
+ // clang-format off
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+ 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 3
+ 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 2
+ 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9, // 1
+ 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9 // 0
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+ // clang-format on
+ };
+
+ int il, ic, ii;
+ const int iym = 4;
+ const int ixm = 17;
+ int iX_ = iX + 1;
+ int iY_ = iY + 1;
+ il = iym - iY_;
+ ic = iX_ - 1;
+ ii = il * ixm + ic;
+ if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int))) {
+ return -1;
+ };
+ return idx_[ii];
+ }
+
+ __device__ int localCoord_x(int ieta, int iphi) {
+ int iz = 1;
+ if (ieta < 0) {
+ iz = -1;
+ }
+ ieta *= iz;
+ // int iphi_ = iphi;
+ // if (iphi_ > 360) {
+ // iphi_ -= 360;
+ // }
+ int ix = ieta - 1;
+ // int iy = (iphi_ - 1) % 20;
+ // if (iz == -1) {
+ // iy = 19 - iy;
+ // }
+
+ return ix;
+ }
+
+ __device__ int localCoord_y(int ieta, int iphi) {
+ int iz = 1;
+ if (ieta < 0) {
+ iz = -1;
+ }
+ // ieta *= iz;
+ int iphi_ = iphi;
+ if (iphi_ > 360) {
+ iphi_ -= 360;
+ }
+ // int ix = ieta - 1;
+ int iy = (iphi_ - 1) % 20;
+ if (iz == -1) {
+ iy = 19 - iy;
+ }
+
+ return iy;
+ }
+
+ __device__ int lmmod(int ieta, int iphi) {
+ int ix = localCoord_x(ieta, iphi);
+ int iy = localCoord_y(ieta, iphi);
+
+ return lm_channel(ix / 5, iy / 5);
+ }
+
+ __device__ int side(int ieta, int iphi) {
+ int ilmmod = lmmod(ieta, iphi);
+ return (ilmmod % 2 == 0) ? 1 : 0;
+ }
+
+ } // namespace barrel
+
+ } // namespace internal
+
+ __device__ uint32_t hashedIndexEB(uint32_t id) {
+ using namespace internal::barrel;
+ return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
+ }
+
+ //
+ // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc
+ // function: "lmr"
+
+ __device__ int laser_monitoring_region_EB(uint32_t id) {
+ using namespace internal::barrel;
+
+ int ieta;
+ if (positiveZ(id)) {
+ ieta = ietaAbs(id);
+ } else {
+ ieta = -ietaAbs(id);
+ }
+
+ int idcc = dcc(ieta, (int)(iphi(id)));
+ int ism = idcc - 9;
+
+ int iside = side(ieta, (int)(iphi(id)));
+ // int iside = positiveZ(id) ? 1 : 0;
+
+ return (1 + 2 * (ism - 1) + iside);
+ // return ieta;
+ // return (int) (iphi(id));
+ // return idcc;
+ // return iside;
+ }
+
+ namespace internal {
+
+ namespace endcap {
+
+ __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; }
+
+ __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; }
+
+ __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; }
+
+ // these constants come from EE Det Id
+ __constant__ const unsigned short kxf[] = {
+ 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21,
+ 51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51,
+ 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4,
+ 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62,
+ 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, 1, 59, 1, 58, 4, 56, 4, 51, 4,
+ 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51,
+ 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21,
+ 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51};
+
+ __constant__ const unsigned short kdi[] = {
+ 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270,
+ 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821,
+ 858, 895, 932, 969, 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500,
+ 1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265,
+ 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030,
+ 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701,
+ 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378,
+ 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149,
+ 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908,
+ 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577,
+ 6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104,
+ 7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
+
+ __device__ int quadrant(int iX, int iY) {
+ bool near = iX >= 11;
+ bool far = !near;
+ bool top = iY >= 11;
+ bool bot = !top;
+
+ int iquad = 0;
+ if (near && top)
+ iquad = 1;
+ if (far && top)
+ iquad = 2;
+ if (far && bot)
+ iquad = 3;
+ if (near && bot)
+ iquad = 4;
+
+ return iquad;
+ }
+
+ __device__ int sector(int iX, int iY) {
+ // Y (towards the surface)
+ // T
+ // |
+ // |
+ // |
+ // o---------| X (towards center of LHC)
+ //
+ static const int idx_[] = {
+ // clang-format off
+ // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, // 20
+ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, // 19
+ 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 8, 0, 0, 0, // 18
+ 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 0, 0, // 17
+ 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 8, 0, // 16
+ 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, 9, 9, 9, 8, 8, 8, 8, 8, 0, // 15
+ 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, 9, 9, 8, 8, 8, 8, 8, 8, 0, // 14
+ 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, // 13
+ 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 8, 8, 8, 8, 8, 8, 7, 7, // 12
+ 3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 8, 7, 7, 7, 7, 7, 7, 7, // 11
+ 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, // 10
+ 3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, 6, 6, 7, 7, 7, 7, 7, 7, 7, // 9
+ 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7, // 8
+ 0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 0, // 7
+ 0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 0, // 6
+ 0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 0, // 5
+ 0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 0, // 4
+ 0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 0, 0, 0, // 3
+ 0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 0, 0, // 2
+ 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0 // 1
+ // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+ // clang-format on
+ };
+
+ int iym, ixm, il, ic, ii;
+ iym = 20;
+ ixm = 20;
+ int iX_ = iX;
+ int iY_ = iY;
+ il = iym - iY_;
+ ic = iX_ - 1;
+ ii = il * ixm + ic;
+
+ if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) {
+ return -1;
+ };
+ return idx_[ii];
+ }
+
+ } // namespace endcap
+
+ } // namespace internal
+
+ __device__ uint32_t hashedIndexEE(uint32_t id) {
+ using namespace internal::endcap;
+
+ const uint32_t jx(ix(id));
+ const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50);
+ return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]);
+ }
+
+ //
+ // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc
+ // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc
+ //
+
+ __device__ int laser_monitoring_region_EE(uint32_t id) {
+ using namespace internal::endcap;
+
+ // SuperCrysCoord
+ uint32_t iX = (ix(id) - 1) / 5 + 1;
+ uint32_t iY = (iy(id) - 1) / 5 + 1;
+
+ // Correct convention
+ // * @param iz iz/zside index: -1 for EE-, +1 for EE+
+ // https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71
+ // zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63
+ //
+ int iz = positiveZ(id) ? 1 : -1;
+
+ int iquad = quadrant(iX, iY);
+ int isect = sector(iX, iY);
+ if (isect < 0)
+ return -1;
+
+ int ilmr = 0;
+ ilmr = isect - 6;
+ if (ilmr <= 0)
+ ilmr += 9;
+ if (ilmr == 9)
+ ilmr++;
+ if (ilmr == 8 && iquad == 4)
+ ilmr++;
+ if (iz == +1)
+ ilmr += 72;
+ else
+ ilmr += 82;
+
+ return ilmr;
+ }
+
+ } // namespace reconstruction
+} // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
new file mode 100644
index 0000000000000..103087c3517f9
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
@@ -0,0 +1,452 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
+#define RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
+
+#include
+#include
+#include
+
+#include
+
+namespace ecal {
+ namespace multifit {
+
+ template
+ using ColMajorMatrix = Eigen::Matrix;
+
+ template
+ using RowMajorMatrix = Eigen::Matrix;
+
+ template
+ using ColumnVector = Eigen::Matrix;
+
+ template
+ using RowVector = Eigen::Matrix;
+
+ // FIXME: provide specialization for Row Major layout
+ template
+ struct MapSymM {
+ using type = T;
+ using base_type = typename std::remove_const::type;
+
+ static constexpr int total = Stride * (Stride + 1) / 2;
+ static constexpr int stride = Stride;
+ T* data;
+
+ __forceinline__ __device__ MapSymM(T* data) : data{data} {}
+
+ __forceinline__ __device__ T const& operator()(int const row, int const col) const {
+ auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
+ auto const index = total - tmp + row - col;
+ return data[index];
+ }
+
+ template
+ __forceinline__ __device__ typename std::enable_if::value, base_type>::type&
+ operator()(int const row, int const col) {
+ auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
+ auto const index = total - tmp + row - col;
+ return data[index];
+ }
+ };
+
+ // FIXME: either use/modify/improve eigen or make this more generic
+ // this is a map for a pulse matrix to building a 2d matrix for each channel
+ // and hide indexing
+ template
+ struct MapMForPM {
+ using type = T;
+ using base_type = typename std::remove_cv::type;
+
+ type* data;
+ __forceinline__ __device__ MapMForPM(type* data) : data{data} {}
+
+ __forceinline__ __device__ base_type operator()(int const row, int const col) const {
+ auto const index = 2 - col + row;
+ return index >= 0 ? data[index] : 0;
+ }
+ };
+
+ // simple/trivial cholesky decomposition impl
+ template
+ __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
+ auto const sqrtm_0_0 = std::sqrt(M(0, 0));
+ L(0, 0) = sqrtm_0_0;
+ using T = typename MatrixType1::base_type;
+
+#pragma unroll
+ for (int i = 1; i < MatrixType1::stride; i++) {
+ T sumsq{0};
+ for (int j = 0; j < i; j++) {
+ T sumsq2{0};
+ auto const m_i_j = M(i, j);
+ for (int k = 0; k < j; ++k)
+ sumsq2 += L(i, k) * L(j, k);
+
+ auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+ L(i, j) = value_i_j;
+
+ sumsq += value_i_j * value_i_j;
+ }
+
+ auto const l_i_i = std::sqrt(M(i, i) - sumsq);
+ L(i, i) = l_i_i;
+ }
+ }
+
+ template
+ __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
+ auto const sqrtm_0_0 = std::sqrt(M(0, 0));
+ L(0, 0) = sqrtm_0_0;
+ using T = typename MatrixType1::base_type;
+
+ for (int i = 1; i < N; i++) {
+ T sumsq{0};
+ for (int j = 0; j < i; j++) {
+ T sumsq2{0};
+ auto const m_i_j = M(i, j);
+ for (int k = 0; k < j; ++k)
+ sumsq2 += L(i, k) * L(j, k);
+
+ auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+ L(i, j) = value_i_j;
+
+ sumsq += value_i_j * value_i_j;
+ }
+
+ auto const l_i_i = std::sqrt(M(i, i) - sumsq);
+ L(i, i) = l_i_i;
+ }
+ }
+
+ template
+ __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
+ MatrixType1& L,
+ MatrixType2 const& M,
+ float b[MatrixType1::stride],
+ VectorType const& Atb,
+ int const N,
+ ColumnVector const& pulseOffsets) {
+ auto const real_0 = pulseOffsets(0);
+ auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0));
+ L(0, 0) = sqrtm_0_0;
+ using T = typename MatrixType1::base_type;
+ b[0] = Atb(real_0) / sqrtm_0_0;
+
+ for (int i = 1; i < N; i++) {
+ auto const i_real = pulseOffsets(i);
+ T sumsq{0};
+ T total = 0;
+ auto const atb = Atb(i_real);
+ for (int j = 0; j < i; j++) {
+ auto const j_real = pulseOffsets(j);
+ T sumsq2{0};
+ auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
+ for (int k = 0; k < j; ++k)
+ sumsq2 += L(i, k) * L(j, k);
+
+ auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+ L(i, j) = value_i_j;
+
+ sumsq += value_i_j * value_i_j;
+ total += value_i_j * b[j];
+ }
+
+ auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
+ L(i, i) = l_i_i;
+ b[i] = (atb - total) / l_i_i;
+ }
+ }
+
+ template
+ __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
+ MatrixType1& L,
+ MatrixType2 const& M,
+ float b[MatrixType1::stride],
+ VectorType const& Atb,
+ int const N,
+ ColumnVector const& pulseOffsets) {
+ using T = typename MatrixType1::base_type;
+ auto const i = N - 1;
+ auto const i_real = pulseOffsets(i);
+ T sumsq{0};
+ T total = 0;
+ for (int j = 0; j < i; j++) {
+ auto const j_real = pulseOffsets(j);
+ T sumsq2{0};
+ auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
+ for (int k = 0; k < j; ++k)
+ sumsq2 += L(i, k) * L(j, k);
+
+ auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
+ L(i, j) = value_i_j;
+ sumsq += value_i_j * value_i_j;
+
+ total += value_i_j * b[j];
+ }
+
+ auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
+ L(i, i) = l_i_i;
+ b[i] = (Atb(i_real) - total) / l_i_i;
+ }
+
+ template
+ __device__ void solve_forward_subst_matrix(MatrixType1& A,
+ MatrixType2 const& pulseMatrixView,
+ MatrixType3 const& matrixL) {
+ // FIXME: this assumes pulses are on columns and samples on rows
+ constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
+ constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
+
+#pragma unroll
+ for (int icol = 0; icol < NPULSES; icol++) {
+ float reg_b[NSAMPLES];
+ float reg_L[NSAMPLES];
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+ for (int i = 0; i < NSAMPLES; i++) {
+ reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
+ reg_L[i] = matrixL(i, 0);
+ }
+
+ // compute x0 and store it
+ auto x_prev = reg_b[0] / reg_L[0];
+ A(0, icol) = x_prev;
+
+// iterate
+#pragma unroll
+ for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+ for (int counter = iL; counter < NSAMPLES; counter++)
+ reg_b[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+ for (int counter = iL; counter < NSAMPLES; counter++)
+ reg_L[counter] = matrixL(counter, iL);
+
+ // compute the next x for M(iL, icol)
+ x_prev = reg_b[iL] / reg_L[iL];
+
+ // store the result value
+ A(iL, icol) = x_prev;
+ }
+ }
+ }
+
+ template
+ __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
+ MatrixType1 inputAmplitudesView,
+ MatrixType2 matrixL) {
+ constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
+
+ float reg_b_tmp[NSAMPLES];
+ float reg_L[NSAMPLES];
+
+// preload a column and load column 0 of cholesky
+#pragma unroll
+ for (int i = 0; i < NSAMPLES; i++) {
+ reg_b_tmp[i] = inputAmplitudesView(i);
+ reg_L[i] = matrixL(i, 0);
+ }
+
+ // compute x0 and store it
+ auto x_prev = reg_b_tmp[0] / reg_L[0];
+ reg_b[0] = x_prev;
+
+// iterate
+#pragma unroll
+ for (int iL = 1; iL < NSAMPLES; iL++) {
+// update accum
+#pragma unroll
+ for (int counter = iL; counter < NSAMPLES; counter++)
+ reg_b_tmp[counter] -= x_prev * reg_L[counter];
+
+// load the next column of cholesky
+#pragma unroll
+ for (int counter = iL; counter < NSAMPLES; counter++)
+ reg_L[counter] = matrixL(counter, iL);
+
+ // compute the next x for M(iL, icol)
+ x_prev = reg_b_tmp[iL] / reg_L[iL];
+
+ // store the result value
+ reg_b[iL] = x_prev;
+ }
+ }
+
+ // TODO: add active bxs
+ template
+ __device__ void fnnls(MatrixType const& AtA,
+ VectorType const& Atb,
+ VectorType& solution,
+ int& npassive,
+ ColumnVector& pulseOffsets,
+ MapSymM& matrixL,
+ double const eps,
+ int const maxIterations) {
+ // constants
+ constexpr auto NPULSES = VectorType::RowsAtCompileTime;
+
+ // to keep track of where to terminate if converged
+ Eigen::Index w_max_idx_prev = 0;
+ float w_max_prev = 0;
+ auto eps_to_use = eps;
+ bool recompute = false;
+
+ // used throughout
+ VectorType s;
+ float reg_b[NPULSES];
+ //float matrixLStorage[MapSymM::total];
+ //MapSymM matrixL{matrixLStorage};
+
+ int iter = 0;
+ while (true) {
+ if (iter > 0 || npassive == 0) {
+ auto const nactive = NPULSES - npassive;
+ // exit if there are no more pulses to constrain
+ if (nactive == 0)
+ break;
+
+ // compute the gradient
+ //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive);
+ Eigen::Index w_max_idx;
+ float w_max = -std::numeric_limits::max();
+ for (int icol = npassive; icol < NPULSES; icol++) {
+ auto const icol_real = pulseOffsets(icol);
+ auto const atb = Atb(icol_real);
+ float sum = 0;
+#pragma unroll
+ for (int counter = 0; counter < NPULSES; counter++)
+ sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter)
+ : AtA(icol_real, counter) * solution(counter);
+
+ auto const w = atb - sum;
+ if (w > w_max) {
+ w_max = w;
+ w_max_idx = icol - npassive;
+ }
+ }
+
+ // check for convergence
+ if (w_max < eps_to_use || w_max_idx == w_max_idx_prev && w_max == w_max_prev)
+ break;
+
+ if (iter >= maxIterations)
+ break;
+
+ w_max_prev = w_max;
+ w_max_idx_prev = w_max_idx;
+
+ // move index to the right part of the vector
+ w_max_idx += npassive;
+
+ Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx));
+ ++npassive;
+ }
+
+ // inner loop
+ while (true) {
+ if (npassive == 0)
+ break;
+
+ //s.head(npassive)
+ //auto const& matrixL =
+ // AtA.topLeftCorner(npassive, npassive)
+ // .llt().matrixL();
+ //.solve(Atb.head(npassive));
+ if (recompute || iter == 0)
+ compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+ else
+ update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets);
+
+ // run backward substituion
+ s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1);
+ for (int i = npassive - 2; i >= 0; --i) {
+ float total = 0;
+ for (int j = i + 1; j < npassive; j++)
+ total += matrixL(j, i) * s(j);
+
+ s(i) = (reg_b[i] - total) / matrixL(i, i);
+ }
+
+ // done if solution values are all positive
+ bool hasNegative = false;
+ bool hasNans = false;
+ for (int counter = 0; counter < npassive; counter++) {
+ auto const s_ii = s(counter);
+ hasNegative |= s_ii <= 0;
+ hasNans |= std::isnan(s_ii);
+ }
+
+ // FIXME: temporary solution. my cholesky impl is unstable yielding nans
+ // this check removes nans - do not accept solution unless all values
+ // are stable
+ if (hasNans)
+ break;
+ if (!hasNegative) {
+ for (int i = 0; i < npassive; i++) {
+ auto const i_real = pulseOffsets(i);
+ solution(i_real) = s(i);
+ }
+ //solution.head(npassive) = s.head(npassive);
+ recompute = false;
+ break;
+ }
+
+ // there were negative values -> have to recompute the whole decomp
+ recompute = true;
+
+ auto alpha = std::numeric_limits::max();
+ Eigen::Index alpha_idx = 0, alpha_idx_real = 0;
+ for (int i = 0; i < npassive; i++) {
+ if (s[i] <= 0.) {
+ auto const i_real = pulseOffsets(i);
+ auto const ratio = solution[i_real] / (solution[i_real] - s[i]);
+ if (ratio < alpha) {
+ alpha = ratio;
+ alpha_idx = i;
+ alpha_idx_real = i_real;
+ }
+ }
+ }
+
+ // upadte solution
+ for (int i = 0; i < npassive; i++) {
+ auto const i_real = pulseOffsets(i);
+ solution(i_real) += alpha * (s(i) - solution(i_real));
+ }
+ //solution.head(npassive) += alpha *
+ // (s.head(npassive) - solution.head(npassive));
+ solution[alpha_idx_real] = 0;
+ --npassive;
+
+ Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx));
+ }
+
+ // as in cpu
+ ++iter;
+ if (iter % 16 == 0)
+ eps_to_use *= 2;
+ }
+ }
+
+ } // namespace multifit
+} // namespace ecal
+
+namespace ecal {
+ namespace reconstruction {
+
+ __device__ uint32_t hashedIndexEB(uint32_t id);
+
+ __device__ uint32_t hashedIndexEE(uint32_t id);
+
+ __device__ int laser_monitoring_region_EB(uint32_t id);
+
+ __device__ int laser_monitoring_region_EE(uint32_t id);
+
+ } // namespace reconstruction
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu
new file mode 100644
index 0000000000000..ddb9334826f40
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu
@@ -0,0 +1,1131 @@
+#include
+#include
+
+#include
+
+#include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
+#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "Common.h"
+#include "TimeComputationKernels.h"
+#include "KernelHelpers.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+ namespace multifit {
+
+ __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) {
+ return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
+ }
+
+ __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ bool const* useless_sample_values,
+ SampleVector::Scalar* chi2s,
+ SampleVector::Scalar* sum0s,
+ SampleVector::Scalar* sumAAs,
+ const int nchannels) {
+ using ScalarType = SampleVector::Scalar;
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ int tx = threadIdx.x + blockDim.x * blockIdx.x;
+ int ltx = threadIdx.x;
+ int ch = tx / nsamples;
+ int nchannels_per_block = blockDim.x / nsamples;
+
+ // TODO: make sure that this branch plays nicely with __syncthreads inside
+ // can there be a deadlock even if the thread is inactive
+ if (ch < nchannels) {
+ //
+ int sample = tx % nsamples;
+
+ // shared mem inits
+ extern __shared__ char sdata[];
+ char* s_sum0 = sdata;
+ SampleVector::Scalar* s_sum1 = reinterpret_cast(s_sum0 + nchannels_per_block * nsamples);
+ SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples;
+ SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples;
+
+ // TODO make sure no div by 0
+ const auto inv_error =
+ useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]);
+ const auto sample_value = sample_values[tx];
+ s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
+ s_sum1[ltx] = inv_error;
+ s_sumA[ltx] = sample_value * inv_error;
+ s_sumAA[ltx] = sample_value * sample_value * inv_error;
+ __syncthreads();
+
+ // 5 threads for [0, 4] samples
+ if (sample < 5) {
+ s_sum0[ltx] += s_sum0[ltx + 5];
+ s_sum1[ltx] += s_sum1[ltx + 5];
+ s_sumA[ltx] += s_sumA[ltx + 5];
+ s_sumAA[ltx] += s_sumAA[ltx + 5];
+ }
+ __syncthreads();
+
+ if (sample < 2) {
+ // note double counting of sample 3
+ s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
+ s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
+ s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
+ s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
+ }
+ __syncthreads();
+
+ if (sample == 0) {
+ // note, subtract to remove the double counting of sample == 3
+ //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3];
+ //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3];
+ //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3];
+ //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3];
+ const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
+ const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
+ const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
+ const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
+ const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast(0);
+ chi2s[ch] = chi2;
+ sum0s[ch] = sum0;
+ sumAAs[ch] = sumAA;
+
+#ifdef DEBUG_TC_NULLHYPOT
+ if (ch == 0) {
+ printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast(sum0), sumAA);
+ }
+#endif
+ }
+ }
+ }
+
+ constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
+ constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
+
+ //#define DEBUG_TC_MAKERATIO
+ //
+ // launch ctx parameters are
+ // 45 threads per channel, X channels per block, Y blocks
+ // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+ // TODO: it might be much beter to use 32 threads per channel instead of 45
+ // to simplify the synchronization
+ //
+ __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ uint32_t const* dids_eb,
+ uint32_t const* dids_ee,
+ bool const* useless_sample_values,
+ char const* pedestal_nums,
+ ConfigurationParameters::type const* amplitudeFitParametersEB,
+ ConfigurationParameters::type const* amplitudeFitParametersEE,
+ ConfigurationParameters::type const* timeFitParametersEB,
+ ConfigurationParameters::type const* timeFitParametersEE,
+ SampleVector::Scalar const* sumAAsNullHypot,
+ SampleVector::Scalar const* sum0sNullHypot,
+ SampleVector::Scalar* tMaxAlphaBetas,
+ SampleVector::Scalar* tMaxErrorAlphaBetas,
+ SampleVector::Scalar* g_accTimeMax,
+ SampleVector::Scalar* g_accTimeWgt,
+ TimeComputationState* g_state,
+ unsigned const int timeFitParameters_sizeEB,
+ unsigned const int timeFitParameters_sizeEE,
+ ConfigurationParameters::type const timeFitLimits_firstEB,
+ ConfigurationParameters::type const timeFitLimits_firstEE,
+ ConfigurationParameters::type const timeFitLimits_secondEB,
+ ConfigurationParameters::type const timeFitLimits_secondEE,
+ const int nchannels,
+ uint32_t const offsetForInputs) {
+ using ScalarType = SampleVector::Scalar;
+
+ // constants
+ constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ const int gtx = threadIdx.x + blockDim.x * blockIdx.x;
+ const int ch = gtx / nthreads_per_channel;
+ const int ltx = threadIdx.x % nthreads_per_channel;
+ const int ch_start = ch * nsamples;
+ const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+ const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+ // rmeove inactive threads
+ // TODO: need to understand if this is 100% safe in presence of syncthreads
+ if (ch >= nchannels)
+ return;
+
+ const auto did = DetId{dids[inputCh]};
+ const auto isBarrel = did.subdetId() == EcalBarrel;
+ const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+ const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE;
+ const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE;
+ const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
+ const auto timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
+
+ extern __shared__ char smem[];
+ ScalarType* shr_chi2s = reinterpret_cast(smem);
+ ScalarType* shr_time_wgt = shr_chi2s + blockDim.x;
+ ScalarType* shr_time_max = shr_time_wgt + blockDim.x;
+ ScalarType* shrTimeMax = shr_time_max + blockDim.x;
+ ScalarType* shrTimeWgt = shrTimeMax + blockDim.x;
+
+ // map tx -> (sample_i, sample_j)
+ int sample_i, sample_j = 0;
+ if (ltx >= 0 && ltx <= 8) {
+ sample_i = 0;
+ sample_j = 1 + ltx;
+ } else if (ltx <= 16) {
+ sample_i = 1;
+ sample_j = 2 + ltx - 9;
+ } else if (ltx <= 23) {
+ sample_i = 2;
+ sample_j = 3 + ltx - 17;
+ } else if (ltx <= 29) {
+ sample_i = 3;
+ sample_j = 4 + ltx - 24;
+ } else if (ltx <= 34) {
+ sample_i = 4;
+ sample_j = 5 + ltx - 30;
+ } else if (ltx <= 38) {
+ sample_i = 5;
+ sample_j = 6 + ltx - 35;
+ } else if (ltx <= 41) {
+ sample_i = 6;
+ sample_j = 7 + ltx - 39;
+ } else if (ltx <= 43) {
+ sample_i = 7;
+ sample_j = 8 + ltx - 42;
+ } else if (ltx <= 44) {
+ sample_i = 8;
+ sample_j = 9;
+ } else
+ assert(false);
+
+ const auto tx_i = ch_start + sample_i;
+ const auto tx_j = ch_start + sample_j;
+
+ //
+ // note, given the way we partition the block, with 45 threads per channel
+ // we will end up with inactive threads which need to be dragged along
+ // through the synching point
+ //
+ /*
+ bool const condToExit = ch >= nchannels
+ ? true
+ : useless_sample_values[tx_i]
+ || useless_sample_values[tx_j]
+ || sample_values[tx_i]<=1 || sample_values[tx_j]<=1;
+ */
+ bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
+ sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
+
+ //
+ // see cpu implementation for explanation
+ //
+ ScalarType chi2 = std::numeric_limits::max();
+ ScalarType tmax = 0;
+ ScalarType tmaxerr = 0;
+ shrTimeMax[threadIdx.x] = 0;
+ shrTimeWgt[threadIdx.x] = 0;
+ bool internalCondForSkipping1 = true;
+ bool internalCondForSkipping2 = true;
+ if (!condForUselessSamples) {
+ const auto rtmp = sample_values[tx_i] / sample_values[tx_j];
+ const auto invampl_i = 1.0 / sample_values[tx_i];
+ const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
+ const auto invampl_j = 1.0 / sample_values[tx_j];
+ const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
+ const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
+ auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
+ // TODO non-divergent branch for a block if each block has 1 channel
+ // otherwise non-divergent for groups of 45 threads
+ // at this point, pedestal_nums[ch] can be either 0, 1 or 2
+ if (pedestal_nums[ch] == 2)
+ err2 *= err2 * 0.5;
+ const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
+ const auto total_error = std::sqrt(err1 + err2 + err3);
+
+ const auto alpha = amplitudeFitParameters[0];
+ const auto beta = amplitudeFitParameters[1];
+ const auto alphabeta = alpha * beta;
+ const auto invalphabeta = 1.0 / alphabeta;
+
+ // variables instead of a struct
+ const auto ratio_index = sample_i;
+ const auto ratio_step = sample_j - sample_i;
+ const auto ratio_value = rtmp;
+ const auto ratio_error = total_error;
+
+ const auto rlim_i_j = fast_expf(static_cast(sample_j - sample_i) / beta) - 0.001;
+ internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j);
+ if (!internalCondForSkipping1) {
+ //
+ // precompute.
+ // in cpu version this was done conditionally
+ // however easier to do it here (precompute) and then just filter out
+ // if not needed
+ //
+ const auto l_timeFitLimits_first = timeFitLimits_first;
+ const auto l_timeFitLimits_second = timeFitLimits_second;
+ if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
+ const auto time_max_i = static_cast(ratio_index);
+ auto u = timeFitParameters[timeFitParameters_size - 1];
+#pragma unroll
+ for (int k = timeFitParameters_size - 2; k >= 0; k--)
+ u = u * ratio_value + timeFitParameters[k];
+
+ auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
+ for (int k = timeFitParameters_size - 2; k >= 1; k--)
+ du = du * ratio_value + k * timeFitParameters[k];
+
+ const auto error2 = ratio_error * ratio_error * du * du;
+ const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast(0);
+ const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast(0);
+
+ // store into shared mem
+ // note, this name is essentially identical to the one used
+ // below.
+ shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0;
+ shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0;
+ } else {
+ shrTimeMax[threadIdx.x] = 0;
+ shrTimeWgt[threadIdx.x] = 0;
+ }
+
+ // continue with ratios
+ const auto stepOverBeta = static_cast(ratio_step) / beta;
+ const auto offset = static_cast(ratio_index) + alphabeta;
+ const auto rmin = std::max(ratio_value - ratio_error, 0.001);
+ const auto rmax = std::min(ratio_value + ratio_error,
+ fast_expf(static_cast(ratio_step) / beta) - 0.001);
+ const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0);
+ const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0);
+
+ // set these guys
+ tmax = 0.5 * (time1 + time2);
+ tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
+#ifdef DEBUG_TC_MAKERATIO
+ if (ch == 1 || ch == 0)
+ printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n",
+ ch,
+ ltx,
+ tmax,
+ tmaxerr,
+ time1,
+ time2,
+ offset,
+ rmin,
+ rmax);
+#endif
+
+ SampleVector::Scalar sumAf = 0;
+ SampleVector::Scalar sumff = 0;
+ const int itmin = std::max(-1, static_cast(std::floor(tmax - alphabeta)));
+ auto loffset = (static_cast(itmin) - tmax) * invalphabeta;
+ // TODO: data dependence
+ for (int it = itmin + 1; it < nsamples; it++) {
+ loffset += invalphabeta;
+ if (useless_sample_values[ch_start + it])
+ continue;
+ const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
+ const auto term1 = 1.0 + loffset;
+ const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
+ sumAf += sample_values[ch_start + it] * (f * inverr2);
+ sumff += f * (f * inverr2);
+ }
+
+ const auto sumAA = sumAAsNullHypot[ch];
+ const auto sum0 = sum0sNullHypot[ch];
+ chi2 = sumAA;
+ // TODO: sum0 can not be 0 below, need to introduce the check upfront
+ if (sumff > 0) {
+ chi2 = sumAA - sumAf * (sumAf / sumff);
+ }
+ chi2 /= sum0;
+
+#ifdef DEBUG_TC_MAKERATIO
+ if (ch == 1 || ch == 0)
+ printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n",
+ ch,
+ ltx,
+ sumAf,
+ sumff,
+ sumAA,
+ static_cast(sum0),
+ tmax,
+ tmaxerr,
+ chi2);
+#endif
+
+ if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
+ internalCondForSkipping2 = false;
+ else
+ chi2 = std::numeric_limits::max();
+ }
+ }
+
+ // store into smem
+ shr_chi2s[threadIdx.x] = chi2;
+ __syncthreads();
+
+ // find min chi2 - quite crude for now
+ // TODO validate/check
+ char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+ bool oddElements = nthreads_per_channel % 2;
+#pragma unroll
+ while (iter >= 1) {
+ if (ltx < iter)
+ // for odd ns, the last guy will just store itself
+ // exception is for ltx == 0 and iter==1
+ shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+ ? shr_chi2s[threadIdx.x]
+ : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]);
+ __syncthreads();
+ oddElements = iter % 2;
+ iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+ }
+
+ // filter out inactive or useless samples threads
+ if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
+ // min chi2, now compute weighted average of tmax measurements
+ // see cpu version for more explanation
+ const auto chi2min = shr_chi2s[threadIdx.x - ltx];
+ const auto chi2Limit = chi2min + 1.0;
+ const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0;
+
+#ifdef DEBUG_TC_MAKERATIO
+ if (ch == 1 || ch == 0)
+ printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
+ ch,
+ ltx,
+ chi2min,
+ chi2Limit,
+ inverseSigmaSquared);
+#endif
+
+ // store into shared mem and run reduction
+ // TODO: check if cooperative groups would be better
+ // TODO: check if shuffling intrinsics are better
+ shr_time_wgt[threadIdx.x] = inverseSigmaSquared;
+ shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared;
+ } else {
+ shr_time_wgt[threadIdx.x] = 0;
+ shr_time_max[threadIdx.x] = 0;
+ }
+ __syncthreads();
+
+ // reduce to compute time_max and time_wgt
+ iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
+ oddElements = nthreads_per_channel % 2;
+#pragma unroll
+ while (iter >= 1) {
+ if (ltx < iter) {
+ shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+ ? shr_time_wgt[threadIdx.x]
+ : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter];
+ shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+ ? shr_time_max[threadIdx.x]
+ : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter];
+ shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+ ? shrTimeMax[threadIdx.x]
+ : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter];
+ shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
+ ? shrTimeWgt[threadIdx.x]
+ : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter];
+ }
+
+ __syncthreads();
+ oddElements = iter % 2;
+ iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
+ }
+
+ // load from shared memory the 0th guy (will contain accumulated values)
+ // compute
+ // store into global mem
+ if (ltx == 0) {
+ const auto tmp_time_max = shr_time_max[threadIdx.x];
+ const auto tmp_time_wgt = shr_time_wgt[threadIdx.x];
+
+ // we are done if there number of time ratios is 0
+ if (tmp_time_wgt == 0 && tmp_time_max == 0) {
+ g_state[ch] = TimeComputationState::Finished;
+ return;
+ }
+
+ // no div by 0
+ const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
+ const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt);
+
+ tMaxAlphaBetas[ch] = tMaxAlphaBeta;
+ tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
+ g_accTimeMax[ch] = shrTimeMax[threadIdx.x];
+ g_accTimeWgt[ch] = shrTimeWgt[threadIdx.x];
+ g_state[ch] = TimeComputationState::NotFinished;
+
+#ifdef DEBUG_TC_MAKERATIO
+ printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
+ printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
+ ch,
+ tMaxAlphaBeta,
+ tMaxErrorAlphaBeta,
+ shrTimeMax[threadIdx.x],
+ shrTimeWgt[threadIdx.x]);
+#endif
+ }
+ }
+
+ /// launch ctx parameters are
+ /// 10 threads per channel, N channels per block, Y blocks
+ /// TODO: do we need to keep the state around or can be removed?!
+ //#define DEBUG_FINDAMPLCHI2_AND_FINISH
+ __global__ void kernel_time_compute_findamplchi2_and_finish(
+ SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ uint32_t const* dids_eb,
+ uint32_t const* dids_ee,
+ bool const* useless_samples,
+ SampleVector::Scalar const* g_tMaxAlphaBeta,
+ SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
+ SampleVector::Scalar const* g_accTimeMax,
+ SampleVector::Scalar const* g_accTimeWgt,
+ ConfigurationParameters::type const* amplitudeFitParametersEB,
+ ConfigurationParameters::type const* amplitudeFitParametersEE,
+ SampleVector::Scalar const* sumAAsNullHypot,
+ SampleVector::Scalar const* sum0sNullHypot,
+ SampleVector::Scalar const* chi2sNullHypot,
+ TimeComputationState* g_state,
+ SampleVector::Scalar* g_ampMaxAlphaBeta,
+ SampleVector::Scalar* g_ampMaxError,
+ SampleVector::Scalar* g_timeMax,
+ SampleVector::Scalar* g_timeError,
+ const int nchannels,
+ uint32_t const offsetForInputs) {
+ using ScalarType = SampleVector::Scalar;
+
+ // constants
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+ const int ch = gtx / nsamples;
+ const int sample = threadIdx.x % nsamples;
+ const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+ const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+
+ // configure shared mem
+ // per block, we need #threads per block * 2 * sizeof(ScalarType)
+ // we run with N channels per block
+ extern __shared__ char smem[];
+ ScalarType* shr_sumAf = reinterpret_cast(smem);
+ ScalarType* shr_sumff = shr_sumAf + blockDim.x;
+
+ if (ch >= nchannels)
+ return;
+
+ auto state = g_state[ch];
+ const auto did = DetId{dids[inputCh]};
+ const auto* amplitudeFitParameters =
+ did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+ // TODO is that better than storing into global and launching another kernel
+ // for the first 10 threads
+ if (state == TimeComputationState::NotFinished) {
+ const auto alpha = amplitudeFitParameters[0];
+ const auto beta = amplitudeFitParameters[1];
+ const auto alphabeta = alpha * beta;
+ const auto invalphabeta = 1.0 / alphabeta;
+ const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+ const auto sample_value = sample_values[gtx];
+ const auto sample_value_error = sample_value_errors[gtx];
+ const auto inverr2 =
+ useless_samples[gtx] ? static_cast(0) : 1.0 / (sample_value_error * sample_value_error);
+ const auto offset = (static_cast(sample) - tMaxAlphaBeta) * invalphabeta;
+ const auto term1 = 1.0 + offset;
+ const auto f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast(0.0);
+ const auto sumAf = sample_value * (f * inverr2);
+ const auto sumff = f * (f * inverr2);
+
+ // store into shared mem
+ shr_sumAf[threadIdx.x] = sumAf;
+ shr_sumff[threadIdx.x] = sumff;
+ } else {
+ shr_sumAf[threadIdx.x] = 0;
+ shr_sumff[threadIdx.x] = 0;
+ }
+ __syncthreads();
+
+ // reduce
+ // unroll completely here (but hardcoded)
+ if (sample < 5) {
+ shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5];
+ shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5];
+ }
+ __syncthreads();
+
+ if (sample < 2) {
+ // will need to subtract for ltx = 3, we double count here
+ shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3];
+ shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3];
+ }
+ __syncthreads();
+
+ if (sample == 0) {
+ // exit if the state is done
+ // note, we do not exit before all __synchtreads are finished
+ if (state == TimeComputationState::Finished) {
+ g_timeMax[ch] = 5;
+ g_timeError[ch] = -999;
+ return;
+ }
+
+ // subtract to avoid double counting
+ const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3];
+ const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3];
+
+ const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
+ const auto sumAA = sumAAsNullHypot[ch];
+ const auto sum0 = sum0sNullHypot[ch];
+ const auto nullChi2 = chi2sNullHypot[ch];
+ if (sumff > 0) {
+ const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
+ if (chi2AlphaBeta > nullChi2) {
+ // null hypothesis is better
+ state = TimeComputationState::Finished;
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+ printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
+ ch,
+ chi2AlphaBeta,
+ nullChi2,
+ sumAA,
+ sumAf,
+ sumff,
+ sum0);
+#endif
+ }
+
+ // store to global
+ g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
+ } else {
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+ printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
+#endif
+ state = TimeComputationState::Finished;
+ }
+
+ // store the state to global and finish calcs
+ g_state[ch] = state;
+ if (state == TimeComputationState::Finished) {
+ // store default values into global
+ g_timeMax[ch] = 5;
+ g_timeError[ch] = -999;
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+ printf("ch = %d finished state\n", ch);
+#endif
+ return;
+ }
+
+ const auto ampMaxError = g_ampMaxError[ch];
+ const auto test_ratio = ampMaxAlphaBeta / ampMaxError;
+ const auto accTimeMax = g_accTimeMax[ch];
+ const auto accTimeWgt = g_accTimeWgt[ch];
+ const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
+ const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
+ // branch to separate large vs small pulses
+ // see cpu version for more info
+ if (test_ratio > 5.0 && accTimeWgt > 0) {
+ const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast(0);
+ const auto tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast(0);
+
+ if (test_ratio > 10.0) {
+ g_timeMax[ch] = tMaxRatio;
+ g_timeError[ch] = tMaxErrorRatio;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+ printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
+#endif
+ } else {
+ const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+ tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+ 5.0;
+ const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) +
+ tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) /
+ 5.0;
+ state = TimeComputationState::Finished;
+ g_state[ch] = state;
+ g_timeMax[ch] = timeMax;
+ g_timeError[ch] = timeError;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+ printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
+#endif
+ }
+ } else {
+ state = TimeComputationState::Finished;
+ g_state[ch] = state;
+ g_timeMax[ch] = tMaxAlphaBeta;
+ g_timeError[ch] = tMaxErrorAlphaBeta;
+
+#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
+ printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
+#endif
+ }
+ }
+ }
+
+ __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+ uint16_t const* digis_ee,
+ SampleVector::Scalar* sample_values,
+ SampleVector::Scalar* sample_value_errors,
+ bool* useless_sample_values,
+ unsigned const int sample_mask,
+ const int nchannels,
+ uint32_t const offsetForInputs) {
+ using ScalarType = SampleVector::Scalar;
+
+ // constants
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+ const int ch = gtx / nsamples;
+ const int sample = threadIdx.x % nsamples;
+ const int inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
+ const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
+
+ // remove thread for sample 0, oversubscribing is easier than ....
+ if (ch >= nchannels || sample == 0)
+ return;
+
+ if (!use_sample(sample_mask, sample))
+ return;
+
+ const auto gainIdPrev = ecal::mgpa::gainId(digis[inputGtx - 1]);
+ const auto gainIdNext = ecal::mgpa::gainId(digis[inputGtx]);
+ if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
+ sample_values[gtx - 1] = 0;
+ sample_value_errors[gtx - 1] = 1e+9;
+ useless_sample_values[gtx - 1] = true;
+ }
+ }
+
+ __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ uint32_t const* dids,
+ bool const* useless_samples,
+ SampleVector::Scalar const* g_timeMax,
+ SampleVector::Scalar const* amplitudeFitParametersEB,
+ SampleVector::Scalar const* amplitudeFitParametersEE,
+ SampleVector::Scalar* g_amplitudeMax,
+ const int nchannels) {
+ using ScalarType = SampleVector::Scalar;
+
+ // constants
+ constexpr ScalarType corr4 = 1.;
+ constexpr ScalarType corr6 = 1.;
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+ const int ch = gtx / nsamples;
+ const int sample = threadIdx.x % nsamples;
+
+ if (ch >= nchannels)
+ return;
+
+ const auto did = DetId{dids[ch]};
+ const auto* amplitudeFitParameters =
+ did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE;
+
+ // configure shared mem
+ extern __shared__ char smem[];
+ ScalarType* shr_sum1 = reinterpret_cast(smem);
+ auto* shr_sumA = shr_sum1 + blockDim.x;
+ auto* shr_sumF = shr_sumA + blockDim.x;
+ auto* shr_sumAF = shr_sumF + blockDim.x;
+ auto* shr_sumFF = shr_sumAF + blockDim.x;
+
+ const auto alpha = amplitudeFitParameters[0];
+ const auto beta = amplitudeFitParameters[1];
+ const auto timeMax = g_timeMax[ch];
+ const auto pedestalLimit = timeMax - (alpha * beta) - 1.0;
+ const auto sample_value = sample_values[gtx];
+ const auto sample_value_error = sample_value_errors[gtx];
+ const auto inverr2 =
+ sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast(0);
+ const auto termOne = 1 + (sample - timeMax) / (alpha * beta);
+ const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta)
+ : static_cast(0.);
+
+ bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) ||
+ (f > 0.4 * corr4 && sample >= timeMax)) &&
+ !useless_samples[gtx];
+
+ // store into shared mem
+ shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast(0);
+ shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast(0);
+ shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast(0);
+ shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast(0);
+ shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast(0);
+
+ // reduction
+ if (sample <= 4) {
+ shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5];
+ shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5];
+ shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5];
+ shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5];
+ shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5];
+ }
+ __syncthreads();
+
+ if (sample < 2) {
+ // note: we double count sample 3
+ shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3];
+ shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3];
+ shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3];
+ shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3];
+ shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3];
+ }
+ __syncthreads();
+
+ if (sample == 0) {
+ const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3];
+ const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3];
+ const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3];
+ const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3];
+ const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3];
+
+ const auto denom = sumFF * sum1 - sumF * sumF;
+ const auto condForDenom = sum1 > 0 && ecal::abs(denom) > 1.e-20;
+ const auto amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast(0.);
+
+ // store into global mem
+ g_amplitudeMax[ch] = amplitudeMax;
+ }
+ }
+
+ //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+ __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
+ uint32_t const* dids_eb,
+ uint16_t const* digis_ee,
+ uint32_t const* dids_ee,
+ float const* rms_x12,
+ float const* rms_x6,
+ float const* rms_x1,
+ float const* mean_x12,
+ float const* mean_x6,
+ float const* mean_x1,
+ float const* gain12Over6,
+ float const* gain6Over1,
+ SampleVector::Scalar* sample_values,
+ SampleVector::Scalar* sample_value_errors,
+ SampleVector::Scalar* ampMaxError,
+ bool* useless_sample_values,
+ char* pedestal_nums,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs,
+ unsigned const int sample_maskEB,
+ unsigned const int sample_maskEE,
+ int nchannels) {
+ using ScalarType = SampleVector::Scalar;
+
+ // constants
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ const int tx = threadIdx.x + blockDim.x * blockIdx.x;
+ const int ch = tx / nsamples;
+ const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
+ const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
+ const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb;
+ const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb;
+
+ if (ch < nchannels) {
+ // indices/inits
+ const int sample = tx % nsamples;
+ const int input_ch_start = inputCh * nsamples;
+ SampleVector::Scalar pedestal = 0.;
+ int num = 0;
+
+ // configure shared mem
+ extern __shared__ char smem[];
+ ScalarType* shrSampleValues = reinterpret_cast(smem);
+ ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x;
+
+ // 0 and 1 sample values
+ const auto adc0 = ecal::mgpa::adc(digis[input_ch_start]);
+ const auto gainId0 = ecal::mgpa::gainId(digis[input_ch_start]);
+ const auto adc1 = ecal::mgpa::adc(digis[input_ch_start + 1]);
+ const auto gainId1 = ecal::mgpa::gainId(digis[input_ch_start + 1]);
+ const auto did = DetId{dids[inputCh]};
+ const auto isBarrel = did.subdetId() == EcalBarrel;
+ const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE;
+ const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+ : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+
+ // set pedestal
+ // TODO this branch is non-divergent for a group of 10 threads
+ if (gainId0 == 1 && use_sample(sample_mask, 0)) {
+ pedestal = static_cast(adc0);
+ num = 1;
+
+ const auto diff = adc1 - adc0;
+ if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) {
+ pedestal = (pedestal + static_cast(adc1)) / 2.0;
+ num = 2;
+ }
+ } else {
+ pedestal = mean_x12[ch];
+ }
+
+ // ped subtracted and gain-renormalized samples.
+ const auto gainId = ecal::mgpa::gainId(digis[inputTx]);
+ const auto adc = ecal::mgpa::adc(digis[inputTx]);
+
+ bool bad = false;
+ SampleVector::Scalar sample_value, sample_value_error;
+ // TODO divergent branch
+ // TODO: piece below is general both for amplitudes and timing
+ // potentially there is a way to reduce the amount of code...
+ if (!use_sample(sample_mask, sample)) {
+ bad = true;
+ sample_value = 0;
+ sample_value_error = 0;
+ } else if (gainId == 1) {
+ sample_value = static_cast(adc) - pedestal;
+ sample_value_error = rms_x12[hashedId];
+ } else if (gainId == 2) {
+ sample_value = (static_cast(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
+ sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
+ } else if (gainId == 3) {
+ sample_value = (static_cast(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] *
+ gain12Over6[hashedId];
+ sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
+ } else {
+ sample_value = 0;
+ sample_value_error = 0;
+ bad = true;
+ }
+
+ // TODO: make sure we save things correctly when sample is useless
+ const auto useless_sample = (sample_value_error <= 0) | bad;
+ useless_sample_values[tx] = useless_sample;
+ sample_values[tx] = sample_value;
+ sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
+
+ // DEBUG
+#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
+ if (ch == 0) {
+ printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
+ sample,
+ sample_value,
+ sample_value_error,
+ useless_sample ? '1' : '0');
+ }
+#endif
+
+ // store into the shared mem
+ shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits::min();
+ shrSampleValueErrors[threadIdx.x] = sample_value_error;
+ __syncthreads();
+
+ // perform the reduction with min
+ if (sample < 5) {
+ // note, if equal -> we keep the value with lower sample as for cpu
+ shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5]
+ ? shrSampleValueErrors[threadIdx.x + 5]
+ : shrSampleValueErrors[threadIdx.x];
+ shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]);
+ }
+ __syncthreads();
+
+ // a bit of an overkill, but easier than to compare across 3 values
+ if (sample < 3) {
+ shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3]
+ ? shrSampleValueErrors[threadIdx.x + 3]
+ : shrSampleValueErrors[threadIdx.x];
+ shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]);
+ }
+ __syncthreads();
+
+ if (sample < 2) {
+ shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2]
+ ? shrSampleValueErrors[threadIdx.x + 2]
+ : shrSampleValueErrors[threadIdx.x];
+ shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]);
+ }
+ __syncthreads();
+
+ if (sample == 0) {
+ // we only needd the max error
+ const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1]
+ ? shrSampleValueErrors[threadIdx.x + 1]
+ : shrSampleValueErrors[threadIdx.x];
+
+ // # pedestal samples used
+ pedestal_nums[ch] = num;
+ // this is used downstream
+ ampMaxError[ch] = maxSampleValueError;
+
+ // DEBUG
+#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG
+ if (ch == 0) {
+ printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
+ }
+#endif
+ }
+ }
+ }
+
+ ///
+ /// launch context parameters: 1 thread per channel
+ ///
+ //#define DEBUG_TIME_CORRECTION
+ __global__ void kernel_time_correction_and_finalize(
+ // SampleVector::Scalar const* g_amplitude,
+ ::ecal::reco::StorageScalarType const* g_amplitude,
+ uint16_t const* digis_eb,
+ uint32_t const* dids_eb,
+ uint16_t const* digis_ee,
+ uint32_t const* dids_ee,
+ float const* amplitudeBinsEB,
+ float const* amplitudeBinsEE,
+ float const* shiftBinsEB,
+ float const* shiftBinsEE,
+ SampleVector::Scalar const* g_timeMax,
+ SampleVector::Scalar const* g_timeError,
+ float const* g_rms_x12,
+ float const* timeCalibConstant,
+ float* g_jitter,
+ float* g_jitterError,
+ uint32_t* flags,
+ const int amplitudeBinsSizeEB,
+ const int amplitudeBinsSizeEE,
+ ConfigurationParameters::type const timeConstantTermEB,
+ ConfigurationParameters::type const timeConstantTermEE,
+ float const offsetTimeValueEB,
+ float const offsetTimeValueEE,
+ ConfigurationParameters::type const timeNconstEB,
+ ConfigurationParameters::type const timeNconstEE,
+ ConfigurationParameters::type const amplitudeThresholdEB,
+ ConfigurationParameters::type const amplitudeThresholdEE,
+ ConfigurationParameters::type const outOfTimeThreshG12pEB,
+ ConfigurationParameters::type const outOfTimeThreshG12pEE,
+ ConfigurationParameters::type const outOfTimeThreshG12mEB,
+ ConfigurationParameters::type const outOfTimeThreshG12mEE,
+ ConfigurationParameters::type const outOfTimeThreshG61pEB,
+ ConfigurationParameters::type const outOfTimeThreshG61pEE,
+ ConfigurationParameters::type const outOfTimeThreshG61mEB,
+ ConfigurationParameters::type const outOfTimeThreshG61mEE,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs,
+ const int nchannels) {
+ using ScalarType = SampleVector::Scalar;
+
+ // constants
+ constexpr int nsamples = EcalDataFrame::MAXSAMPLES;
+
+ // indices
+ const int gtx = threadIdx.x + blockIdx.x * blockDim.x;
+ const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
+ const auto* dids = gtx >= offsetForInputs ? dids_ee : dids_eb;
+ const auto& digis = gtx >= offsetForInputs ? digis_ee : digis_eb;
+
+ // filter out outside of range threads
+ if (gtx >= nchannels)
+ return;
+
+ const auto did = DetId{dids[inputGtx]};
+ const auto isBarrel = did.subdetId() == EcalBarrel;
+ const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
+ : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
+ const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE;
+ const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE;
+ const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE;
+ const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
+ const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
+ const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE;
+ const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
+ const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
+ const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
+ const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
+ const auto outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
+
+ // load some
+ const auto amplitude = g_amplitude[gtx];
+ const auto rms_x12 = g_rms_x12[hashedId];
+ const auto timeCalibConst = timeCalibConstant[hashedId];
+
+ int myBin = -1;
+ for (int bin = 0; bin < amplitudeBinsSize; bin++) {
+ if (amplitude > amplitudeBins[bin])
+ myBin = bin;
+ else
+ break;
+ }
+
+ ScalarType correction = 0;
+ if (myBin == -1) {
+ correction = shiftBins[0];
+ } else if (myBin == amplitudeBinsSize - 1) {
+ correction = shiftBins[myBin];
+ } else {
+ correction = shiftBins[myBin + 1] - shiftBins[myBin];
+ correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
+ correction += shiftBins[myBin];
+ }
+
+ // correction * 1./25.
+ correction = correction * 0.04;
+ const auto timeMax = g_timeMax[gtx];
+ const auto timeError = g_timeError[gtx];
+ const auto jitter = timeMax - 5 + correction;
+ const auto jitterError =
+ std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25.
+
+#ifdef DEBUG_TIME_CORRECTION
+ // if (gtx == 0) {
+ printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
+ gtx,
+ timeMax,
+ timeError,
+ jitter,
+ correction);
+// }
+#endif
+
+ // store back to global
+ g_jitter[gtx] = jitter;
+ g_jitterError[gtx] = jitterError;
+
+ // set the flag
+ // TODO: replace with something more efficient (if required),
+ // for now just to make it work
+ if (amplitude > amplitudeThreshold * rms_x12) {
+ auto threshP = outOfTimeThreshG12p;
+ auto threshM = outOfTimeThreshG12m;
+ if (amplitude > 3000.) {
+ for (int isample = 0; isample < nsamples; isample++) {
+ int gainid = ecal::mgpa::gainId(digis[nsamples * inputGtx + isample]);
+ if (gainid != 1) {
+ threshP = outOfTimeThreshG61p;
+ threshM = outOfTimeThreshG61m;
+ break;
+ }
+ }
+ }
+
+ const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
+ const auto nterm = timeNconst * rms_x12 / amplitude;
+ const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
+ if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
+ flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
+ }
+ }
+
+ } // namespace multifit
+} // namespace ecal
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h
new file mode 100644
index 0000000000000..68500ddfe70d2
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h
@@ -0,0 +1,182 @@
+#ifndef RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h
+#define RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h
+
+#include
+#include
+
+#include
+
+#include "DataFormats/Math/interface/approx_exp.h"
+#include "DataFormats/Math/interface/approx_log.h"
+
+#include "Common.h"
+#include "DeclsForKernels.h"
+#include "EigenMatrixTypes_gpu.h"
+
+//#define DEBUG
+
+//#define ECAL_RECO_CUDA_DEBUG
+
+namespace ecal {
+ namespace multifit {
+
+ __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ bool const* useless_sample_values,
+ SampleVector::Scalar* chi2s,
+ SampleVector::Scalar* sum0s,
+ SampleVector::Scalar* sumAAs,
+ int const nchannels);
+ //
+ // launch ctx parameters are
+ // 45 threads per channel, X channels per block, Y blocks
+ // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
+ // TODO: it might be much beter to use 32 threads per channel instead of 45
+ // to simplify the synchronization
+ //
+ __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ uint32_t const* dids_eb,
+ uint32_t const* dids_ee,
+ bool const* useless_sample_values,
+ char const* pedestal_nums,
+ ConfigurationParameters::type const* amplitudeFitParametersEB,
+ ConfigurationParameters::type const* amplitudeFitParametersEE,
+ ConfigurationParameters::type const* timeFitParametersEB,
+ ConfigurationParameters::type const* timeFitParametersEE,
+ SampleVector::Scalar const* sumAAsNullHypot,
+ SampleVector::Scalar const* sum0sNullHypot,
+ SampleVector::Scalar* tMaxAlphaBetas,
+ SampleVector::Scalar* tMaxErrorAlphaBetas,
+ SampleVector::Scalar* g_accTimeMax,
+ SampleVector::Scalar* g_accTimeWgt,
+ TimeComputationState* g_state,
+ unsigned int const timeFitParameters_sizeEB,
+ unsigned int const timeFitParameters_sizeEE,
+ ConfigurationParameters::type const timeFitLimits_firstEB,
+ ConfigurationParameters::type const timeFitLimits_firstEE,
+ ConfigurationParameters::type const timeFitLimits_secondEB,
+ ConfigurationParameters::type const timeFitLimits_secondEE,
+ int const nchannels,
+ uint32_t const offsetForInputs);
+
+ /// launch ctx parameters are
+ /// 10 threads per channel, N channels per block, Y blocks
+ /// TODO: do we need to keep the state around or can be removed?!
+ //#define DEBUG_FINDAMPLCHI2_AND_FINISH
+ __global__ void kernel_time_compute_findamplchi2_and_finish(
+ SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ uint32_t const* dids_eb,
+ uint32_t const* dids_ee,
+ bool const* useless_samples,
+ SampleVector::Scalar const* g_tMaxAlphaBeta,
+ SampleVector::Scalar const* g_tMaxErrorAlphaBeta,
+ SampleVector::Scalar const* g_accTimeMax,
+ SampleVector::Scalar const* g_accTimeWgt,
+ ConfigurationParameters::type const* amplitudeFitParametersEB,
+ ConfigurationParameters::type const* amplitudeFitParametersEE,
+ SampleVector::Scalar const* sumAAsNullHypot,
+ SampleVector::Scalar const* sum0sNullHypot,
+ SampleVector::Scalar const* chi2sNullHypot,
+ TimeComputationState* g_state,
+ SampleVector::Scalar* g_ampMaxAlphaBeta,
+ SampleVector::Scalar* g_ampMaxError,
+ SampleVector::Scalar* g_timeMax,
+ SampleVector::Scalar* g_timeError,
+ int const nchannels,
+ uint32_t const offsetForInputs);
+
+ __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb,
+ uint16_t const* digis_ee,
+ SampleVector::Scalar* sample_values,
+ SampleVector::Scalar* sample_value_errors,
+ bool* useless_sample_values,
+ unsigned int const sample_mask,
+ int const nchannels,
+ uint32_t const offsetForInputs);
+
+ __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values,
+ SampleVector::Scalar const* sample_value_errors,
+ uint32_t const* dids_eb,
+ uint32_t const* dids_ed,
+ bool const* useless_samples,
+ SampleVector::Scalar const* g_timeMax,
+ SampleVector::Scalar const* amplitudeFitParametersEB,
+ SampleVector::Scalar const* amplitudeFitParametersEE,
+ SampleVector::Scalar* g_amplitudeMax,
+ int const nchannels,
+ uint32_t const offsetForInputs);
+
+ //#define ECAL_RECO_CUDA_TC_INIT_DEBUG
+ __global__ void kernel_time_computation_init(uint16_t const* digis_eb,
+ uint32_t const* dids_eb,
+ uint16_t const* digis_ee,
+ uint32_t const* dids_ee,
+ float const* rms_x12,
+ float const* rms_x6,
+ float const* rms_x1,
+ float const* mean_x12,
+ float const* mean_x6,
+ float const* mean_x1,
+ float const* gain12Over6,
+ float const* gain6Over1,
+ SampleVector::Scalar* sample_values,
+ SampleVector::Scalar* sample_value_errors,
+ SampleVector::Scalar* ampMaxError,
+ bool* useless_sample_values,
+ char* pedestal_nums,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs,
+ unsigned int const sample_maskEB,
+ unsigned int const sample_maskEE,
+ int nchannels);
+
+ ///
+ /// launch context parameters: 1 thread per channel
+ ///
+ //#define DEBUG_TIME_CORRECTION
+ __global__ void kernel_time_correction_and_finalize(
+ // SampleVector::Scalar const* g_amplitude,
+ ::ecal::reco::StorageScalarType const* g_amplitude,
+ uint16_t const* digis_eb,
+ uint32_t const* dids_eb,
+ uint16_t const* digis_ee,
+ uint32_t const* dids_ee,
+ float const* amplitudeBinsEB,
+ float const* amplitudeBinsEE,
+ float const* shiftBinsEB,
+ float const* shiftBinsEE,
+ SampleVector::Scalar const* g_timeMax,
+ SampleVector::Scalar const* g_timeError,
+ float const* g_rms_x12,
+ float const* timeCalibConstant,
+ ::ecal::reco::StorageScalarType* g_jitter,
+ ::ecal::reco::StorageScalarType* g_jitterError,
+ uint32_t* flags,
+ int const amplitudeBinsSizeEB,
+ int const amplitudeBinsSizeEE,
+ ConfigurationParameters::type const timeConstantTermEB,
+ ConfigurationParameters::type const timeConstantTermEE,
+ float const offsetTimeValueEB,
+ float const offsetTimeValueEE,
+ ConfigurationParameters::type const timeNconstEB,
+ ConfigurationParameters::type const timeNconstEE,
+ ConfigurationParameters::type const amplitudeThresholdEB,
+ ConfigurationParameters::type const amplitudeThresholdEE,
+ ConfigurationParameters::type const outOfTimeThreshG12pEB,
+ ConfigurationParameters::type const outOfTimeThreshG12pEE,
+ ConfigurationParameters::type const outOfTimeThreshG12mEB,
+ ConfigurationParameters::type const outOfTimeThreshG12mEE,
+ ConfigurationParameters::type const outOfTimeThreshG61pEB,
+ ConfigurationParameters::type const outOfTimeThreshG61pEE,
+ ConfigurationParameters::type const outOfTimeThreshG61mEB,
+ ConfigurationParameters::type const outOfTimeThreshG61mEE,
+ uint32_t const offsetForHashes,
+ uint32_t const offsetForInputs,
+ int const nchannels);
+
+ } // namespace multifit
+} // namespace ecal
+
+#endif // RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h