From 7b3807c1489492f8af3a545d7729aebb634bfe6b Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 15 Jun 2020 12:41:59 +0200 Subject: [PATCH] Restructure code to work around CUDA build limitations (cms-patatrack#483) Move ECAL and HCAL CUDA code to plugins. General cleanup: remove unused code, apply clang-format and various include changes. Fix product labels for HCAL rechits on CPU. Co-authored-by: Andrea Bocci --- EventFilter/EcalRawToDigi/BuildFile.xml | 3 - .../interface/EcalRegionCabling.h | 15 +- .../EcalRawToDigi/plugins/BuildFile.xml | 7 +- .../EcalRawToDigi/plugins/DeclsForKernels.h | 108 ++ .../plugins/EcalCPUDigisProducer.cc | 5 +- .../EcalRawToDigi/plugins/EcalRawToDigiGPU.cc | 31 +- .../EcalRawToDigi/plugins/UnpackGPU.cu | 337 +++++ EventFilter/EcalRawToDigi/plugins/UnpackGPU.h | 23 + .../AmplitudeComputationCommonKernels.cu | 505 ++++++++ .../AmplitudeComputationCommonKernels.h | 98 ++ .../plugins/AmplitudeComputationKernels.cu | 400 ++++++ .../plugins/AmplitudeComputationKernels.h | 29 + .../EcalRecProducers/plugins/BuildFile.xml | 25 +- .../EcalRecProducers/plugins/Common.h | 46 + .../plugins/DeclsForKernels.h | 333 +++++ .../plugins/EcalRecHitBuilderKernels.cu | 673 ++++++++++ .../plugins/EcalRecHitBuilderKernels.h | 94 ++ .../plugins/EcalRecHitConvertGPU2CPUFormat.cc | 20 +- .../plugins/EcalRecHitProducerGPU.cc | 72 +- .../EcalUncalibRecHitConvertGPU2CPUFormat.cc | 20 +- .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu | 312 +++++ .../EcalUncalibRecHitMultiFitAlgo_gpu_new.h | 23 + .../plugins/EcalUncalibRecHitProducerGPU.cc | 47 +- .../plugins/EigenMatrixTypes_gpu.h | 49 + .../EcalRecProducers/plugins/KernelHelpers.cu | 323 +++++ .../EcalRecProducers/plugins/KernelHelpers.h | 452 +++++++ .../plugins/TimeComputationKernels.cu | 1131 +++++++++++++++++ .../plugins/TimeComputationKernels.h | 182 +++ 28 files changed, 5215 insertions(+), 148 deletions(-) create mode 100644 EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h create mode 100644 EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu create mode 100644 EventFilter/EcalRawToDigi/plugins/UnpackGPU.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/Common.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml index da28405324833..31e115bdc7880 100644 --- a/EventFilter/EcalRawToDigi/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/BuildFile.xml @@ -8,7 +8,6 @@ - @@ -18,8 +17,6 @@ - - diff --git a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h index fa6e9f5d5a161..38a9ebdf18cb8 100644 --- a/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h +++ b/EventFilter/EcalRawToDigi/interface/EcalRegionCabling.h @@ -1,14 +1,11 @@ -#ifndef EcalRegionCabling_H -#define EcalRegionCabling_H +#ifndef EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h +#define EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h -#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h" -#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h" - -#include "DataFormats/EcalRecHit/interface/EcalRecHit.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "DataFormats/FEDRawData/interface/FEDNumbering.h" - #include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "Geometry/EcalMapping/interface/ESElectronicsMapper.h" +#include "Geometry/EcalMapping/interface/EcalElectronicsMapping.h" class EcalRegionCabling { public: @@ -73,4 +70,4 @@ class EcalRegionCabling { const ESElectronicsMapper* es_mapping_; }; -#endif +#endif // EventFilter_EcalRawToDigi_interface_EcalRegionCabling_h diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml index 3bd25122ff029..df29d960b57e9 100644 --- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml @@ -3,7 +3,6 @@ - @@ -14,10 +13,6 @@ - - - - - + diff --git a/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h new file mode 100644 index 0000000000000..9eab30a33fb3c --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/DeclsForKernels.h @@ -0,0 +1,108 @@ +#ifndef EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h +#define EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h + +#include + +#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h" +#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace ecal { + namespace raw { + + constexpr auto empty_event_size = EMPTYEVENTSIZE; + constexpr uint32_t nfeds_max = 54; + constexpr uint32_t nbytes_per_fed_max = 10 * 1024; + + struct InputDataCPU { + std::vector> data; + std::vector> offsets; + std::vector> feds; + + void allocate() { + // 2KB per FED resize + data.resize(nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max); + offsets.resize(nfeds_max, 0); + feds.resize(nfeds_max, 0); + } + }; + + struct ConfigurationParameters { + uint32_t maxChannels; + }; + + struct OutputDataCPU { + // [0] - eb, [1] - ee + std::vector> nchannels; + + void allocate() { nchannels.resize(2); } + }; + + struct OutputDataGPU { + uint16_t *samplesEB = nullptr, *samplesEE = nullptr; + uint32_t *idsEB = nullptr, *idsEE = nullptr; + + // FIXME: we should separate max channels parameter for eb and ee + // FIXME: replace hardcoded values + void allocate(ConfigurationParameters const &config) { + cudaCheck(cudaMalloc((void **)&samplesEB, config.maxChannels * sizeof(uint16_t) * 10)); + cudaCheck(cudaMalloc((void **)&samplesEE, config.maxChannels * sizeof(uint16_t) * 10)); + cudaCheck(cudaMalloc((void **)&idsEB, config.maxChannels * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void **)&idsEE, config.maxChannels * sizeof(uint32_t))); + } + + void deallocate(ConfigurationParameters const &config) { + if (samplesEB) { + cudaCheck(cudaFree(samplesEB)); + cudaCheck(cudaFree(samplesEE)); + cudaCheck(cudaFree(idsEB)); + cudaCheck(cudaFree(idsEE)); + } + } + }; + + struct ScratchDataGPU { + // [0] = EB + // [1] = EE + uint32_t *pChannelsCounter = nullptr; + + void allocate(ConfigurationParameters const &config) { + cudaCheck(cudaMalloc((void **)&pChannelsCounter, sizeof(uint32_t) * 2)); + } + + void deallocate(ConfigurationParameters const &config) { + if (pChannelsCounter) { + cudaCheck(cudaFree(pChannelsCounter)); + } + } + }; + + struct InputDataGPU { + unsigned char *data = nullptr; + uint32_t *offsets = nullptr; + int *feds = nullptr; + + void allocate() { + cudaCheck(cudaMalloc((void **)&data, sizeof(unsigned char) * nbytes_per_fed_max * nfeds_max)); + cudaCheck(cudaMalloc((void **)&offsets, sizeof(uint32_t) * nfeds_max)); + cudaCheck(cudaMalloc((void **)&feds, sizeof(int) * nfeds_max)); + } + + void deallocate() { + if (data) { + cudaCheck(cudaFree(data)); + cudaCheck(cudaFree(offsets)); + cudaCheck(cudaFree(feds)); + } + } + }; + + struct ConditionsProducts { + ElectronicsMappingGPU::Product const &eMappingProduct; + }; + + } // namespace raw +} // namespace ecal + +#endif // EventFilter_EcalRawToDigi_plugins_DeclsForKernels_h diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc index a046ce92fe336..fa53efe6286f1 100644 --- a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc +++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc @@ -6,9 +6,7 @@ #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" #include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" -#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h" #include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" -#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" @@ -18,6 +16,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "DeclsForKernels.h" +#include "UnpackGPU.h" + class EcalCPUDigisProducer : public edm::stream::EDProducer { public: explicit EcalCPUDigisProducer(edm::ParameterSet const& ps); diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc index 18dc2307e9bfc..e2a127e950252 100644 --- a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc +++ b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc @@ -1,29 +1,20 @@ #include -// framework -#include "FWCore/Framework/interface/stream/EDProducer.h" -//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" - -// algorithm specific - -#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" -#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" #include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" - #include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h" - +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" #include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h" -#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" +#include "DeclsForKernels.h" +#include "UnpackGPU.h" class EcalRawToDigiGPU : public edm::stream::EDProducer { public: diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu new file mode 100644 index 0000000000000..79f916d510e72 --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.cu @@ -0,0 +1,337 @@ +#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h" + +#include "UnpackGPU.h" + +namespace ecal { + namespace raw { + + __forceinline__ __device__ void print_raw_buffer(uint8_t const* const buffer, + uint32_t const nbytes, + uint32_t const nbytes_per_row = 20) { + for (uint32_t i = 0; i < nbytes; i++) { + if (i % nbytes_per_row == 0 && i > 0) + printf("\n"); + printf("%02X ", buffer[i]); + } + } + + __forceinline__ __device__ void print_first3bits(uint64_t const* buffer, uint32_t size) { + for (uint32_t i = 0; i < size; ++i) { + uint8_t const b61 = (buffer[i] >> 61) & 0x1; + uint8_t const b62 = (buffer[i] >> 62) & 0x1; + uint8_t const b63 = (buffer[i] >> 63) & 0x1; + printf("[word: %u] %u%u%u\n", i, b63, b62, b61); + } + } + + __forceinline__ __device__ bool is_barrel(uint8_t dccid) { + return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && dccid <= ElectronicsIdGPU::MAX_DCCID_EBP; + } + + __forceinline__ __device__ uint8_t fed2dcc(int fed) { return static_cast(fed - 600); } + + __forceinline__ __device__ int zside_for_eb(ElectronicsIdGPU const& eid) { + int dcc = eid.dccId(); + return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) ? -1 : 1; + } + + __forceinline__ __device__ bool is_synced_towerblock(uint16_t const dccbx, + uint16_t const bx, + uint16_t const dccl1, + uint16_t const l1) { + bool const bxsync = (bx == 0 && dccbx == 3564) || (bx == dccbx && dccbx != 3564); + bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff)); + return bxsync && l1sync; + } + + __forceinline__ __device__ bool right_tower_for_eb(int tower) { + // for EB, two types of tower (LVRB top/bottom) + if ((tower > 12 && tower < 21) || (tower > 28 && tower < 37) || (tower > 44 && tower < 53) || + (tower > 60 && tower < 69)) + return true; + else + return false; + } + + __forceinline__ __device__ uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) { + // as in Geometry/EcalMaping/.../EcalElectronicsMapping + auto const dcc = eid.dccId(); + auto const tower = eid.towerId(); + auto const strip = eid.stripId(); + auto const xtal = eid.xtalId(); + + int smid = 0; + int iphi = 0; + bool EBPlus = (zside_for_eb(eid) > 0); + bool EBMinus = !EBPlus; + + if (zside_for_eb(eid) < 0) { + smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM; + iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi; + iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi); + } else { + smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP; + iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi; + iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1); + } + + bool RightTower = right_tower_for_eb(tower); + int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1; + if (RightTower) { + ieta += (strip - 1); + if (strip % 2 == 1) { + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - (xtal - 1)) + 1; + } else { + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) + 1; + } + } else { + ieta += 4 - (strip - 1); + if (strip % 2 == 1) { + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) + 1; + } else { + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - (xtal - 1)) + 1; + } + } + + if (zside_for_eb(eid) < 0) + ieta = -ieta; + + DetId did{DetId::Ecal, EcalBarrel}; + return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF); + } + + __forceinline__ __device__ int adc(uint16_t sample) { return sample & 0xfff; } + + __forceinline__ __device__ int gainId(uint16_t sample) { return (sample >> 12) & 0x3; } + + template + __global__ void kernel_unpack_test(unsigned char const* __restrict__ data, + uint32_t const* __restrict__ offsets, + int const* __restrict__ feds, + uint16_t* samplesEB, + uint16_t* samplesEE, + uint32_t* idsEB, + uint32_t* idsEE, + uint32_t* pChannelsCounterEBEE, + uint32_t const* eid2did, + uint32_t const nbytesTotal) { + // indices + auto const ifed = blockIdx.x; + + // FIXME: use only the very first fed + //if (ifed!=10) return; + + // offset in bytes + auto const offset = offsets[ifed]; + // fed id + auto const fed = feds[ifed]; + auto const isBarrel = is_barrel(static_cast(fed - 600)); + // size + auto const size = ifed == gridDim.x - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset; + auto* samples = isBarrel ? samplesEB : samplesEE; + auto* ids = isBarrel ? idsEB : idsEE; + auto* pChannelsCounter = isBarrel ? &pChannelsCounterEBEE[0] : &pChannelsCounterEBEE[1]; + + // FIXME: debugging + //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size); + + // offset to the right raw buffer + uint64_t const* buffer = reinterpret_cast(data + offset); + + // dump first 3 bits for each 64-bit word + //print_first3bits(buffer, size / 8); + + // + // fed header + // + auto const fed_header = buffer[0]; + uint32_t bx = (fed_header >> 20) & 0xfff; + uint32_t lv1 = (fed_header >> 32) & 0xffffff; + + // 9 for fed + dcc header + // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block + // 6 for SR block size + + // dcc header w2 + auto const w2 = buffer[2]; + uint8_t const fov = (w2 >> 48) & 0xf; + + // + // print Tower block headers + // + uint8_t ntccblockwords = isBarrel ? 18 : 36; + auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6; + auto const* trailer = buffer + (size / 8 - 1); + auto const* current_tower_block = tower_blocks_start; + while (current_tower_block != trailer) { + auto const w = *current_tower_block; + uint8_t ttid = w & 0xff; + uint16_t bxlocal = (w >> 16) & 0xfff; + uint16_t lv1local = (w >> 32) & 0xfff; + uint16_t block_length = (w >> 48) & 0x1ff; + + uint16_t const dccbx = bx & 0xfff; + uint16_t const dccl1 = lv1 & 0xfff; + // fov>=1 is required to support simulated data for which bx==bxlocal==0 + if (fov >= 1 && !is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) { + current_tower_block += block_length; + continue; + } + + // go through all the channels + // get the next channel coordinates + uint32_t nchannels = (block_length - 1) / 3; + + // 1 threads per channel in this block + for (uint32_t ich = 0; ich < nchannels; ich += NTHREADS) { + auto const i_to_access = ich + threadIdx.x; + // threads outside of the range -> leave the loop + if (i_to_access >= nchannels) + break; + + // inc the channel's counter and get the pos where to store + auto const wdata = current_tower_block[1 + i_to_access * 3]; + uint8_t const stripid = wdata & 0x7; + uint8_t const xtalid = (wdata >> 4) & 0x7; + ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid}; + auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()]; + // FIXME: what kind of channels are these guys + if (didraw == 0) + continue; + + // get samples + uint16_t sampleValues[10]; + sampleValues[0] = (wdata >> 16) & 0x3fff; + sampleValues[1] = (wdata >> 32) & 0x3fff; + sampleValues[2] = (wdata >> 48) & 0x3fff; + auto const wdata1 = current_tower_block[2 + i_to_access * 3]; + sampleValues[3] = wdata1 & 0x3fff; + sampleValues[4] = (wdata1 >> 16) & 0x3fff; + sampleValues[5] = (wdata1 >> 32) & 0x3fff; + sampleValues[6] = (wdata1 >> 48) & 0x3fff; + auto const wdata2 = current_tower_block[3 + i_to_access * 3]; + sampleValues[7] = wdata2 & 0x3fff; + sampleValues[8] = (wdata2 >> 16) & 0x3fff; + sampleValues[9] = (wdata2 >> 32) & 0x3fff; + //printf("stripid = %u xtalid = %u\n", stripid, xtalid); + + // check gain + bool isSaturation = true; + short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1}; + for (uint32_t si = 0; si < 10; si++) { + if (gainId(sampleValues[si]) == 0) { + firstGainZeroSampID = si; + firstGainZeroSampADC = adc(sampleValues[si]); + break; + } + } + if (firstGainZeroSampID != -1) { + unsigned int plateauEnd = std::min(10u, (unsigned int)(firstGainZeroSampID + 5)); + for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) { + if (gainId(sampleValues[s]) == 0 && adc(sampleValues[s]) == firstGainZeroSampADC) { + ; + } else { + isSaturation = false; + break; + } //it's not saturation + } + // get rid of channels which are stuck in gain0 + if (firstGainZeroSampID < 3) { + isSaturation = false; + } + if (!isSaturation) + continue; + } else { // there is no zero gainId sample + // gain switch check + short numGain = 1; + bool gainSwitchError = false; + for (unsigned int si = 1; si < 10; si++) { + if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5) + gainSwitchError = true; + if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si])) + numGain++; + else + numGain = 1; + } + if (gainSwitchError) + continue; + } + + auto const pos = atomicAdd(pChannelsCounter, 1); + + // store to global + ids[pos] = didraw; + samples[pos * 10] = sampleValues[0]; + samples[pos * 10 + 1] = sampleValues[1]; + samples[pos * 10 + 2] = sampleValues[2]; + samples[pos * 10 + 3] = sampleValues[3]; + samples[pos * 10 + 4] = sampleValues[4]; + samples[pos * 10 + 5] = sampleValues[5]; + samples[pos * 10 + 6] = sampleValues[6]; + samples[pos * 10 + 7] = sampleValues[7]; + samples[pos * 10 + 8] = sampleValues[8]; + samples[pos * 10 + 9] = sampleValues[9]; + } + + current_tower_block += block_length; + } + } + + void entryPoint(InputDataCPU const& inputCPU, + InputDataGPU& inputGPU, + OutputDataGPU& outputGPU, + ScratchDataGPU& scratchGPU, + OutputDataCPU& outputCPU, + ConditionsProducts const& conditions, + cudaStream_t cudaStream, + uint32_t const nfedsWithData, + uint32_t const nbytesTotal) { + // transfer + cudaCheck(cudaMemcpyAsync( + inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync(inputGPU.offsets, + inputCPU.offsets.data(), + nfedsWithData * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounter, + 0, + sizeof(uint32_t) * 2, // EB + EE + cudaStream)); + cudaCheck(cudaMemcpyAsync( + inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); + + kernel_unpack_test<32><<>>(inputGPU.data, + inputGPU.offsets, + inputGPU.feds, + outputGPU.samplesEB, + outputGPU.samplesEE, + outputGPU.idsEB, + outputGPU.idsEE, + scratchGPU.pChannelsCounter, + conditions.eMappingProduct.eid2did, + nbytesTotal); + cudaCheck(cudaGetLastError()); + + // transfer the counters for how many eb and ee channels we got + cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(), + scratchGPU.pChannelsCounter, + sizeof(uint32_t) * 2, + cudaMemcpyDeviceToHost, + cudaStream)); + } + + } // namespace raw +} // namespace ecal diff --git a/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h new file mode 100644 index 0000000000000..d98906e7e24a7 --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/UnpackGPU.h @@ -0,0 +1,23 @@ +#ifndef EventFilter_EcalRawToDigi_plugins_UnpackGPU_h +#define EventFilter_EcalRawToDigi_plugins_UnpackGPU_h + +#include "DeclsForKernels.h" + +namespace ecal { + namespace raw { + + // FIXME: bundle up uint32_t values + void entryPoint(InputDataCPU const&, + InputDataGPU&, + OutputDataGPU&, + ScratchDataGPU&, + OutputDataCPU&, + ConditionsProducts const&, + cudaStream_t, + uint32_t const, + uint32_t const); + + } // namespace raw +} // namespace ecal + +#endif // EventFilter_EcalRawToDigi_plugins_UnpackGPU_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu new file mode 100644 index 0000000000000..36601a0fb9230 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu @@ -0,0 +1,505 @@ +#include +#include + +#include + +#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h" +#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" +#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h" +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_log.h" + +#include "AmplitudeComputationCommonKernels.h" +#include "KernelHelpers.h" + +namespace ecal { + namespace multifit { + + /// + /// assume kernel launch configuration is + /// (MAXSAMPLES * nchannels, blocks) + /// + __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in, + uint16_t const* digis_in_eb, + uint32_t const* dids_eb, + uint16_t const* digis_in_ee, + uint32_t const* dids_ee, + SampleVector* amplitudes, + SampleVector* amplitudesForMinimization, + SampleGainVector* gainsNoise, + float const* mean_x1, + float const* mean_x12, + float const* rms_x12, + float const* mean_x6, + float const* gain6Over1, + float const* gain12Over6, + bool* hasSwitchToGain6, + bool* hasSwitchToGain1, + bool* isSaturated, + ::ecal::reco::StorageScalarType* energies, + ::ecal::reco::StorageScalarType* chi2, + ::ecal::reco::StorageScalarType* g_pedestal, + uint32_t* dids_out, + uint32_t* flags, + char* acState, + BXVectorType* bxs, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + bool const gainSwitchUseMaxSampleEB, + bool const gainSwitchUseMaxSampleEE, + int const nchannels) { + constexpr bool dynamicPedestal = false; //---- default to false, ok + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + constexpr int sample_max = 5; + constexpr int full_pulse_max = 9; + int const tx = threadIdx.x + blockIdx.x * blockDim.x; + int const nchannels_per_block = blockDim.x / nsamples; + int const ch = tx / nsamples; + // for accessing input arrays + int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx; + // eb is first and then ee + auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb; + auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + int const sample = threadIdx.x % nsamples; + + if (ch < nchannels) { + // array of 10 x channels per block + // TODO: any other way of doing simple reduction + // assume bool is 1 byte, should be quite safe + extern __shared__ char shared_mem[]; + bool* shr_hasSwitchToGain6 = reinterpret_cast(shared_mem); + bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples; + bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples; + bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples; + bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples; + char* shr_counts = reinterpret_cast(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples; + + // + // indices + // + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + // TODO offset for ee, 0 for eb + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + + // + // pulse shape template + /* + for (int isample=sample; isample first 2 threads of each channel will be used here + // => 0 -> will compare 3 and 4 and put into 0 + // => 1 -> will compare 4 and 5 and put into 1 + shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4]; + } + __syncthreads(); + + bool check_hasSwitchToGain0 = false; + + if (sample == 0) { + shr_hasSwitchToGain6[threadIdx.x] = + shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1]; + shr_hasSwitchToGain1[threadIdx.x] = + shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1]; + shr_hasSwitchToGain0_tmp[threadIdx.x] = + shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1]; + + hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x]; + hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x]; + + // set only for the threadIdx.x corresponding to sample==0 + check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x]; + + shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1]; + isSaturated[ch] = shr_isSaturated[threadIdx.x + 3]; + } + + // TODO: w/o this sync, there is a race + // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample + // check if we can remove it + __syncthreads(); + + // TODO: divergent branch + if (gainId == 0 || gainId == 3) { + pedestal = mean_x1[hashedId]; + gainratio = gain6Over1[hashedId] * gain12Over6[hashedId]; + gainsNoise[ch](sample) = 2; + } else if (gainId == 1) { + pedestal = mean_x12[hashedId]; + gainratio = 1.; + gainsNoise[ch](sample) = 0; + } else if (gainId == 2) { + pedestal = mean_x6[hashedId]; + gainratio = gain12Over6[hashedId]; + gainsNoise[ch](sample) = 1; + } + + // TODO: compile time constant -> branch should be non-divergent + if (dynamicPedestal) + amplitude = static_cast(adc) * gainratio; + else + amplitude = (static_cast(adc) - pedestal) * gainratio; + amplitudes[ch][sample] = amplitude; + +#ifdef ECAL_RECO_CUDA_DEBUG + printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio); + if (adc == 0) + printf("adc is zero\n"); +#endif + + // + // initialization + // + amplitudesForMinimization[ch](sample) = 0; + bxs[ch](sample) = sample - 5; + + // select the thread for the max sample + //---> hardcoded above to be 5th sample, ok + if (sample == sample_max) { + // + // initialization + // + acState[ch] = static_cast(MinimizationState::NotFinished); + energies[ch] = 0; + chi2[ch] = 0; + g_pedestal[ch] = 0; + uint32_t flag = 0; + dids_out[ch] = did.rawId(); + + // start of this channel in shared mem + int const chStart = threadIdx.x - sample_max; + // thread for the max sample in shared mem + int const threadMax = threadIdx.x; + auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE; + + // this flag setting is applied to all of the cases + if (shr_hasSwitchToGain6[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6; + if (shr_hasSwitchToGain1[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1; + + // this corresponds to cpu branching on lastSampleBeforeSaturation + // likely false + if (check_hasSwitchToGain0) { + // assign for the case some sample having gainId == 0 + //energies[ch] = amplitudes[ch][sample_max]; + energies[ch] = amplitude; + + // check if samples before sample_max have true + bool saturated_before_max = false; +#pragma unroll + for (char ii = 0; ii < 5; ii++) + saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii]; + + // if saturation is in the max sample and not in the first 5 + if (!saturated_before_max && shr_hasSwitchToGain0[threadMax]) + energies[ch] = 49140; // 4095 * 12 + //---- AM FIXME : no pedestal subtraction??? + //It should be "(4095. - pedestal) * gainratio" + + // set state flag to terminate further processing of this channel + acState[ch] = static_cast(MinimizationState::Precomputed); + flag |= 0x1 << EcalUncalibratedRecHit::kSaturated; + flags[ch] = flag; + return; + } + + // according to cpu version + // auto max_amplitude = amplitudes[ch][sample_max]; + auto const max_amplitude = amplitude; + // according to cpu version + auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7]; + // note, no syncing as the same thread will be accessing here + bool hasGainSwitch = + shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3]; + + // pedestal is final unconditionally + g_pedestal[ch] = pedestal; + if (hasGainSwitch && gainSwitchUseMaxSample) { + // thread for sample=0 will access the right guys + energies[ch] = max_amplitude / shape_value; + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[ch] = flag; + return; + } + + // this happens cause sometimes rms_x12 is 0... + // needs to be checkec why this is the case + // general case here is that noisecov is a Zero matrix + if (rmsForChecking == 0) { + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[ch] = flag; + return; + } + + // for the case when no shortcuts were taken + flags[ch] = flag; + } + } + } + + /// + /// assume kernel launch configuration is + /// ([MAXSAMPLES, MAXSAMPLES], nchannels) + /// + __global__ void kernel_prep_2d(SampleGainVector const* gainNoise, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* gain12Over6, + float const* gain6Over1, + double const* G12SamplesCorrelationEB, + double const* G6SamplesCorrelationEB, + double const* G1SamplesCorrelationEB, + double const* G12SamplesCorrelationEE, + double const* G6SamplesCorrelationEE, + double const* G1SamplesCorrelationEE, + SampleMatrix* noisecov, + PulseMatrixType* pulse_matrix, + EcalPulseShape const* pulse_shape, + bool const* hasSwitchToGain6, + bool const* hasSwitchToGain1, + bool const* isSaturated, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs) { + int const ch = blockIdx.x; + int const tx = threadIdx.x; + int const ty = threadIdx.y; + constexpr float addPedestalUncertainty = 0.f; + constexpr bool dynamicPedestal = false; + constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true + + // to access input arrays (ids and digis only) + int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + + bool tmp0 = hasSwitchToGain6[ch]; + bool tmp1 = hasSwitchToGain1[ch]; + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE; + auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE; + auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE; + bool tmp2 = isSaturated[ch]; + bool hasGainSwitch = tmp0 || tmp1 || tmp2; + auto const vidx = ecal::abs(ty - tx); + + // non-divergent branch for all threads per block + if (hasGainSwitch) { + // TODO: did not include simplified noise model + float noise_value = 0; + + // non-divergent branch - all threads per block + // TODO: all of these constants indicate that + // that these parts could be splitted into completely different + // kernels and run one of them only depending on the config + if (simplifiedNoiseModelForGainSwitch) { + int isample_max = 5; // according to cpu defs + int gainidx = gainNoise[ch][isample_max]; + + // non-divergent branches + if (gainidx == 0) + //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx); + noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx]; + if (gainidx == 1) + // noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch] + // *noisecorrs[1](ty, tx); + noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] * + G6SamplesCorrelation[vidx]; + if (gainidx == 2) + // noise_value = gain12Over6[ch]*gain12Over6[ch] + // * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch] + // * noisecorrs[2](ty, tx); + noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] * + rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx]; + if (!dynamicPedestal && addPedestalUncertainty > 0.f) + noise_value += addPedestalUncertainty * addPedestalUncertainty; + } else { + int gainidx = 0; + char mask = gainidx; + int pedestal = gainNoise[ch][ty] == mask ? 1 : 0; + // noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch] + // *pedestal*noisecorrs[0](ty, tx); + noise_value += + /* gainratio is 1*/ rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += /* gainratio is 1 */ + addPedestalUncertainty * addPedestalUncertainty * pedestal; + } + + // + gainidx = 1; + mask = gainidx; + pedestal = gainNoise[ch][ty] == mask ? 1 : 0; + // noise_value += gain12Over6[ch]*gain12Over6[ch] + // *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx); + noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] * + pedestal * G6SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty * + addPedestalUncertainty * pedestal; + } + + // + gainidx = 2; + mask = gainidx; + pedestal = gainNoise[ch][ty] == mask ? 1 : 0; + float tmp = gain6Over1[hashedId] * gain12Over6[hashedId]; + // noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch] + // *pedestal*noisecorrs[2](ty, tx); + noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal; + } + } + + noisecov[ch](ty, tx) = noise_value; + } else { + auto rms = rms_x12[hashedId]; + float noise_value = rms * rms * G12SamplesCorrelation[vidx]; + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + //---- add fully correlated component to noise covariance to inflate pedestal uncertainty + noise_value += addPedestalUncertainty * addPedestalUncertainty; + } + noisecov[ch](ty, tx) = noise_value; + } + + // pulse matrix + // int const bx = tx - 5; // -5 -4 -3 ... 3 4 + // int bx = (*bxs)(tx); + // int const offset = 7 - 3 - bx; + int const posToAccess = 9 - tx + ty; // see cpu for reference + float const value = posToAccess >= 7 ? pulse_shape[hashedId].pdfval[posToAccess - 7] : 0; + pulse_matrix[ch](ty, tx) = value; + } + + __global__ void kernel_permute_results(SampleVector* amplitudes, + BXVectorType const* activeBXs, + ::ecal::reco::StorageScalarType* energies, + char const* acState, + int const nchannels) { + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const tx = threadIdx.x + blockIdx.x * blockDim.x; + int const ch = tx / nsamples; + int const iii = tx % nsamples; // this is to address activeBXs + + if (ch >= nchannels) + return; + + // channels that have amplitude precomputed do not need results to be permuted + auto const state = static_cast(acState[ch]); + if (state == MinimizationState::Precomputed) + return; + + // configure shared memory and cp into it + extern __shared__ char smem[]; + SampleVector::Scalar* values = reinterpret_cast(smem); + values[threadIdx.x] = amplitudes[ch](iii); + __syncthreads(); + + // get the sample for this bx + auto const sample = static_cast(activeBXs[ch](iii)) + 5; + + // store back to global + amplitudes[ch](sample) = values[threadIdx.x]; + + // store sample 5 separately + // only for the case when minimization was performed + // not for cases with precomputed amplitudes + if (sample == 5) + energies[ch] = values[threadIdx.x]; + } + +/// +/// Build an Ecal RecHit. +/// TODO: Use SoA data structures on the host directly +/// the reason for removing this from minimize kernel is to isolate the minimize + +/// again, building an aos rec hit involves strides... -> bad memory access pattern +/// +#ifdef RUN_BUILD_AOS_RECHIT + __global__ void kernel_build_rechit( + float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < nchannels) { + rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], 0, 0, chi2s[idx], 0}; + } + } +#endif + + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h new file mode 100644 index 0000000000000..3e813b529a2ad --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h @@ -0,0 +1,98 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h + +#include "Common.h" +#include "DeclsForKernels.h" +#include "EigenMatrixTypes_gpu.h" + +class EcalPulseShape; +// this flag setting is applied to all of the cases +class EcalPulseCovariance; +class EcalUncalibratedRecHit; + +namespace ecal { + namespace multifit { + + /// + /// assume kernel launch configuration is + /// (MAXSAMPLES * nchannels, blocks) + /// TODO: is there a point to split this kernel further to separate reductions + /// + __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in, + uint16_t const* digis_in_eb, + uint32_t const* dids_eb, + uint16_t const* digis_in_ee, + uint32_t const* dids_ee, + SampleVector* amplitudes, + SampleVector* amplitudesForMinimization, + SampleGainVector* gainsNoise, + float const* mean_x1, + float const* mean_x12, + float const* rms_x12, + float const* mean_x6, + float const* gain6Over1, + float const* gain12Over6, + bool* hasSwitchToGain6, + bool* hasSwitchToGain1, + bool* isSaturated, + ::ecal::reco::StorageScalarType* energies, + ::ecal::reco::StorageScalarType* chi2, + ::ecal::reco::StorageScalarType* pedestal, + uint32_t* dids_out, + uint32_t* flags, + char* acState, + BXVectorType* bxs, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + bool const gainSwitchUseMaxSampleEB, + bool const gainSwitchUseMaxSampleEE, + int const nchannels); + + /// + /// assume kernel launch configuration is + /// ([MAXSAMPLES, MAXSAMPLES], nchannels) + /// + __global__ void kernel_prep_2d(SampleGainVector const* gainNoise, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* gain12Over6, + float const* gain6Over1, + double const* G12SamplesCorrelationEB, + double const* G6SamplesCorrelationEB, + double const* G1SamplesCorrelationEB, + double const* G12SamplesCorrelationEE, + double const* G6SamplesCorrelationEE, + double const* G1SamplesCorrelationEE, + SampleMatrix* noisecov, + PulseMatrixType* pulse_matrix, + EcalPulseShape const* pulse_shape, + bool const* hasSwitchToGain6, + bool const* hasSwitchToGain1, + bool const* isSaturated, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs); + + __global__ void kernel_permute_results(SampleVector* amplitudes, + BXVectorType const* activeBXs, + ::ecal::reco::StorageScalarType* energies, + char const* acState, + int const nchannels); + +/// +/// Build an Ecal RecHit. +/// TODO: Use SoA data structures on the host directly +/// the reason for removing this from minimize kernel is to isolate the minimize + +/// again, building an aos rec hit involves strides... -> bad memory access pattern +/// +#ifdef RUN_BUILD_AOS_RECHIT + __global__ void kernel_build_rechit( + float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels); +#endif + + } // namespace multifit +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu new file mode 100644 index 0000000000000..42c0b8aab85db --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu @@ -0,0 +1,400 @@ +#include +#include + +#include + +#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h" +#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_log.h" + +#include "KernelHelpers.h" +#include "AmplitudeComputationKernels.h" +#include "AmplitudeComputationCommonKernels.h" + +namespace ecal { + namespace multifit { + + void eigen_solve_submatrix(SampleMatrix& mat, SampleVector& invec, SampleVector& outvec, unsigned NP) { + using namespace Eigen; + switch (NP) { // pulse matrix is always square. + case 10: { + Matrix temp = mat.topLeftCorner<10, 10>(); + outvec.head<10>() = temp.ldlt().solve(invec.head<10>()); + break; + } + case 9: { + Matrix temp = mat.topLeftCorner<9, 9>(); + outvec.head<9>() = temp.ldlt().solve(invec.head<9>()); + break; + } + case 8: { + Matrix temp = mat.topLeftCorner<8, 8>(); + outvec.head<8>() = temp.ldlt().solve(invec.head<8>()); + break; + } + case 7: { + Matrix temp = mat.topLeftCorner<7, 7>(); + outvec.head<7>() = temp.ldlt().solve(invec.head<7>()); + break; + } + case 6: { + Matrix temp = mat.topLeftCorner<6, 6>(); + outvec.head<6>() = temp.ldlt().solve(invec.head<6>()); + break; + } + case 5: { + Matrix temp = mat.topLeftCorner<5, 5>(); + outvec.head<5>() = temp.ldlt().solve(invec.head<5>()); + break; + } + case 4: { + Matrix temp = mat.topLeftCorner<4, 4>(); + outvec.head<4>() = temp.ldlt().solve(invec.head<4>()); + break; + } + case 3: { + Matrix temp = mat.topLeftCorner<3, 3>(); + outvec.head<3>() = temp.ldlt().solve(invec.head<3>()); + break; + } + case 2: { + Matrix temp = mat.topLeftCorner<2, 2>(); + outvec.head<2>() = temp.ldlt().solve(invec.head<2>()); + break; + } + case 1: { + Matrix temp = mat.topLeftCorner<1, 1>(); + outvec.head<1>() = temp.ldlt().solve(invec.head<1>()); + break; + } + default: + return; + } + } + + template + __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance, + MatrixType& inverse_cov, + SampleVector const& amplitudes) { + constexpr int nsamples = SampleVector::RowsAtCompileTime; + constexpr int npulses = BXVectorType::RowsAtCompileTime; + +#pragma unroll + for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) { + auto const amplitude = amplitudes.coeff(ipulse); + if (amplitude == 0) + continue; + + // FIXME: ipulse - 5 -> ipulse - firstOffset + int bx = ipulse - 5; + int first_sample_t = std::max(0, bx + 3); + int offset = -3 - bx; + + auto const value_sq = amplitude * amplitude; + + for (int col = first_sample_t; col < nsamples; col++) { + for (int row = col; row < nsamples; row++) { + inverse_cov(row, col) += value_sq * __ldg(&pulse_covariance.covval[row + offset][col + offset]); + } + } + } + + return true; + } + + /// + /// launch ctx parameters are (nchannels / block, blocks) + /// TODO: trivial impl for now, there must be a way to improve + /// + /// Conventions: + /// - amplitudes -> solution vector, what we are fitting for + /// - samples -> raw detector responses + /// - passive constraint - satisfied constraint + /// - active constraint - unsatisfied (yet) constraint + /// + __global__ void kernel_minimize(uint32_t const* dids_eb, + uint32_t const* dids_ee, + SampleMatrix const* __restrict__ noisecov, + EcalPulseCovariance const* __restrict__ pulse_covariance, + BXVectorType* bxs, + SampleVector const* __restrict__ samples, + SampleVector* amplitudes, + PulseMatrixType const* __restrict__ pulse_matrix, + ::ecal::reco::StorageScalarType* chi2s, + ::ecal::reco::StorageScalarType* energies, + char* acState, + int nchannels, + int max_iterations, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs) { + // FIXME: ecal has 10 samples and 10 pulses.... + // but this needs to be properly treated and renamed everywhere + constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime; + constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime; + static_assert(NSAMPLES == NPULSES); + + using DataType = SampleVector::Scalar; + + extern __shared__ char shrmem[]; + DataType* shrMatrixLForFnnlsStorage = + reinterpret_cast(shrmem) + MapSymM::total * threadIdx.x; + DataType* shrAtAStorage = + reinterpret_cast(shrmem) + MapSymM::total * (threadIdx.x + blockDim.x); + + // FIXME: remove eitehr idx or ch -> they are teh same thing + int idx = threadIdx.x + blockDim.x * blockIdx.x; + auto const ch = idx; + if (idx < nchannels) { + if (static_cast(acState[idx]) == MinimizationState::Precomputed) + return; + + // get the hash + int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + + // inits + int iter = 0; + int npassive = 0; + + ColumnVector pulseOffsets; +#pragma unroll + for (int i = 0; i < NPULSES; ++i) + pulseOffsets(i) = i; + + ColumnVector resultAmplitudes; +#pragma unroll + for (int counter = 0; counter < NPULSES; counter++) + resultAmplitudes(counter) = 0; + + // inits + //SampleDecompLLT covariance_decomposition; + //SampleMatrix inverse_cov; + SampleVector::Scalar chi2 = 0, chi2_now = 0; + + // loop until ocnverge + while (true) { + if (iter >= max_iterations) + break; + + //inverse_cov = noisecov[idx]; + //DataType covMatrixStorage[MapSymM::total]; + DataType* covMatrixStorage = shrMatrixLForFnnlsStorage; + MapSymM covMatrix{covMatrixStorage}; + int counter = 0; +#pragma unroll + for (int col = 0; col < NSAMPLES; col++) +#pragma unroll + for (int row = col; row < NSAMPLES; row++) + covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col)); + + update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes); + + // compute actual covariance decomposition + //covariance_decomposition.compute(inverse_cov); + //auto const& matrixL = covariance_decomposition.matrixL(); + DataType matrixLStorage[MapSymM::total]; + MapSymM matrixL{matrixLStorage}; + compute_decomposition_unrolled(matrixL, covMatrix); + + // L * A = P + ColMajorMatrix A; + solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL); + + // L b = s + float reg_b[NSAMPLES]; + solve_forward_subst_vector(reg_b, samples[idx], matrixL); + + // FIXME: shared mem + //DataType AtAStorage[MapSymM::total]; + MapSymM AtA{shrAtAStorage}; + //SampleMatrix AtA; + SampleVector Atb; +#pragma unroll + for (int icol = 0; icol < NPULSES; icol++) { + float reg_ai[NSAMPLES]; + +// load column icol +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + reg_ai[counter] = A(counter, icol); + + // compute diagoanl + float sum = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum += reg_ai[counter] * reg_ai[counter]; + + // store + AtA(icol, icol) = sum; + +// go thru the other columns +#pragma unroll + for (int j = icol + 1; j < NPULSES; j++) { + // load column j + float reg_aj[NSAMPLES]; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + reg_aj[counter] = A(counter, j); + + // accum + float sum = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum += reg_aj[counter] * reg_ai[counter]; + + // store + //AtA(icol, j) = sum; + AtA(j, icol) = sum; + } + + // Atb accum + float sum_atb = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum_atb += reg_ai[counter] * reg_b[counter]; + + // store atb + Atb(icol) = sum_atb; + } + + // FIXME: shared mem + //DataType matrixLForFnnlsStorage[MapSymM::total]; + MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage}; + + fnnls(AtA, + Atb, + //amplitudes[idx], + resultAmplitudes, + npassive, + pulseOffsets, + matrixLForFnnls, + 1e-11, + 500); + + { + DataType accum[NSAMPLES]; +// load accum +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + accum[counter] = -samples[idx](counter); + + // iterate + for (int icol = 0; icol < NPULSES; icol++) { + DataType pm_col[NSAMPLES]; + +// preload a column of pulse matrix +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + pm_col[counter] = __ldg(&pulse_matrix[idx].coeffRef(counter, icol)); + +// accum +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + accum[counter] += resultAmplitudes[icol] * pm_col[counter]; + } + + DataType reg_L[NSAMPLES]; + DataType accumSum = 0; + +// preload a column and load column 0 of cholesky +#pragma unroll + for (int i = 0; i < NSAMPLES; i++) + reg_L[i] = matrixL(i, 0); + + // compute x0 and store it + auto x_prev = accum[0] / reg_L[0]; + accumSum += x_prev * x_prev; + +// iterate +#pragma unroll + for (int iL = 1; iL < NSAMPLES; iL++) { +// update accum +#pragma unroll + for (int counter = iL; counter < NSAMPLES; counter++) + accum[counter] -= x_prev * reg_L[counter]; + +// load the next column of cholesky +#pragma unroll + for (int counter = iL; counter < NSAMPLES; counter++) + reg_L[counter] = matrixL(counter, iL); + + // compute the next x for M(iL, icol) + x_prev = accum[iL] / reg_L[iL]; + + // store teh result value + accumSum += x_prev * x_prev; + } + + chi2_now = accumSum; + } + + auto deltachi2 = chi2_now - chi2; + chi2 = chi2_now; + + if (ecal::abs(deltachi2) < 1e-3) + break; + + //---- AM: TEST + //---- it was 3 lines above, now here as in the CPU version + ++iter; + } + + // store to global output values + // FIXME: amplitudes are used in global directly + chi2s[idx] = chi2; + energies[idx] = resultAmplitudes(5); + +#pragma unroll + for (int counter = 0; counter < NPULSES; counter++) + amplitudes[idx](counter) = resultAmplitudes(counter); + } + } + + namespace v1 { + + void minimization_procedure(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + EventDataForScratchGPU& scratch, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + cudaStream_t cudaStream) { + using DataType = SampleVector::Scalar; + unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis; + // unsigned int threads_min = conf.threads.x; + // TODO: configure from python + unsigned int threads_min = configParameters.kernelMinimizeThreads[0]; + unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min; + uint32_t const offsetForHashes = conditions.offsetForHashes; + uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; + auto const nbytesShared = + 2 * threads_min * MapSymM::total * sizeof(DataType); + kernel_minimize<<>>( + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + scratch.noisecov, + conditions.pulseCovariances.values, + scratch.activeBXs, + scratch.samples, + (SampleVector*)eventOutputGPU.amplitudesAll, + scratch.pulse_matrix, + eventOutputGPU.chi2, + eventOutputGPU.amplitude, + scratch.acState, + totalChannels, + 50, + offsetForHashes, + offsetForInputs); + cudaCheck(cudaGetLastError()); + } + + } // namespace v1 + + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h new file mode 100644 index 0000000000000..b8202f75b653b --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h @@ -0,0 +1,29 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h + +#include "Common.h" +#include "DeclsForKernels.h" +#include "EigenMatrixTypes_gpu.h" + +class EcalPulseShape; +class EcalPulseCovariance; +class EcalUncalibratedRecHit; + +namespace ecal { + namespace multifit { + + namespace v1 { + + void minimization_procedure(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + EventDataForScratchGPU& scratch, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + cudaStream_t cudaStream); + + } + + } // namespace multifit +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml index 3b1d2c0cf159d..61eed4689fd20 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml @@ -1,21 +1,22 @@ - - - - - - - - + + + + - - - + + + + + + + - + + diff --git a/RecoLocalCalo/EcalRecProducers/plugins/Common.h b/RecoLocalCalo/EcalRecProducers/plugins/Common.h new file mode 100644 index 0000000000000..4e7afbd0819db --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/Common.h @@ -0,0 +1,46 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_Common_h +#define RecoLocalCalo_EcalRecProducers_plugins_Common_h + +#include +#include +#include +#include +#include + +#include +#include + +// a workaround for std::abs not being a constexpr function +namespace ecal { + + template + constexpr T abs(T const& value) { + return ::std::max(value, -value); + } + + // temporary + namespace mgpa { + + constexpr int adc(uint16_t sample) { return sample & 0xfff; } + constexpr int gainId(uint16_t sample) { return (sample >> 12) & 0x3; } + + } // namespace mgpa + +} // namespace ecal + +template +struct DurationMeasurer { + DurationMeasurer(std::string const& msg) : msg_{msg}, start_{std::chrono::high_resolution_clock::now()} {} + + ~DurationMeasurer() { + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start_).count(); + std::cout << msg_ << "\nduration = " << duration << std::endl; + } + +private: + std::string msg_; + std::chrono::high_resolution_clock::time_point start_; +}; + +#endif // RecoLocalCalo_EcalRecProducers_plugins_Common_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h new file mode 100644 index 0000000000000..1ea0fcc430882 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h @@ -0,0 +1,333 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h + +#include + +#include +#include + +#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" +#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" +#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h" +#include "CondFormats/EcalObjects/interface/EcalGainRatios.h" +#include "CondFormats/EcalObjects/interface/EcalPedestals.h" +#include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h" +#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h" +#include "CondFormats/EcalObjects/interface/EcalWeightSet.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h" + +#include "EigenMatrixTypes_gpu.h" + +struct EcalPulseShape; +class EcalSampleMask; +class EcalTimeBiasCorrections; +struct EcalPulseCovariance; +class EcalDigiCollection; +class EcalXtalGroupId; +class EcalSamplesCorrelation; +class EBDigiCollection; +class EEDigiCollection; + +namespace ecal { + namespace multifit { + + enum class TimeComputationState : char { NotFinished = 0, Finished = 1 }; + enum class MinimizationState : char { + NotFinished = 0, + Finished = 1, + Precomputed = 2, + }; + + // + struct EventInputDataGPU { + ecal::DigisCollection const& ebDigis; + ecal::DigisCollection const& eeDigis; + }; + + // parameters have a fixed type + // Can we go by with single precision + struct ConfigurationParameters { + using type = double; + // device ptrs + type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr; + + uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE; + // device ptrs + type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr; + + type timeFitLimitsFirstEB, timeFitLimitsFirstEE; + type timeFitLimitsSecondEB, timeFitLimitsSecondEE; + + type timeConstantTermEB, timeConstantTermEE; + + type timeNconstEB, timeNconstEE; + + type amplitudeThreshEE, amplitudeThreshEB; + + type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB; + type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE; + type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE; + type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB; + + std::array kernelMinimizeThreads; + + bool shouldRunTimingComputation; + }; + + struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> { + void allocate(ConfigurationParameters const& configParameters, uint32_t size) { + cudaCheck(cudaMalloc((void**)&litudesAll, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&litude, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&pedestal, size * sizeof(::ecal::reco::StorageScalarType))); + + if (configParameters.shouldRunTimingComputation) { + cudaCheck(cudaMalloc((void**)&jitter, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&jitterError, size * sizeof(::ecal::reco::StorageScalarType))); + } + + cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&flags, size * sizeof(uint32_t))); + } + + void deallocate(ConfigurationParameters const& configParameters) { + cudaCheck(cudaFree(amplitudesAll)); + cudaCheck(cudaFree(amplitude)); + cudaCheck(cudaFree(chi2)); + cudaCheck(cudaFree(pedestal)); + if (configParameters.shouldRunTimingComputation) { + cudaCheck(cudaFree(jitter)); + cudaCheck(cudaFree(jitterError)); + } + cudaCheck(cudaFree(did)); + cudaCheck(cudaFree(flags)); + } + }; + + struct EventDataForScratchGPU { + SampleVector* samples = nullptr; + SampleGainVector* gainsNoise = nullptr; + + SampleMatrix* noisecov = nullptr; + PulseMatrixType* pulse_matrix = nullptr; + BXVectorType* activeBXs = nullptr; + char* acState = nullptr; + + bool *hasSwitchToGain6 = nullptr, *hasSwitchToGain1 = nullptr, *isSaturated = nullptr; + + SampleVector::Scalar *sample_values, *sample_value_errors; + bool* useless_sample_values; + SampleVector::Scalar* chi2sNullHypot; + SampleVector::Scalar* sum0sNullHypot; + SampleVector::Scalar* sumAAsNullHypot; + char* pedestal_nums; + SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas; + SampleVector::Scalar *accTimeMax, *accTimeWgt; + SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError; + SampleVector::Scalar *timeMax, *timeError; + TimeComputationState* tcState; + + void allocate(ConfigurationParameters const& configParameters, uint32_t size) { + cudaCheck(cudaMalloc((void**)&samples, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&gainsNoise, size * sizeof(SampleGainVector))); + + cudaCheck(cudaMalloc((void**)&noisecov, size * sizeof(SampleMatrix))); + cudaCheck(cudaMalloc((void**)&pulse_matrix, size * sizeof(PulseMatrixType))); + cudaCheck(cudaMalloc((void**)&activeBXs, size * sizeof(BXVectorType))); + cudaCheck(cudaMalloc((void**)&acState, size * sizeof(char))); + + cudaCheck(cudaMalloc((void**)&hasSwitchToGain6, size * sizeof(bool))); + cudaCheck(cudaMalloc((void**)&hasSwitchToGain1, size * sizeof(bool))); + cudaCheck(cudaMalloc((void**)&isSaturated, size * sizeof(bool))); + + if (configParameters.shouldRunTimingComputation) { + cudaCheck(cudaMalloc((void**)&sample_values, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&sample_value_errors, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&useless_sample_values, size * sizeof(bool) * EcalDataFrame::MAXSAMPLES)); + cudaCheck(cudaMalloc((void**)&chi2sNullHypot, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&sum0sNullHypot, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&sumAAsNullHypot, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&pedestal_nums, size * sizeof(char))); + + cudaCheck(cudaMalloc((void**)&tMaxAlphaBetas, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&tMaxErrorAlphaBetas, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&accTimeMax, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&accTimeWgt, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&MaxAlphaBeta, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&MaxError, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&timeMax, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&timeError, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&tcState, size * sizeof(TimeComputationState))); + } + } + + void deallocate(ConfigurationParameters const& configParameters) { + cudaCheck(cudaFree(samples)); + cudaCheck(cudaFree(gainsNoise)); + + cudaCheck(cudaFree(noisecov)); + cudaCheck(cudaFree(pulse_matrix)); + cudaCheck(cudaFree(activeBXs)); + cudaCheck(cudaFree(acState)); + + cudaCheck(cudaFree(hasSwitchToGain6)); + cudaCheck(cudaFree(hasSwitchToGain1)); + cudaCheck(cudaFree(isSaturated)); + + if (configParameters.shouldRunTimingComputation) { + cudaCheck(cudaFree(sample_values)); + cudaCheck(cudaFree(sample_value_errors)); + cudaCheck(cudaFree(useless_sample_values)); + cudaCheck(cudaFree(chi2sNullHypot)); + cudaCheck(cudaFree(sum0sNullHypot)); + cudaCheck(cudaFree(sumAAsNullHypot)); + cudaCheck(cudaFree(pedestal_nums)); + + cudaCheck(cudaFree(tMaxAlphaBetas)); + cudaCheck(cudaFree(tMaxErrorAlphaBetas)); + cudaCheck(cudaFree(accTimeMax)); + cudaCheck(cudaFree(accTimeWgt)); + cudaCheck(cudaFree(ampMaxAlphaBeta)); + cudaCheck(cudaFree(ampMaxError)); + cudaCheck(cudaFree(timeMax)); + cudaCheck(cudaFree(timeError)); + cudaCheck(cudaFree(tcState)); + } + } + }; + + // const refs products to conditions + struct ConditionsProducts { + EcalPedestalsGPU::Product const& pedestals; + EcalGainRatiosGPU::Product const& gainRatios; + EcalPulseShapesGPU::Product const& pulseShapes; + EcalPulseCovariancesGPU::Product const& pulseCovariances; + EcalSamplesCorrelationGPU::Product const& samplesCorrelation; + EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections; + EcalTimeCalibConstantsGPU::Product const& timeCalibConstants; + EcalSampleMask const& sampleMask; + EcalTimeOffsetConstant const& timeOffsetConstant; + uint32_t offsetForHashes; + }; + + //*/ + + struct xyz { + int x, y, z; + }; + + struct conf_data { + xyz threads; + bool runV1; + cudaStream_t cuStream; + }; + + } // namespace multifit +} // namespace ecal + +// +// ECAL Rechit producer +// + +namespace ecal { + namespace rechit { + + // parameters that are read in the configuration file for rechit producer + struct ConfigurationParameters { + // device ptrs + int* ChannelStatusToBeExcluded = nullptr; + uint32_t ChannelStatusToBeExcludedSize; + + bool killDeadChannels; + + bool recoverEBIsolatedChannels; + bool recoverEEIsolatedChannels; + bool recoverEBVFE; + bool recoverEEVFE; + bool recoverEBFE; + bool recoverEEFE; + + float EBLaserMIN; + float EELaserMIN; + float EBLaserMAX; + float EELaserMAX; + + // std::vector > v_DB_reco_flags; + int* expanded_v_DB_reco_flags; + uint32_t* expanded_Sizes_v_DB_reco_flags; + uint32_t* expanded_flagbit_v_DB_reco_flags; + uint32_t expanded_v_DB_reco_flagsSize; + + uint32_t flagmask; + + // + // bool shouldRunTimingComputation; + }; + + struct EventOutputDataGPU final : public ::ecal::RecHit<::ecal::Tag::ptr> { + void allocate(ConfigurationParameters const& configParameters, uint32_t size) { + // void allocate(uint32_t size) { + //---- configParameters -> needed only to decide if to save the timing information or not + + cudaCheck(cudaMalloc((void**)&energy, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&time, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&flagBits, size * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&extra, size * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t))); + } + + void deallocate(ConfigurationParameters const& configParameters) { + // void deallocate() { + //---- configParameters -> needed only to decide if to save the timing information or not + + cudaCheck(cudaFree(energy)); + cudaCheck(cudaFree(time)); + cudaCheck(cudaFree(chi2)); + cudaCheck(cudaFree(flagBits)); + cudaCheck(cudaFree(extra)); + cudaCheck(cudaFree(did)); + } + }; + + struct EventInputDataGPU { + ecal::UncalibratedRecHit const& ebUncalibRecHits; + ecal::UncalibratedRecHit const& eeUncalibRecHits; + }; + + // const refs products to conditions + struct ConditionsProducts { + EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV; + EcalIntercalibConstantsGPU::Product const& Intercalib; + EcalRechitChannelStatusGPU::Product const& ChannelStatus; + // + EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios; + EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef; + EcalLaserAlphasGPU::Product const& LaserAlphas; + EcalLinearCorrectionsGPU::Product const& LinearCorrections; + // + // + uint32_t offsetForHashes; + }; + + } // namespace rechit +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_DeclsForKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu new file mode 100644 index 0000000000000..d816414aa71a0 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.cu @@ -0,0 +1,673 @@ +#include + +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" + +#include "EcalRecHitBuilderKernels.h" +#include "KernelHelpers.h" + +namespace ecal { + namespace rechit { + + // uncalibrecHit flags + enum UncalibRecHitFlags { + kGood = -1, // channel is good (mutually exclusive with other states) setFlagBit(kGood) reset flags_ to zero + kPoorReco, // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.) + kSaturated, // saturated channel + kOutOfTime, // channel out of time + kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation + kHasSwitchToGain6, // at least one data frame is in G6 + kHasSwitchToGain1 // at least one data frame is in G1 + }; + + // recHit flags + enum RecHitFlags { + RecHitFlags_kGood = 0, // channel ok, the energy and time measurement are reliable + RecHitFlags_kPoorReco, // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2) + RecHitFlags_kOutOfTime, // the energy is available from the UncalibRecHit (sync reco), but the event is out of time + RecHitFlags_kFaultyHardware, // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy) + RecHitFlags_kNoisy, // the channel is very noisy + RecHitFlags_kPoorCalib, // the energy is available from the UncalibRecHit, but the calibration of the channel is poor + RecHitFlags_kSaturated, // saturated channel (recovery not tried) + RecHitFlags_kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation + RecHitFlags_kNeighboursRecovered, // saturated/isolated dead: energy estimated from neighbours + RecHitFlags_kTowerRecovered, // channel in TT with no data link, info retrieved from Trigger Primitive + RecHitFlags_kDead, // channel is dead and any recovery fails + RecHitFlags_kKilled, // MC only flag: the channel is killed in the real detector + RecHitFlags_kTPSaturated, // the channel is in a region with saturated TP + RecHitFlags_kL1SpikeFlag, // the channel is in a region with TP with sFGVB = 0 + RecHitFlags_kWeird, // the signal is believed to originate from an anomalous deposit (spike) + RecHitFlags_kDiWeird, // the signal is anomalous, and neighbors another anomalous signal + RecHitFlags_kHasSwitchToGain6, // at least one data frame is in G6 + RecHitFlags_kHasSwitchToGain1, // at least one data frame is in G1 + // + RecHitFlags_kUnknown // to ease the interface with functions returning flags. + }; + + // status code + enum EcalChannelStatusCode_Code { + kOk = 0, + kDAC, + kNoLaser, + kNoisy, + kNNoisy, + kNNNoisy, + kNNNNoisy, + kNNNNNoisy, + kFixedG6, + kFixedG1, + kFixedG0, + kNonRespondingIsolated, + kDeadVFE, + kDeadFE, + kNoDataNoTP + }; + + __global__ void kernel_create_ecal_rehit( + // configuration + int const* ChannelStatusToBeExcluded, + uint32_t ChannelStatusToBeExcludedSize, + bool const killDeadChannels, + bool const recoverEBIsolatedChannels, + bool const recoverEEIsolatedChannels, + bool const recoverEBVFE, + bool const recoverEEVFE, + bool const recoverEBFE, + bool const recoverEEFE, + float const EBLaserMIN, + float const EELaserMIN, + float const EBLaserMAX, + float const EELaserMAX, + // for flags setting + int const* expanded_v_DB_reco_flags, // FIXME AM: to be checked + uint32_t const* expanded_Sizes_v_DB_reco_flags, + uint32_t const* expanded_flagbit_v_DB_reco_flags, + uint32_t expanded_v_DB_reco_flagsSize, + uint32_t flagmask, + // conditions + float const* adc2gev, + float const* intercalib, + uint16_t const* status, + float const* apdpnrefs, + float const* alphas, + // input for transparency corrections + float const* p1, + float const* p2, + float const* p3, + edm::TimeValue_t const* t1, + edm::TimeValue_t const* t2, + edm::TimeValue_t const* t3, + // input for linear corrections + float const* lp1, + float const* lp2, + float const* lp3, + edm::TimeValue_t const* lt1, + edm::TimeValue_t const* lt2, + edm::TimeValue_t const* lt3, + // time, used for time dependent corrections + edm::TimeValue_t const event_time, + // input + uint32_t const* did_eb, + uint32_t const* did_ee, + ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts + ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts + ::ecal::reco::StorageScalarType const* time_eb, + ::ecal::reco::StorageScalarType const* time_ee, + ::ecal::reco::StorageScalarType const* chi2_eb, + ::ecal::reco::StorageScalarType const* chi2_ee, + uint32_t const* flags_eb, + uint32_t const* flags_ee, + // output + uint32_t* did, + ::ecal::reco::StorageScalarType* energy, // in energy [GeV] + ::ecal::reco::StorageScalarType* time, + ::ecal::reco::StorageScalarType* chi2, + uint32_t* flagBits, + uint32_t* extra, + // other + int const nchannels, + uint32_t const nChannelsBarrel, + uint32_t const offsetForHashes) { + // + // NB: energy "type_wrapper::type" most likely std::vector + // + + for (int ch = threadIdx.x + blockDim.x * blockIdx.x; ch < nchannels; ch += blockDim.x * gridDim.x) { + // int ch = threadIdx.x + blockDim.x*blockIdx.x; + + // if (ch < nchannels) { + + bool isEndcap = (ch >= nChannelsBarrel); + + int const inputCh = isEndcap ? ch - nChannelsBarrel : ch; + + uint32_t const* didCh = isEndcap ? did_ee : did_eb; + + // only two values, EB or EE + // AM : FIXME : why not using "isBarrel" ? isBarrel ? adc2gev[0] : adc2gev[1] + float adc2gev_to_use = isEndcap ? adc2gev[1] // ee + : adc2gev[0]; // eb + + // first EB and then EE + + ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb; + + //::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb; + + ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? chi2_ee : chi2_eb; + + uint32_t const* flags_in = isEndcap ? flags_ee : flags_eb; + + // simple copy + did[ch] = didCh[inputCh]; + + auto const did_to_use = DetId{didCh[inputCh]}; + + auto const isBarrel = did_to_use.subdetId() == EcalBarrel; + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId()); + + float const intercalib_to_use = intercalib[hashedId]; + + // get laser coefficient + float lasercalib = 1.; + + // + // AM: ideas + // + // One possibility is to create the map of laser corrections once on CPU + // for all crystals and push them on GPU. + // Then only if the LS is different, update the laser correction + // The variation within a LS is not worth pursuing (<< 0.1% !!) + // and below the precision we can claim on the laser corrections (right?). + // This will save quite some time (also for the CPU version?) + // + + int iLM = 1; + + if (isBarrel) { + iLM = ecal::reconstruction::laser_monitoring_region_EB(did_to_use.rawId()); + } else { + iLM = ecal::reconstruction::laser_monitoring_region_EE(did_to_use.rawId()); + } + + long long t_i = 0, t_f = 0; + float p_i = 0, p_f = 0; + long long lt_i = 0, lt_f = 0; + float lp_i = 0, lp_f = 0; + + // laser + if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) { + t_i = t1[iLM - 1]; + t_f = t2[iLM - 1]; + p_i = p1[hashedId]; + p_f = p2[hashedId]; + } else if (event_time >= t2[iLM - 1] && event_time <= t3[iLM - 1]) { + t_i = t2[iLM - 1]; + t_f = t3[iLM - 1]; + p_i = p2[hashedId]; + p_f = p3[hashedId]; + } else if (event_time < t1[iLM - 1]) { + t_i = t1[iLM - 1]; + t_f = t2[iLM - 1]; + p_i = p1[hashedId]; + p_f = p2[hashedId]; + + } else if (event_time > t3[iLM - 1]) { + t_i = t2[iLM - 1]; + t_f = t3[iLM - 1]; + p_i = p2[hashedId]; + p_f = p3[hashedId]; + } + + // linear corrections + if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) { + lt_i = lt1[iLM - 1]; + lt_f = lt2[iLM - 1]; + lp_i = lp1[hashedId]; + lp_f = lp2[hashedId]; + } else if (event_time >= lt2[iLM - 1] && event_time <= lt3[iLM - 1]) { + lt_i = lt2[iLM - 1]; + lt_f = lt3[iLM - 1]; + lp_i = lp2[hashedId]; + lp_f = lp3[hashedId]; + } else if (event_time < lt1[iLM - 1]) { + lt_i = lt1[iLM - 1]; + lt_f = lt2[iLM - 1]; + lp_i = lp1[hashedId]; + lp_f = lp2[hashedId]; + + } else if (event_time > lt3[iLM - 1]) { + lt_i = lt2[iLM - 1]; + lt_f = lt3[iLM - 1]; + lp_i = lp2[hashedId]; + lp_f = lp3[hashedId]; + } + + // apdpnref and alpha + float apdpnref = apdpnrefs[hashedId]; + float alpha = alphas[hashedId]; + + // now calculate transparency correction + if (apdpnref != 0 && (t_i - t_f) != 0 && (lt_i - lt_f) != 0) { + long long tt = event_time; // never subtract two unsigned! + float interpolatedLaserResponse = + p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i)); + + float interpolatedLinearResponse = + lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i)); // FIXED BY FC + + if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) { + interpolatedLinearResponse = 1.f; + } + if (interpolatedLaserResponse <= 0.) { + // AM : how the heck is it possible? + // interpolatedLaserResponse = 0.0001; + lasercalib = 1.; + + } else { + float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse; + + // ... and now this: + lasercalib = 1.f / (std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse); + } + } + + // + // Check for channels to be excluded from reconstruction + // + // + // Default energy? Not to be updated if "ChannelStatusToBeExcluded" + // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat" + // + energy[ch] = -1; //---- AM: default, un-physical, ok + + // truncate the chi2 + if (chi2_in[inputCh] > 64) + chi2[ch] = 64; + else + chi2[ch] = chi2_in[inputCh]; + + // default values for the flags + flagBits[ch] = 0; + extra[ch] = 0; + + static const int chStatusMask = 0x1f; + // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same + int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask); + if (ChannelStatusToBeExcludedSize != 0) { + bool skip_this_channel = false; + for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) { + if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) { + skip_this_channel = true; + break; + } + } + if (skip_this_channel) { + // skip this channel + continue; + } + } + + // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word + + // + // AM: get the smaller "flagbit_counter" with match + // + + uint32_t temporary_flagBits = 0; + + int iterator_flags = 0; + bool need_to_exit = false; + int flagbit_counter = 0; + while (!need_to_exit) { + iterator_flags = 0; + for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) { + // check the correct "flagbit" + if (expanded_flagbit_v_DB_reco_flags[i] == flagbit_counter) { + for (unsigned int j = 0; j < expanded_Sizes_v_DB_reco_flags[i]; j++) { + if (expanded_v_DB_reco_flags[iterator_flags] == dbstatus) { + temporary_flagBits = 0x1 << expanded_flagbit_v_DB_reco_flags[i]; + need_to_exit = true; + break; // also from the big loop!!! + } + iterator_flags++; + } + } else { + // if not, got to the next bunch directly + iterator_flags += expanded_Sizes_v_DB_reco_flags[i]; + } + + if (need_to_exit) { + break; + } + } + flagbit_counter += 1; + } + + flagBits[ch] = temporary_flagBits; + + if ((flagmask & temporary_flagBits) && killDeadChannels) { + // skip this channel + continue; + } + + // + // multiply the adc counts with factors to get GeV + // + + // energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ; + energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib; + + // Time is not saved so far, FIXME + // time[ch] = time_in[inputCh]; + + // NB: calculate the "flagBits extra" --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ... + + // + // extra packing ... + // + + uint32_t offset; + uint32_t width; + uint32_t value; + + float chi2_temp = chi2[ch]; + if (chi2_temp > 64) + chi2_temp = 64; + // use 7 bits + uint32_t rawChi2 = lround(chi2_temp / 64. * ((1 << 7) - 1)); + + offset = 0; + width = 7; + value = 0; + + uint32_t mask = ((1 << width) - 1) << offset; + value &= ~mask; + value |= (rawChi2 & ((1U << width) - 1)) << offset; + + // extra[ch] = value; + // + + // rawEnergy is actually "error" !!! + uint32_t rawEnergy = 0; + + // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA + // if you want to store this in "extra", we need first to add it to the uncalibrecHit results + // then it will be something like the following + // amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib + // + // + + float amplitudeError_ch = 0.; // amplitudeError[ch]; + + if (amplitudeError_ch > 0.001) { + // uint16_t exponent = getPower10(amplitudeError_ch); + + static constexpr float p10[] = {1.e-2f, 1.e-1f, 1.f, 1.e1f, 1.e2f, 1.e3f, 1.e4f, 1.e5f, 1.e6f}; + int b = amplitudeError_ch < p10[4] ? 0 : 5; + for (; b < 9; ++b) + if (amplitudeError_ch < p10[b]) + break; + + uint16_t exponent = b; + + static constexpr float ip10[] = {1.e5f, 1.e4f, 1.e3f, 1.e2f, 1.e1f, 1.e0f, 1.e-1f, 1.e-2f, 1.e-3f, 1.e-4}; + uint16_t significand = lround(amplitudeError_ch * ip10[exponent]); + // use 13 bits (3 exponent, 10 significand) + rawEnergy = exponent << 10 | significand; + } + + offset = 8; + width = 13; + // value from last change, ok + + mask = ((1 << width) - 1) << offset; + value &= ~mask; + value |= (rawEnergy & ((1U << width) - 1)) << offset; + + uint32_t jitterErrorBits = 0; + jitterErrorBits = jitterErrorBits & 0xFF; + + offset = 24; + width = 8; + // value from last change, ok + + mask = ((1 << width) - 1) << offset; + value &= ~mask; + value |= (jitterErrorBits & ((1U << width) - 1)) << offset; + + // + // now finally set "extra[ch]" + // + extra[ch] = value; + + // + // additional flags setting + // + // using correctly the flags as calculated at the UncalibRecHit stage + // + // Now fill flags + + bool good = true; + + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kLeadingEdgeRecovered))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kLeadingEdgeRecovered)); + good = false; + } + + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) { + // leading edge recovery failed - still keep the information + // about the saturation and do not flag as dead + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated)); + good = false; + } + + // + // AM: why do we have two tests one after the other checking almost the same thing??? + // Please clean up the code, ... also the original one! + // + // uncalibRH.isSaturated() ---> + // + // bool EcalUncalibratedRecHit::isSaturated() const { + // return EcalUncalibratedRecHit::checkFlag(kSaturated); + // } + // + // + + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated)); + good = false; + } + + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kOutOfTime))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime)); + good = false; + } + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kPoorReco))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco)); + good = false; + } + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain6))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6)); + } + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain1))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1)); + } + + if (good) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kGood)); + } + + if (isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); + } + if (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX)) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); + } + + // recover, killing, and other stuff + + // + // Structure: + // EB + // EE + // + // + // - single MVA + // - democratic sharing + // - kill all the other cases + // + + bool is_Single = false; + bool is_FE = false; + bool is_VFE = false; + + bool is_recoverable = false; // DetIdToBeRecovered + + if (dbstatus == 10 || dbstatus == 11 || dbstatus == 12) { + is_recoverable = true; + } + + if (is_recoverable) { + if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) { + is_VFE = true; + } else if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) { + is_FE = true; + } else { + is_Single = true; + } + + // EB + if (isBarrel) { + if (is_Single || is_FE || is_VFE) { + // single MVA + if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) { + } + // decmocratic sharing + else if (is_FE && (recoverEBFE || !killDeadChannels)) { + } + // kill all the other cases + else { + energy[ch] = 0.; // Need to set also the flags ... + } + } + } + // EE + else { + if (is_Single || is_FE || is_VFE) { + // single MVA + if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) { + } + // decmocratic sharing + else if (is_FE && (recoverEBFE || !killDeadChannels)) { + // + // Code is definitely too long ... + // + + } + // kill all the other cases + else { + energy[ch] = 0.; // Need to set also the flags ... + } + } + } + } + + } // end channel + } + + // host version, to be called by the plugin + void create_ecal_rehit(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + // eventDataForScratchGPU_, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + uint32_t const nChannelsBarrel, + edm::TimeValue_t const event_time, + cudaStream_t cudaStream) { + int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size; + + // unsigned int nchannels_per_block = 32; + unsigned int nchannels_per_block = 16; + unsigned int threads_min = nchannels_per_block; + unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min; // TEST : to be optimized (AM) + + // + // kernel create rechit + // + + // auto const nbytesShared = 2 * threads_min * MapSymM::total * sizeof(DataType); + + kernel_create_ecal_rehit<<>>( + // kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> ( + // kernel_create_ecal_rehit <<< blocks_min, threads_min >>> ( + // configuration + configParameters.ChannelStatusToBeExcluded, + configParameters.ChannelStatusToBeExcludedSize, + configParameters.killDeadChannels, + configParameters.recoverEBIsolatedChannels, + configParameters.recoverEEIsolatedChannels, + configParameters.recoverEBVFE, + configParameters.recoverEEVFE, + configParameters.recoverEBFE, + configParameters.recoverEEFE, + configParameters.EBLaserMIN, + configParameters.EELaserMIN, + configParameters.EBLaserMAX, + configParameters.EELaserMAX, + // for flags setting + configParameters.expanded_v_DB_reco_flags, + configParameters.expanded_Sizes_v_DB_reco_flags, + configParameters.expanded_flagbit_v_DB_reco_flags, + configParameters.expanded_v_DB_reco_flagsSize, + configParameters.flagmask, + // conditions + conditions.ADCToGeV.adc2gev, + conditions.Intercalib.values, + conditions.ChannelStatus.status, + conditions.LaserAPDPNRatiosRef.values, + conditions.LaserAlphas.values, + // input for transparency corrections + conditions.LaserAPDPNRatios.p1, + conditions.LaserAPDPNRatios.p2, + conditions.LaserAPDPNRatios.p3, + conditions.LaserAPDPNRatios.t1, + conditions.LaserAPDPNRatios.t2, + conditions.LaserAPDPNRatios.t3, + // input for linear corrections + conditions.LinearCorrections.p1, + conditions.LinearCorrections.p2, + conditions.LinearCorrections.p3, + conditions.LinearCorrections.t1, + conditions.LinearCorrections.t2, + conditions.LinearCorrections.t3, + // time, used for time dependent corrections + event_time, + // input + eventInputGPU.ebUncalibRecHits.did, + eventInputGPU.eeUncalibRecHits.did, + eventInputGPU.ebUncalibRecHits.amplitude, + eventInputGPU.eeUncalibRecHits.amplitude, + eventInputGPU.ebUncalibRecHits.jitter, + eventInputGPU.eeUncalibRecHits.jitter, + eventInputGPU.ebUncalibRecHits.chi2, + eventInputGPU.eeUncalibRecHits.chi2, + eventInputGPU.ebUncalibRecHits.flags, + eventInputGPU.eeUncalibRecHits.flags, + // output + eventOutputGPU.did, + eventOutputGPU.energy, + eventOutputGPU.time, + eventOutputGPU.chi2, + eventOutputGPU.flagBits, + eventOutputGPU.extra, + // other + nchannels, + nChannelsBarrel, + conditions.offsetForHashes); + } + + } // namespace rechit + +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h new file mode 100644 index 0000000000000..831c602a24e61 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitBuilderKernels.h @@ -0,0 +1,94 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h + +// +// Builder of ECAL RecHits on GPU +// + +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" +#include "DataFormats/Provenance/interface/Timestamp.h" + +#include "Common.h" +#include "DeclsForKernels.h" + +namespace ecal { + namespace rechit { + + __global__ void kernel_create_ecal_rehit( + // configuration + int const* ChannelStatusToBeExcluded, + uint32_t ChannelStatusToBeExcludedSize, + bool killDeadChannels, + bool const recoverEBIsolatedChannels, + bool const recoverEEIsolatedChannels, + bool const recoverEBVFE, + bool const recoverEEVFE, + bool const recoverEBFE, + bool const recoverEEFE, + // for flags setting + int const* expanded_v_DB_reco_flags, + uint32_t const* expanded_Sizes_v_DB_reco_flags, + uint32_t const* expanded_flagbit_v_DB_reco_flags, + uint32_t expanded_v_DB_reco_flagsSize, + uint32_t flagmask, + // conditions + float const* adc2gev, + float const* intercalib, + uint16_t const* status, + float const* apdpnrefs, + float const* alphas, + // input for transparency corrections + float const* p1, + float const* p2, + float const* p3, + edm::TimeValue_t const* t1, + edm::TimeValue_t const* t2, + edm::TimeValue_t const* t3, + // input for linear corrections + float const* lp1, + float const* lp2, + float const* lp3, + edm::TimeValue_t const* lt1, + edm::TimeValue_t const* lt2, + edm::TimeValue_t const* lt3, + // time, used for time dependent corrections + edm::TimeValue_t const event_time, + // input + uint32_t const* did_eb, + uint32_t const* did_ee, + ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts + ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts + ::ecal::reco::StorageScalarType const* time_eb, + ::ecal::reco::StorageScalarType const* time_ee, + ::ecal::reco::StorageScalarType const* chi2_eb, + ::ecal::reco::StorageScalarType const* chi2_ee, + uint32_t const* flags_eb, + uint32_t const* flags_ee, + // output + uint32_t* did, + ::ecal::reco::StorageScalarType* energy, // in energy [GeV] + ::ecal::reco::StorageScalarType* time, + ::ecal::reco::StorageScalarType* chi2, + uint32_t* flagBits, + uint32_t* extra, + int const nchannels, + uint32_t const nChannelsBarrel, + uint32_t const offsetForHashes); + + // host version, to be called by the plugin + + void create_ecal_rehit(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + // eventDataForScratchGPU_, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + uint32_t const nChannelsBarrel, + edm::TimeValue_t const event_time, + cudaStream_t cudaStream); + + } // namespace rechit + +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_EcalRecHitBuilderKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc index 151762c6b63d3..5766fa0ffcb85 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc @@ -1,18 +1,16 @@ -// framework -#include "FWCore/Framework/interface/stream/EDProducer.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include -// algorithm specific -#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" #include "DataFormats/EcalRecHit/interface/EcalRecHit.h" #include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include +#include "Common.h" class EcalRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> { public: @@ -57,7 +55,7 @@ EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {} void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) { auto const& hRecHitsGPUEB = event.get(recHitsGPUEB_); auto const& hRecHitsGPUEE = event.get(recHitsGPUEE_); - + auto recHitsCPUEB = std::make_unique(); auto recHitsCPUEE = std::make_unique(); recHitsCPUEB->reserve(hRecHitsGPUEB.energy.size()); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index 248248a4cca88..c13d00a530feb 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -1,50 +1,33 @@ -// framework -#include "FWCore/Framework/interface/stream/EDProducer.h" - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" - -// format -#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" - -// needed for definition of flags -#include "DataFormats/EcalRecHit/interface/EcalRecHit.h" - -// the kernels -#include "RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h" - -// conditions cpu +#include "CommonTools/Utils/interface/StringToEnumValue.h" #include "CondFormats/DataRecord/interface/EcalADCToGeVConstantRcd.h" -#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h" #include "CondFormats/DataRecord/interface/EcalChannelStatusRcd.h" - +#include "CondFormats/DataRecord/interface/EcalIntercalibConstantsRcd.h" #include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRcd.h" #include "CondFormats/DataRecord/interface/EcalLaserAPDPNRatiosRefRcd.h" #include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h" #include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h" - -// conditions gpu -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" +#include "DataFormats/EcalRecHit/interface/EcalRecHit.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" - #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" -#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" - -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" - -// configuration -#include "CommonTools/Utils/interface/StringToEnumValue.h" +#include "EcalRecHitBuilderKernels.h" class EcalRecHitProducerGPU : public edm::stream::EDProducer { public: @@ -175,18 +158,16 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { configParameters_.ChannelStatusToBeExcludedSize = v_chstatus_.size(); - // call CUDA API functions only if CUDA is available edm::Service cs; if (cs and cs->enabled()) { - cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size())); cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded, v_chstatus_.data(), v_chstatus_.size() * sizeof(int), cudaMemcpyHostToDevice)); } - + // // https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc // @@ -216,33 +197,33 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { // call CUDA API functions only if CUDA is available if (cs and cs->enabled()) { // actual values - cudaCheck( - cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size())); - + cudaCheck(cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, + sizeof(int) * expanded_v_DB_reco_flags_.size())); + cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags, expanded_v_DB_reco_flags_.data(), expanded_v_DB_reco_flags_.size() * sizeof(int), cudaMemcpyHostToDevice)); - + // sizes cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags, sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size())); - + cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags, expanded_Sizes_v_DB_reco_flags_.data(), expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t), cudaMemcpyHostToDevice)); - + // keys cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags, sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size())); - + cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags, expanded_flagbit_v_DB_reco_flags_.data(), expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t), cudaMemcpyHostToDevice)); } - + configParameters_.expanded_v_DB_reco_flagsSize = expanded_flagbit_v_DB_reco_flags_.size(); flagmask_ = 0; @@ -266,7 +247,6 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { } EcalRecHitProducerGPU::~EcalRecHitProducerGPU() { - edm::Service cs; if (cs and cs->enabled()) { // free event ouput data @@ -303,7 +283,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event, if ((neb_ + nee_) > maxNumberHits_) { edm::LogError("EcalRecHitProducerGPU") << "max number of channels exceeded. See options 'maxNumberHits' "; } - + int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE // conditions diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc index 20f51ea5245df..69c6b7a67831c 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc @@ -1,18 +1,16 @@ -// framework -#include "FWCore/Framework/interface/stream/EDProducer.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include -// algorithm specific -#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" -#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" #include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include +#include "Common.h" class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> { public: diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu new file mode 100644 index 0000000000000..e79a5dff85d9f --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu @@ -0,0 +1,312 @@ +#include +#include + +#include + +#include "CondFormats/EcalObjects/interface/EcalMGPAGainRatio.h" +#include "CondFormats/EcalObjects/interface/EcalPedestals.h" +#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h" +#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" +#include "CondFormats/EcalObjects/interface/EcalSampleMask.h" +#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h" +#include "CondFormats/EcalObjects/interface/EcalXtalGroupId.h" +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" + +#include "AmplitudeComputationCommonKernels.h" +#include "AmplitudeComputationKernels.h" +#include "Common.h" +#include "EcalUncalibRecHitMultiFitAlgo_gpu_new.h" +#include "TimeComputationKernels.h" + +//#define DEBUG + +//#define ECAL_RECO_CUDA_DEBUG + +namespace ecal { + namespace multifit { + + void entryPoint(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + EventDataForScratchGPU& scratch, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + cudaStream_t cudaStream) { + using digis_type = std::vector; + using dids_type = std::vector; + // accodring to the cpu setup //----> hardcoded + bool const gainSwitchUseMaxSampleEB = true; + // accodring to the cpu setup //----> hardcoded + bool const gainSwitchUseMaxSampleEE = false; + + uint32_t const offsetForHashes = conditions.offsetForHashes; + uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; + unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis; + + // + // 1d preparation kernel + // + unsigned int nchannels_per_block = 32; + unsigned int threads_1d = 10 * nchannels_per_block; + unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 1 : (totalChannels * 10 + threads_1d - 1) / threads_1d; + int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES * + (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool)); + kernel_prep_1d_and_initialize<<>>( + conditions.pulseShapes.values, + eventInputGPU.ebDigis.data, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.data, + eventInputGPU.eeDigis.ids, + scratch.samples, + (SampleVector*)eventOutputGPU.amplitudesAll, + scratch.gainsNoise, + conditions.pedestals.mean_x1, + conditions.pedestals.mean_x12, + conditions.pedestals.rms_x12, + conditions.pedestals.mean_x6, + conditions.gainRatios.gain6Over1, + conditions.gainRatios.gain12Over6, + scratch.hasSwitchToGain6, + scratch.hasSwitchToGain1, + scratch.isSaturated, + eventOutputGPU.amplitude, + eventOutputGPU.chi2, + eventOutputGPU.pedestal, + eventOutputGPU.did, + eventOutputGPU.flags, + scratch.acState, + scratch.activeBXs, + offsetForHashes, + offsetForInputs, + gainSwitchUseMaxSampleEB, + gainSwitchUseMaxSampleEE, + totalChannels); + cudaCheck(cudaGetLastError()); + + // + // 2d preparation kernel + // + int blocks_2d = totalChannels; + dim3 threads_2d{10, 10}; + kernel_prep_2d<<>>(scratch.gainsNoise, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + conditions.pedestals.rms_x12, + conditions.pedestals.rms_x6, + conditions.pedestals.rms_x1, + conditions.gainRatios.gain12Over6, + conditions.gainRatios.gain6Over1, + conditions.samplesCorrelation.EBG12SamplesCorrelation, + conditions.samplesCorrelation.EBG6SamplesCorrelation, + conditions.samplesCorrelation.EBG1SamplesCorrelation, + conditions.samplesCorrelation.EEG12SamplesCorrelation, + conditions.samplesCorrelation.EEG6SamplesCorrelation, + conditions.samplesCorrelation.EEG1SamplesCorrelation, + scratch.noisecov, + scratch.pulse_matrix, + conditions.pulseShapes.values, + scratch.hasSwitchToGain6, + scratch.hasSwitchToGain1, + scratch.isSaturated, + offsetForHashes, + offsetForInputs); + cudaCheck(cudaGetLastError()); + + // run minimization kernels + v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream); + + if (configParameters.shouldRunTimingComputation) { + // + // TODO: this guy can run concurrently with other kernels, + // there is no dependence on the order of execution + // + unsigned int threads_time_init = threads_1d; + unsigned int blocks_time_init = blocks_1d; + int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar); + kernel_time_computation_init<<>>( + eventInputGPU.ebDigis.data, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.data, + eventInputGPU.eeDigis.ids, + conditions.pedestals.rms_x12, + conditions.pedestals.rms_x6, + conditions.pedestals.rms_x1, + conditions.pedestals.mean_x12, + conditions.pedestals.mean_x6, + conditions.pedestals.mean_x1, + conditions.gainRatios.gain12Over6, + conditions.gainRatios.gain6Over1, + scratch.sample_values, + scratch.sample_value_errors, + scratch.ampMaxError, + scratch.useless_sample_values, + scratch.pedestal_nums, + offsetForHashes, + offsetForInputs, + conditions.sampleMask.getEcalSampleMaskRecordEB(), + conditions.sampleMask.getEcalSampleMaskRecordEE(), + totalChannels); + cudaCheck(cudaGetLastError()); + + // + // TODO: small kernel only for EB. It needs to be checked if + /// fusing such small kernels is beneficial in here + // + // we are running only over EB digis + // therefore we need to create threads/blocks only for that + unsigned int const threadsFixMGPA = threads_1d; + unsigned int const blocksFixMGPA = + threadsFixMGPA > 10 * eventInputGPU.ebDigis.ndigis + ? 1 + : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) / threadsFixMGPA; + kernel_time_compute_fixMGPAslew<<>>( + eventInputGPU.ebDigis.data, + eventInputGPU.eeDigis.data, + scratch.sample_values, + scratch.sample_value_errors, + scratch.useless_sample_values, + conditions.sampleMask.getEcalSampleMaskRecordEB(), + totalChannels, + offsetForInputs); + cudaCheck(cudaGetLastError()); + + // + // + // + int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar); + auto const threads_nullhypot = threads_1d; + auto const blocks_nullhypot = blocks_1d; + kernel_time_compute_nullhypot<<>>( + scratch.sample_values, + scratch.sample_value_errors, + scratch.useless_sample_values, + scratch.chi2sNullHypot, + scratch.sum0sNullHypot, + scratch.sumAAsNullHypot, + totalChannels); + cudaCheck(cudaGetLastError()); + + unsigned int nchannels_per_block_makeratio = 10; + unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio; + unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels + ? 1 + : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio; + int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar); + kernel_time_compute_makeratio<<>>( + scratch.sample_values, + scratch.sample_value_errors, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + scratch.useless_sample_values, + scratch.pedestal_nums, + configParameters.amplitudeFitParametersEB, + configParameters.amplitudeFitParametersEE, + configParameters.timeFitParametersEB, + configParameters.timeFitParametersEE, + scratch.sumAAsNullHypot, + scratch.sum0sNullHypot, + scratch.tMaxAlphaBetas, + scratch.tMaxErrorAlphaBetas, + scratch.accTimeMax, + scratch.accTimeWgt, + scratch.tcState, + configParameters.timeFitParametersSizeEB, + configParameters.timeFitParametersSizeEE, + configParameters.timeFitLimitsFirstEB, + configParameters.timeFitLimitsFirstEE, + configParameters.timeFitLimitsSecondEB, + configParameters.timeFitLimitsSecondEE, + totalChannels, + offsetForInputs); + cudaCheck(cudaGetLastError()); + + // + // + // + auto const threads_findamplchi2 = threads_1d; + auto const blocks_findamplchi2 = blocks_1d; + int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar); + kernel_time_compute_findamplchi2_and_finish<<>>(scratch.sample_values, + scratch.sample_value_errors, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + scratch.useless_sample_values, + scratch.tMaxAlphaBetas, + scratch.tMaxErrorAlphaBetas, + scratch.accTimeMax, + scratch.accTimeWgt, + configParameters.amplitudeFitParametersEB, + configParameters.amplitudeFitParametersEE, + scratch.sumAAsNullHypot, + scratch.sum0sNullHypot, + scratch.chi2sNullHypot, + scratch.tcState, + scratch.ampMaxAlphaBeta, + scratch.ampMaxError, + scratch.timeMax, + scratch.timeError, + totalChannels, + offsetForInputs); + cudaCheck(cudaGetLastError()); + + // + // + // + auto const threads_timecorr = 32; + auto const blocks_timecorr = + threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr; + kernel_time_correction_and_finalize<<>>( + eventOutputGPU.amplitude, + eventInputGPU.ebDigis.data, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.data, + eventInputGPU.eeDigis.ids, + conditions.timeBiasCorrections.EBTimeCorrAmplitudeBins, + conditions.timeBiasCorrections.EETimeCorrAmplitudeBins, + conditions.timeBiasCorrections.EBTimeCorrShiftBins, + conditions.timeBiasCorrections.EETimeCorrShiftBins, + scratch.timeMax, + scratch.timeError, + conditions.pedestals.rms_x12, + conditions.timeCalibConstants.values, + eventOutputGPU.jitter, + eventOutputGPU.jitterError, + eventOutputGPU.flags, + conditions.timeBiasCorrections.EBTimeCorrAmplitudeBinsSize, + conditions.timeBiasCorrections.EETimeCorrAmplitudeBinsSize, + configParameters.timeConstantTermEB, + configParameters.timeConstantTermEE, + conditions.timeOffsetConstant.getEBValue(), + conditions.timeOffsetConstant.getEEValue(), + configParameters.timeNconstEB, + configParameters.timeNconstEE, + configParameters.amplitudeThreshEB, + configParameters.amplitudeThreshEE, + configParameters.outOfTimeThreshG12pEB, + configParameters.outOfTimeThreshG12pEE, + configParameters.outOfTimeThreshG12mEB, + configParameters.outOfTimeThreshG12mEE, + configParameters.outOfTimeThreshG61pEB, + configParameters.outOfTimeThreshG61pEE, + configParameters.outOfTimeThreshG61mEB, + configParameters.outOfTimeThreshG61mEE, + offsetForHashes, + offsetForInputs, + totalChannels); + cudaCheck(cudaGetLastError()); + } + + /* + cudaEventRecord(end_event, 0); + cudaEventSynchronize(end_event); + float ms; + cudaEventElapsedTime(&ms, start_event, end_event); + std::cout << "elapsed time = " << ms << std::endl; + */ + } + + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h new file mode 100644 index 0000000000000..93923e63be632 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgo_gpu_new.h @@ -0,0 +1,23 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h +#define RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h + +#include + +#include + +#include "DeclsForKernels.h" + +namespace ecal { + namespace multifit { + + void entryPoint(EventInputDataGPU const&, + EventOutputDataGPU&, + EventDataForScratchGPU&, + ConditionsProducts const&, + ConfigurationParameters const&, + cudaStream_t); + + } // namespace multifit +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_EcalUncalibRecHitMultiFitAlgo_gpu_new_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc index cdf21e3240954..5bd54787ee155 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc @@ -1,44 +1,35 @@ -// framework -#include "FWCore/Framework/interface/stream/EDProducer.h" -//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" - -// algorithm specific -#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" -#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h" - #include -#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h" - -#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" #include "CondFormats/DataRecord/interface/EcalGainRatiosRcd.h" -#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h" +#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h" #include "CondFormats/DataRecord/interface/EcalPulseCovariancesRcd.h" +#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h" +#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h" #include "CondFormats/DataRecord/interface/EcalSamplesCorrelationRcd.h" #include "CondFormats/DataRecord/interface/EcalTimeBiasCorrectionsRcd.h" #include "CondFormats/DataRecord/interface/EcalTimeCalibConstantsRcd.h" #include "CondFormats/DataRecord/interface/EcalTimeOffsetConstantRcd.h" -#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h" - -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h" +#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h" +#include "Common.h" +#include "DeclsForKernels.h" +#include "EcalUncalibRecHitMultiFitAlgo_gpu_new.h" class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer { public: @@ -290,7 +281,7 @@ void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event, if ((neb_ + nee_) > maxNumberHits_) { edm::LogError("EcalUncalibRecHitProducerGPU") << "max number of channels exceeded. See options 'maxNumberHits' "; } - + // conditions setup.get().get(pedestalsHandle_); setup.get().get(gainRatiosHandle_); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h new file mode 100644 index 0000000000000..bbf9cb0dbb5c9 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h @@ -0,0 +1,49 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h +#define RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h + +#include + +#include + +#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" + +namespace ecal { + namespace multifit { + + constexpr int SampleVectorSize = 10; + constexpr int FullSampleVectorSize = 19; + constexpr int PulseVectorSize = 12; + constexpr int NGains = 3; + + using data_type = ::ecal::reco::ComputationScalarType; + + typedef Eigen::Matrix PulseMatrixType; + typedef Eigen::Matrix BXVectorType; + using SampleMatrixD = Eigen::Matrix; + + typedef Eigen::Matrix SampleVector; + typedef Eigen::Matrix FullSampleVector; + typedef Eigen::Matrix PulseVector; + typedef Eigen::Matrix BXVector; + typedef Eigen::Matrix SampleGainVector; + typedef Eigen::Matrix SampleMatrix; + typedef Eigen::Matrix FullSampleMatrix; + typedef Eigen::Matrix PulseMatrix; + typedef Eigen::Matrix + SamplePulseMatrix; + typedef Eigen::LLT SampleDecompLLT; + typedef Eigen::LLT SampleDecompLLTD; + typedef Eigen::LLT PulseDecompLLT; + typedef Eigen::LDLT PulseDecompLDLT; + + typedef Eigen::Matrix SingleMatrix; + typedef Eigen::Matrix SingleVector; + + typedef std::array SampleMatrixGainArray; + + using PermutationMatrix = Eigen::PermutationMatrix; + + } // namespace multifit +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu new file mode 100644 index 0000000000000..00c61e04e1596 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.cu @@ -0,0 +1,323 @@ +#include "DataFormats/EcalDetId/interface/EBDetId.h" +#include "DataFormats/EcalDetId/interface/EEDetId.h" + +#include "KernelHelpers.h" + +namespace ecal { + namespace reconstruction { + + namespace internal { + + namespace barrel { + + __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; } + + __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; } + + __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; } + + __device__ int dccFromSm(int ism) { + int iz = 1; + if (ism > 18) + iz = -1; + if (iz == -1) + ism -= 18; + int idcc = 9 + ism; + if (iz == +1) + idcc += 18; + return idcc; + } + + __device__ int sm(int ieta, int iphi) { + int iz = 1; + if (ieta < 0) + iz = -1; + ieta *= iz; + int iphi_ = iphi; + if (iphi_ > 360) + iphi_ -= 360; + int ism = (iphi_ - 1) / 20 + 1; + if (iz == -1) + ism += 18; + return ism; + } + + __device__ int dcc(int ieta, int iphi) { + int ism = sm(ieta, iphi); + return dccFromSm(ism); + } + + // + // ---- why on hell things are so complex and not simple ??? + // + + __device__ int lm_channel(int iX, int iY) { + static const int idx_[] = { + // clang-format off + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 3 + 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 2 + 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9, // 1 + 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9 // 0 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + // clang-format on + }; + + int il, ic, ii; + const int iym = 4; + const int ixm = 17; + int iX_ = iX + 1; + int iY_ = iY + 1; + il = iym - iY_; + ic = iX_ - 1; + ii = il * ixm + ic; + if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int))) { + return -1; + }; + return idx_[ii]; + } + + __device__ int localCoord_x(int ieta, int iphi) { + int iz = 1; + if (ieta < 0) { + iz = -1; + } + ieta *= iz; + // int iphi_ = iphi; + // if (iphi_ > 360) { + // iphi_ -= 360; + // } + int ix = ieta - 1; + // int iy = (iphi_ - 1) % 20; + // if (iz == -1) { + // iy = 19 - iy; + // } + + return ix; + } + + __device__ int localCoord_y(int ieta, int iphi) { + int iz = 1; + if (ieta < 0) { + iz = -1; + } + // ieta *= iz; + int iphi_ = iphi; + if (iphi_ > 360) { + iphi_ -= 360; + } + // int ix = ieta - 1; + int iy = (iphi_ - 1) % 20; + if (iz == -1) { + iy = 19 - iy; + } + + return iy; + } + + __device__ int lmmod(int ieta, int iphi) { + int ix = localCoord_x(ieta, iphi); + int iy = localCoord_y(ieta, iphi); + + return lm_channel(ix / 5, iy / 5); + } + + __device__ int side(int ieta, int iphi) { + int ilmmod = lmmod(ieta, iphi); + return (ilmmod % 2 == 0) ? 1 : 0; + } + + } // namespace barrel + + } // namespace internal + + __device__ uint32_t hashedIndexEB(uint32_t id) { + using namespace internal::barrel; + return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1; + } + + // + // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc + // function: "lmr" + + __device__ int laser_monitoring_region_EB(uint32_t id) { + using namespace internal::barrel; + + int ieta; + if (positiveZ(id)) { + ieta = ietaAbs(id); + } else { + ieta = -ietaAbs(id); + } + + int idcc = dcc(ieta, (int)(iphi(id))); + int ism = idcc - 9; + + int iside = side(ieta, (int)(iphi(id))); + // int iside = positiveZ(id) ? 1 : 0; + + return (1 + 2 * (ism - 1) + iside); + // return ieta; + // return (int) (iphi(id)); + // return idcc; + // return iside; + } + + namespace internal { + + namespace endcap { + + __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; } + + __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; } + + __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; } + + // these constants come from EE Det Id + __constant__ const unsigned short kxf[] = { + 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, + 51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, + 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, + 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, + 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, 1, 59, 1, 58, 4, 56, 4, 51, 4, + 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, + 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21, + 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; + + __constant__ const unsigned short kdi[] = { + 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270, + 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821, + 858, 895, 932, 969, 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, + 1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, + 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, + 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701, + 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378, + 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149, + 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908, + 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, + 6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, + 7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314}; + + __device__ int quadrant(int iX, int iY) { + bool near = iX >= 11; + bool far = !near; + bool top = iY >= 11; + bool bot = !top; + + int iquad = 0; + if (near && top) + iquad = 1; + if (far && top) + iquad = 2; + if (far && bot) + iquad = 3; + if (near && bot) + iquad = 4; + + return iquad; + } + + __device__ int sector(int iX, int iY) { + // Y (towards the surface) + // T + // | + // | + // | + // o---------| X (towards center of LHC) + // + static const int idx_[] = { + // clang-format off + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, // 20 + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, // 19 + 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 8, 0, 0, 0, // 18 + 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 0, 0, // 17 + 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 8, 0, // 16 + 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, 9, 9, 9, 8, 8, 8, 8, 8, 0, // 15 + 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, 9, 9, 8, 8, 8, 8, 8, 8, 0, // 14 + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, // 13 + 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 8, 8, 8, 8, 8, 8, 7, 7, // 12 + 3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 8, 7, 7, 7, 7, 7, 7, 7, // 11 + 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, // 10 + 3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, 6, 6, 7, 7, 7, 7, 7, 7, 7, // 9 + 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7, // 8 + 0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 0, // 7 + 0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 0, // 6 + 0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 0, // 5 + 0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 0, // 4 + 0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 0, 0, 0, // 3 + 0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 0, 0, // 2 + 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0 // 1 + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + // clang-format on + }; + + int iym, ixm, il, ic, ii; + iym = 20; + ixm = 20; + int iX_ = iX; + int iY_ = iY; + il = iym - iY_; + ic = iX_ - 1; + ii = il * ixm + ic; + + if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) { + return -1; + }; + return idx_[ii]; + } + + } // namespace endcap + + } // namespace internal + + __device__ uint32_t hashedIndexEE(uint32_t id) { + using namespace internal::endcap; + + const uint32_t jx(ix(id)); + const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50); + return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]); + } + + // + // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc + // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc + // + + __device__ int laser_monitoring_region_EE(uint32_t id) { + using namespace internal::endcap; + + // SuperCrysCoord + uint32_t iX = (ix(id) - 1) / 5 + 1; + uint32_t iY = (iy(id) - 1) / 5 + 1; + + // Correct convention + // * @param iz iz/zside index: -1 for EE-, +1 for EE+ + // https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71 + // zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63 + // + int iz = positiveZ(id) ? 1 : -1; + + int iquad = quadrant(iX, iY); + int isect = sector(iX, iY); + if (isect < 0) + return -1; + + int ilmr = 0; + ilmr = isect - 6; + if (ilmr <= 0) + ilmr += 9; + if (ilmr == 9) + ilmr++; + if (ilmr == 8 && iquad == 4) + ilmr++; + if (iz == +1) + ilmr += 72; + else + ilmr += 82; + + return ilmr; + } + + } // namespace reconstruction +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h new file mode 100644 index 0000000000000..103087c3517f9 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h @@ -0,0 +1,452 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h +#define RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h + +#include +#include +#include + +#include + +namespace ecal { + namespace multifit { + + template + using ColMajorMatrix = Eigen::Matrix; + + template + using RowMajorMatrix = Eigen::Matrix; + + template + using ColumnVector = Eigen::Matrix; + + template + using RowVector = Eigen::Matrix; + + // FIXME: provide specialization for Row Major layout + template + struct MapSymM { + using type = T; + using base_type = typename std::remove_const::type; + + static constexpr int total = Stride * (Stride + 1) / 2; + static constexpr int stride = Stride; + T* data; + + __forceinline__ __device__ MapSymM(T* data) : data{data} {} + + __forceinline__ __device__ T const& operator()(int const row, int const col) const { + auto const tmp = (Stride - col) * (Stride - col + 1) / 2; + auto const index = total - tmp + row - col; + return data[index]; + } + + template + __forceinline__ __device__ typename std::enable_if::value, base_type>::type& + operator()(int const row, int const col) { + auto const tmp = (Stride - col) * (Stride - col + 1) / 2; + auto const index = total - tmp + row - col; + return data[index]; + } + }; + + // FIXME: either use/modify/improve eigen or make this more generic + // this is a map for a pulse matrix to building a 2d matrix for each channel + // and hide indexing + template + struct MapMForPM { + using type = T; + using base_type = typename std::remove_cv::type; + + type* data; + __forceinline__ __device__ MapMForPM(type* data) : data{data} {} + + __forceinline__ __device__ base_type operator()(int const row, int const col) const { + auto const index = 2 - col + row; + return index >= 0 ? data[index] : 0; + } + }; + + // simple/trivial cholesky decomposition impl + template + __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) { + auto const sqrtm_0_0 = std::sqrt(M(0, 0)); + L(0, 0) = sqrtm_0_0; + using T = typename MatrixType1::base_type; + +#pragma unroll + for (int i = 1; i < MatrixType1::stride; i++) { + T sumsq{0}; + for (int j = 0; j < i; j++) { + T sumsq2{0}; + auto const m_i_j = M(i, j); + for (int k = 0; k < j; ++k) + sumsq2 += L(i, k) * L(j, k); + + auto const value_i_j = (m_i_j - sumsq2) / L(j, j); + L(i, j) = value_i_j; + + sumsq += value_i_j * value_i_j; + } + + auto const l_i_i = std::sqrt(M(i, i) - sumsq); + L(i, i) = l_i_i; + } + } + + template + __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) { + auto const sqrtm_0_0 = std::sqrt(M(0, 0)); + L(0, 0) = sqrtm_0_0; + using T = typename MatrixType1::base_type; + + for (int i = 1; i < N; i++) { + T sumsq{0}; + for (int j = 0; j < i; j++) { + T sumsq2{0}; + auto const m_i_j = M(i, j); + for (int k = 0; k < j; ++k) + sumsq2 += L(i, k) * L(j, k); + + auto const value_i_j = (m_i_j - sumsq2) / L(j, j); + L(i, j) = value_i_j; + + sumsq += value_i_j * value_i_j; + } + + auto const l_i_i = std::sqrt(M(i, i) - sumsq); + L(i, i) = l_i_i; + } + } + + template + __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets( + MatrixType1& L, + MatrixType2 const& M, + float b[MatrixType1::stride], + VectorType const& Atb, + int const N, + ColumnVector const& pulseOffsets) { + auto const real_0 = pulseOffsets(0); + auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0)); + L(0, 0) = sqrtm_0_0; + using T = typename MatrixType1::base_type; + b[0] = Atb(real_0) / sqrtm_0_0; + + for (int i = 1; i < N; i++) { + auto const i_real = pulseOffsets(i); + T sumsq{0}; + T total = 0; + auto const atb = Atb(i_real); + for (int j = 0; j < i; j++) { + auto const j_real = pulseOffsets(j); + T sumsq2{0}; + auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real)); + for (int k = 0; k < j; ++k) + sumsq2 += L(i, k) * L(j, k); + + auto const value_i_j = (m_i_j - sumsq2) / L(j, j); + L(i, j) = value_i_j; + + sumsq += value_i_j * value_i_j; + total += value_i_j * b[j]; + } + + auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq); + L(i, i) = l_i_i; + b[i] = (atb - total) / l_i_i; + } + } + + template + __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets( + MatrixType1& L, + MatrixType2 const& M, + float b[MatrixType1::stride], + VectorType const& Atb, + int const N, + ColumnVector const& pulseOffsets) { + using T = typename MatrixType1::base_type; + auto const i = N - 1; + auto const i_real = pulseOffsets(i); + T sumsq{0}; + T total = 0; + for (int j = 0; j < i; j++) { + auto const j_real = pulseOffsets(j); + T sumsq2{0}; + auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real)); + for (int k = 0; k < j; ++k) + sumsq2 += L(i, k) * L(j, k); + + auto const value_i_j = (m_i_j - sumsq2) / L(j, j); + L(i, j) = value_i_j; + sumsq += value_i_j * value_i_j; + + total += value_i_j * b[j]; + } + + auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq); + L(i, i) = l_i_i; + b[i] = (Atb(i_real) - total) / l_i_i; + } + + template + __device__ void solve_forward_subst_matrix(MatrixType1& A, + MatrixType2 const& pulseMatrixView, + MatrixType3 const& matrixL) { + // FIXME: this assumes pulses are on columns and samples on rows + constexpr auto NPULSES = MatrixType2::ColsAtCompileTime; + constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime; + +#pragma unroll + for (int icol = 0; icol < NPULSES; icol++) { + float reg_b[NSAMPLES]; + float reg_L[NSAMPLES]; + +// preload a column and load column 0 of cholesky +#pragma unroll + for (int i = 0; i < NSAMPLES; i++) { + reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol)); + reg_L[i] = matrixL(i, 0); + } + + // compute x0 and store it + auto x_prev = reg_b[0] / reg_L[0]; + A(0, icol) = x_prev; + +// iterate +#pragma unroll + for (int iL = 1; iL < NSAMPLES; iL++) { +// update accum +#pragma unroll + for (int counter = iL; counter < NSAMPLES; counter++) + reg_b[counter] -= x_prev * reg_L[counter]; + +// load the next column of cholesky +#pragma unroll + for (int counter = iL; counter < NSAMPLES; counter++) + reg_L[counter] = matrixL(counter, iL); + + // compute the next x for M(iL, icol) + x_prev = reg_b[iL] / reg_L[iL]; + + // store the result value + A(iL, icol) = x_prev; + } + } + } + + template + __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime], + MatrixType1 inputAmplitudesView, + MatrixType2 matrixL) { + constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime; + + float reg_b_tmp[NSAMPLES]; + float reg_L[NSAMPLES]; + +// preload a column and load column 0 of cholesky +#pragma unroll + for (int i = 0; i < NSAMPLES; i++) { + reg_b_tmp[i] = inputAmplitudesView(i); + reg_L[i] = matrixL(i, 0); + } + + // compute x0 and store it + auto x_prev = reg_b_tmp[0] / reg_L[0]; + reg_b[0] = x_prev; + +// iterate +#pragma unroll + for (int iL = 1; iL < NSAMPLES; iL++) { +// update accum +#pragma unroll + for (int counter = iL; counter < NSAMPLES; counter++) + reg_b_tmp[counter] -= x_prev * reg_L[counter]; + +// load the next column of cholesky +#pragma unroll + for (int counter = iL; counter < NSAMPLES; counter++) + reg_L[counter] = matrixL(counter, iL); + + // compute the next x for M(iL, icol) + x_prev = reg_b_tmp[iL] / reg_L[iL]; + + // store the result value + reg_b[iL] = x_prev; + } + } + + // TODO: add active bxs + template + __device__ void fnnls(MatrixType const& AtA, + VectorType const& Atb, + VectorType& solution, + int& npassive, + ColumnVector& pulseOffsets, + MapSymM& matrixL, + double const eps, + int const maxIterations) { + // constants + constexpr auto NPULSES = VectorType::RowsAtCompileTime; + + // to keep track of where to terminate if converged + Eigen::Index w_max_idx_prev = 0; + float w_max_prev = 0; + auto eps_to_use = eps; + bool recompute = false; + + // used throughout + VectorType s; + float reg_b[NPULSES]; + //float matrixLStorage[MapSymM::total]; + //MapSymM matrixL{matrixLStorage}; + + int iter = 0; + while (true) { + if (iter > 0 || npassive == 0) { + auto const nactive = NPULSES - npassive; + // exit if there are no more pulses to constrain + if (nactive == 0) + break; + + // compute the gradient + //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive); + Eigen::Index w_max_idx; + float w_max = -std::numeric_limits::max(); + for (int icol = npassive; icol < NPULSES; icol++) { + auto const icol_real = pulseOffsets(icol); + auto const atb = Atb(icol_real); + float sum = 0; +#pragma unroll + for (int counter = 0; counter < NPULSES; counter++) + sum += counter > icol_real ? AtA(counter, icol_real) * solution(counter) + : AtA(icol_real, counter) * solution(counter); + + auto const w = atb - sum; + if (w > w_max) { + w_max = w; + w_max_idx = icol - npassive; + } + } + + // check for convergence + if (w_max < eps_to_use || w_max_idx == w_max_idx_prev && w_max == w_max_prev) + break; + + if (iter >= maxIterations) + break; + + w_max_prev = w_max; + w_max_idx_prev = w_max_idx; + + // move index to the right part of the vector + w_max_idx += npassive; + + Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(w_max_idx)); + ++npassive; + } + + // inner loop + while (true) { + if (npassive == 0) + break; + + //s.head(npassive) + //auto const& matrixL = + // AtA.topLeftCorner(npassive, npassive) + // .llt().matrixL(); + //.solve(Atb.head(npassive)); + if (recompute || iter == 0) + compute_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets); + else + update_decomposition_forwardsubst_with_offsets(matrixL, AtA, reg_b, Atb, npassive, pulseOffsets); + + // run backward substituion + s(npassive - 1) = reg_b[npassive - 1] / matrixL(npassive - 1, npassive - 1); + for (int i = npassive - 2; i >= 0; --i) { + float total = 0; + for (int j = i + 1; j < npassive; j++) + total += matrixL(j, i) * s(j); + + s(i) = (reg_b[i] - total) / matrixL(i, i); + } + + // done if solution values are all positive + bool hasNegative = false; + bool hasNans = false; + for (int counter = 0; counter < npassive; counter++) { + auto const s_ii = s(counter); + hasNegative |= s_ii <= 0; + hasNans |= std::isnan(s_ii); + } + + // FIXME: temporary solution. my cholesky impl is unstable yielding nans + // this check removes nans - do not accept solution unless all values + // are stable + if (hasNans) + break; + if (!hasNegative) { + for (int i = 0; i < npassive; i++) { + auto const i_real = pulseOffsets(i); + solution(i_real) = s(i); + } + //solution.head(npassive) = s.head(npassive); + recompute = false; + break; + } + + // there were negative values -> have to recompute the whole decomp + recompute = true; + + auto alpha = std::numeric_limits::max(); + Eigen::Index alpha_idx = 0, alpha_idx_real = 0; + for (int i = 0; i < npassive; i++) { + if (s[i] <= 0.) { + auto const i_real = pulseOffsets(i); + auto const ratio = solution[i_real] / (solution[i_real] - s[i]); + if (ratio < alpha) { + alpha = ratio; + alpha_idx = i; + alpha_idx_real = i_real; + } + } + } + + // upadte solution + for (int i = 0; i < npassive; i++) { + auto const i_real = pulseOffsets(i); + solution(i_real) += alpha * (s(i) - solution(i_real)); + } + //solution.head(npassive) += alpha * + // (s.head(npassive) - solution.head(npassive)); + solution[alpha_idx_real] = 0; + --npassive; + + Eigen::numext::swap(pulseOffsets.coeffRef(npassive), pulseOffsets.coeffRef(alpha_idx)); + } + + // as in cpu + ++iter; + if (iter % 16 == 0) + eps_to_use *= 2; + } + } + + } // namespace multifit +} // namespace ecal + +namespace ecal { + namespace reconstruction { + + __device__ uint32_t hashedIndexEB(uint32_t id); + + __device__ uint32_t hashedIndexEE(uint32_t id); + + __device__ int laser_monitoring_region_EB(uint32_t id); + + __device__ int laser_monitoring_region_EE(uint32_t id); + + } // namespace reconstruction +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu new file mode 100644 index 0000000000000..ddb9334826f40 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu @@ -0,0 +1,1131 @@ +#include +#include + +#include + +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_log.h" + +#include "Common.h" +#include "TimeComputationKernels.h" +#include "KernelHelpers.h" + +//#define DEBUG + +//#define ECAL_RECO_CUDA_DEBUG + +namespace ecal { + namespace multifit { + + __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) { + return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1))); + } + + __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + bool const* useless_sample_values, + SampleVector::Scalar* chi2s, + SampleVector::Scalar* sum0s, + SampleVector::Scalar* sumAAs, + const int nchannels) { + using ScalarType = SampleVector::Scalar; + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int tx = threadIdx.x + blockDim.x * blockIdx.x; + int ltx = threadIdx.x; + int ch = tx / nsamples; + int nchannels_per_block = blockDim.x / nsamples; + + // TODO: make sure that this branch plays nicely with __syncthreads inside + // can there be a deadlock even if the thread is inactive + if (ch < nchannels) { + // + int sample = tx % nsamples; + + // shared mem inits + extern __shared__ char sdata[]; + char* s_sum0 = sdata; + SampleVector::Scalar* s_sum1 = reinterpret_cast(s_sum0 + nchannels_per_block * nsamples); + SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples; + SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples; + + // TODO make sure no div by 0 + const auto inv_error = + useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]); + const auto sample_value = sample_values[tx]; + s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1; + s_sum1[ltx] = inv_error; + s_sumA[ltx] = sample_value * inv_error; + s_sumAA[ltx] = sample_value * sample_value * inv_error; + __syncthreads(); + + // 5 threads for [0, 4] samples + if (sample < 5) { + s_sum0[ltx] += s_sum0[ltx + 5]; + s_sum1[ltx] += s_sum1[ltx + 5]; + s_sumA[ltx] += s_sumA[ltx + 5]; + s_sumAA[ltx] += s_sumAA[ltx + 5]; + } + __syncthreads(); + + if (sample < 2) { + // note double counting of sample 3 + s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3]; + s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3]; + s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3]; + s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3]; + } + __syncthreads(); + + if (sample == 0) { + // note, subtract to remove the double counting of sample == 3 + //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3]; + //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3]; + //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3]; + //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3]; + const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3]; + const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3]; + const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3]; + const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3]; + const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast(0); + chi2s[ch] = chi2; + sum0s[ch] = sum0; + sumAAs[ch] = sumAA; + +#ifdef DEBUG_TC_NULLHYPOT + if (ch == 0) { + printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast(sum0), sumAA); + } +#endif + } + } + } + + constexpr float fast_expf(float x) { return unsafe_expf<6>(x); } + constexpr float fast_logf(float x) { return unsafe_logf<7>(x); } + + //#define DEBUG_TC_MAKERATIO + // + // launch ctx parameters are + // 45 threads per channel, X channels per block, Y blocks + // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 + // TODO: it might be much beter to use 32 threads per channel instead of 45 + // to simplify the synchronization + // + __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + bool const* useless_sample_values, + char const* pedestal_nums, + ConfigurationParameters::type const* amplitudeFitParametersEB, + ConfigurationParameters::type const* amplitudeFitParametersEE, + ConfigurationParameters::type const* timeFitParametersEB, + ConfigurationParameters::type const* timeFitParametersEE, + SampleVector::Scalar const* sumAAsNullHypot, + SampleVector::Scalar const* sum0sNullHypot, + SampleVector::Scalar* tMaxAlphaBetas, + SampleVector::Scalar* tMaxErrorAlphaBetas, + SampleVector::Scalar* g_accTimeMax, + SampleVector::Scalar* g_accTimeWgt, + TimeComputationState* g_state, + unsigned const int timeFitParameters_sizeEB, + unsigned const int timeFitParameters_sizeEE, + ConfigurationParameters::type const timeFitLimits_firstEB, + ConfigurationParameters::type const timeFitLimits_firstEE, + ConfigurationParameters::type const timeFitLimits_secondEB, + ConfigurationParameters::type const timeFitLimits_secondEE, + const int nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2 + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockDim.x * blockIdx.x; + const int ch = gtx / nthreads_per_channel; + const int ltx = threadIdx.x % nthreads_per_channel; + const int ch_start = ch * nsamples; + const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + + // rmeove inactive threads + // TODO: need to understand if this is 100% safe in presence of syncthreads + if (ch >= nchannels) + return; + + const auto did = DetId{dids[inputCh]}; + const auto isBarrel = did.subdetId() == EcalBarrel; + const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE; + const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE; + const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE; + const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE; + const auto timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE; + + extern __shared__ char smem[]; + ScalarType* shr_chi2s = reinterpret_cast(smem); + ScalarType* shr_time_wgt = shr_chi2s + blockDim.x; + ScalarType* shr_time_max = shr_time_wgt + blockDim.x; + ScalarType* shrTimeMax = shr_time_max + blockDim.x; + ScalarType* shrTimeWgt = shrTimeMax + blockDim.x; + + // map tx -> (sample_i, sample_j) + int sample_i, sample_j = 0; + if (ltx >= 0 && ltx <= 8) { + sample_i = 0; + sample_j = 1 + ltx; + } else if (ltx <= 16) { + sample_i = 1; + sample_j = 2 + ltx - 9; + } else if (ltx <= 23) { + sample_i = 2; + sample_j = 3 + ltx - 17; + } else if (ltx <= 29) { + sample_i = 3; + sample_j = 4 + ltx - 24; + } else if (ltx <= 34) { + sample_i = 4; + sample_j = 5 + ltx - 30; + } else if (ltx <= 38) { + sample_i = 5; + sample_j = 6 + ltx - 35; + } else if (ltx <= 41) { + sample_i = 6; + sample_j = 7 + ltx - 39; + } else if (ltx <= 43) { + sample_i = 7; + sample_j = 8 + ltx - 42; + } else if (ltx <= 44) { + sample_i = 8; + sample_j = 9; + } else + assert(false); + + const auto tx_i = ch_start + sample_i; + const auto tx_j = ch_start + sample_j; + + // + // note, given the way we partition the block, with 45 threads per channel + // we will end up with inactive threads which need to be dragged along + // through the synching point + // + /* + bool const condToExit = ch >= nchannels + ? true + : useless_sample_values[tx_i] + || useless_sample_values[tx_j] + || sample_values[tx_i]<=1 || sample_values[tx_j]<=1; + */ + bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] || + sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1; + + // + // see cpu implementation for explanation + // + ScalarType chi2 = std::numeric_limits::max(); + ScalarType tmax = 0; + ScalarType tmaxerr = 0; + shrTimeMax[threadIdx.x] = 0; + shrTimeWgt[threadIdx.x] = 0; + bool internalCondForSkipping1 = true; + bool internalCondForSkipping2 = true; + if (!condForUselessSamples) { + const auto rtmp = sample_values[tx_i] / sample_values[tx_j]; + const auto invampl_i = 1.0 / sample_values[tx_i]; + const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i; + const auto invampl_j = 1.0 / sample_values[tx_j]; + const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j; + const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j); + auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j); + // TODO non-divergent branch for a block if each block has 1 channel + // otherwise non-divergent for groups of 45 threads + // at this point, pedestal_nums[ch] can be either 0, 1 or 2 + if (pedestal_nums[ch] == 2) + err2 *= err2 * 0.5; + const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j); + const auto total_error = std::sqrt(err1 + err2 + err3); + + const auto alpha = amplitudeFitParameters[0]; + const auto beta = amplitudeFitParameters[1]; + const auto alphabeta = alpha * beta; + const auto invalphabeta = 1.0 / alphabeta; + + // variables instead of a struct + const auto ratio_index = sample_i; + const auto ratio_step = sample_j - sample_i; + const auto ratio_value = rtmp; + const auto ratio_error = total_error; + + const auto rlim_i_j = fast_expf(static_cast(sample_j - sample_i) / beta) - 0.001; + internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j); + if (!internalCondForSkipping1) { + // + // precompute. + // in cpu version this was done conditionally + // however easier to do it here (precompute) and then just filter out + // if not needed + // + const auto l_timeFitLimits_first = timeFitLimits_first; + const auto l_timeFitLimits_second = timeFitLimits_second; + if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) { + const auto time_max_i = static_cast(ratio_index); + auto u = timeFitParameters[timeFitParameters_size - 1]; +#pragma unroll + for (int k = timeFitParameters_size - 2; k >= 0; k--) + u = u * ratio_value + timeFitParameters[k]; + + auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]); + for (int k = timeFitParameters_size - 2; k >= 1; k--) + du = du * ratio_value + k * timeFitParameters[k]; + + const auto error2 = ratio_error * ratio_error * du * du; + const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast(0); + const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast(0); + + // store into shared mem + // note, this name is essentially identical to the one used + // below. + shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0; + shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0; + } else { + shrTimeMax[threadIdx.x] = 0; + shrTimeWgt[threadIdx.x] = 0; + } + + // continue with ratios + const auto stepOverBeta = static_cast(ratio_step) / beta; + const auto offset = static_cast(ratio_index) + alphabeta; + const auto rmin = std::max(ratio_value - ratio_error, 0.001); + const auto rmax = std::min(ratio_value + ratio_error, + fast_expf(static_cast(ratio_step) / beta) - 0.001); + const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0); + const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0); + + // set these guys + tmax = 0.5 * (time1 + time2); + tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2)); +#ifdef DEBUG_TC_MAKERATIO + if (ch == 1 || ch == 0) + printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n", + ch, + ltx, + tmax, + tmaxerr, + time1, + time2, + offset, + rmin, + rmax); +#endif + + SampleVector::Scalar sumAf = 0; + SampleVector::Scalar sumff = 0; + const int itmin = std::max(-1, static_cast(std::floor(tmax - alphabeta))); + auto loffset = (static_cast(itmin) - tmax) * invalphabeta; + // TODO: data dependence + for (int it = itmin + 1; it < nsamples; it++) { + loffset += invalphabeta; + if (useless_sample_values[ch_start + it]) + continue; + const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]); + const auto term1 = 1.0 + loffset; + const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0; + sumAf += sample_values[ch_start + it] * (f * inverr2); + sumff += f * (f * inverr2); + } + + const auto sumAA = sumAAsNullHypot[ch]; + const auto sum0 = sum0sNullHypot[ch]; + chi2 = sumAA; + // TODO: sum0 can not be 0 below, need to introduce the check upfront + if (sumff > 0) { + chi2 = sumAA - sumAf * (sumAf / sumff); + } + chi2 /= sum0; + +#ifdef DEBUG_TC_MAKERATIO + if (ch == 1 || ch == 0) + printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n", + ch, + ltx, + sumAf, + sumff, + sumAA, + static_cast(sum0), + tmax, + tmaxerr, + chi2); +#endif + + if (chi2 > 0 && tmax > 0 && tmaxerr > 0) + internalCondForSkipping2 = false; + else + chi2 = std::numeric_limits::max(); + } + } + + // store into smem + shr_chi2s[threadIdx.x] = chi2; + __syncthreads(); + + // find min chi2 - quite crude for now + // TODO validate/check + char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + bool oddElements = nthreads_per_channel % 2; +#pragma unroll + while (iter >= 1) { + if (ltx < iter) + // for odd ns, the last guy will just store itself + // exception is for ltx == 0 and iter==1 + shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shr_chi2s[threadIdx.x] + : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]); + __syncthreads(); + oddElements = iter % 2; + iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; + } + + // filter out inactive or useless samples threads + if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) { + // min chi2, now compute weighted average of tmax measurements + // see cpu version for more explanation + const auto chi2min = shr_chi2s[threadIdx.x - ltx]; + const auto chi2Limit = chi2min + 1.0; + const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0; + +#ifdef DEBUG_TC_MAKERATIO + if (ch == 1 || ch == 0) + printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n", + ch, + ltx, + chi2min, + chi2Limit, + inverseSigmaSquared); +#endif + + // store into shared mem and run reduction + // TODO: check if cooperative groups would be better + // TODO: check if shuffling intrinsics are better + shr_time_wgt[threadIdx.x] = inverseSigmaSquared; + shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared; + } else { + shr_time_wgt[threadIdx.x] = 0; + shr_time_max[threadIdx.x] = 0; + } + __syncthreads(); + + // reduce to compute time_max and time_wgt + iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + oddElements = nthreads_per_channel % 2; +#pragma unroll + while (iter >= 1) { + if (ltx < iter) { + shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shr_time_wgt[threadIdx.x] + : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter]; + shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shr_time_max[threadIdx.x] + : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter]; + shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shrTimeMax[threadIdx.x] + : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter]; + shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shrTimeWgt[threadIdx.x] + : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter]; + } + + __syncthreads(); + oddElements = iter % 2; + iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; + } + + // load from shared memory the 0th guy (will contain accumulated values) + // compute + // store into global mem + if (ltx == 0) { + const auto tmp_time_max = shr_time_max[threadIdx.x]; + const auto tmp_time_wgt = shr_time_wgt[threadIdx.x]; + + // we are done if there number of time ratios is 0 + if (tmp_time_wgt == 0 && tmp_time_max == 0) { + g_state[ch] = TimeComputationState::Finished; + return; + } + + // no div by 0 + const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt; + const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt); + + tMaxAlphaBetas[ch] = tMaxAlphaBeta; + tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta; + g_accTimeMax[ch] = shrTimeMax[threadIdx.x]; + g_accTimeWgt[ch] = shrTimeWgt[threadIdx.x]; + g_state[ch] = TimeComputationState::NotFinished; + +#ifdef DEBUG_TC_MAKERATIO + printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt); + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n", + ch, + tMaxAlphaBeta, + tMaxErrorAlphaBeta, + shrTimeMax[threadIdx.x], + shrTimeWgt[threadIdx.x]); +#endif + } + } + + /// launch ctx parameters are + /// 10 threads per channel, N channels per block, Y blocks + /// TODO: do we need to keep the state around or can be removed?! + //#define DEBUG_FINDAMPLCHI2_AND_FINISH + __global__ void kernel_time_compute_findamplchi2_and_finish( + SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + bool const* useless_samples, + SampleVector::Scalar const* g_tMaxAlphaBeta, + SampleVector::Scalar const* g_tMaxErrorAlphaBeta, + SampleVector::Scalar const* g_accTimeMax, + SampleVector::Scalar const* g_accTimeWgt, + ConfigurationParameters::type const* amplitudeFitParametersEB, + ConfigurationParameters::type const* amplitudeFitParametersEE, + SampleVector::Scalar const* sumAAsNullHypot, + SampleVector::Scalar const* sum0sNullHypot, + SampleVector::Scalar const* chi2sNullHypot, + TimeComputationState* g_state, + SampleVector::Scalar* g_ampMaxAlphaBeta, + SampleVector::Scalar* g_ampMaxError, + SampleVector::Scalar* g_timeMax, + SampleVector::Scalar* g_timeError, + const int nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int ch = gtx / nsamples; + const int sample = threadIdx.x % nsamples; + const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + + // configure shared mem + // per block, we need #threads per block * 2 * sizeof(ScalarType) + // we run with N channels per block + extern __shared__ char smem[]; + ScalarType* shr_sumAf = reinterpret_cast(smem); + ScalarType* shr_sumff = shr_sumAf + blockDim.x; + + if (ch >= nchannels) + return; + + auto state = g_state[ch]; + const auto did = DetId{dids[inputCh]}; + const auto* amplitudeFitParameters = + did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE; + + // TODO is that better than storing into global and launching another kernel + // for the first 10 threads + if (state == TimeComputationState::NotFinished) { + const auto alpha = amplitudeFitParameters[0]; + const auto beta = amplitudeFitParameters[1]; + const auto alphabeta = alpha * beta; + const auto invalphabeta = 1.0 / alphabeta; + const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; + const auto sample_value = sample_values[gtx]; + const auto sample_value_error = sample_value_errors[gtx]; + const auto inverr2 = + useless_samples[gtx] ? static_cast(0) : 1.0 / (sample_value_error * sample_value_error); + const auto offset = (static_cast(sample) - tMaxAlphaBeta) * invalphabeta; + const auto term1 = 1.0 + offset; + const auto f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast(0.0); + const auto sumAf = sample_value * (f * inverr2); + const auto sumff = f * (f * inverr2); + + // store into shared mem + shr_sumAf[threadIdx.x] = sumAf; + shr_sumff[threadIdx.x] = sumff; + } else { + shr_sumAf[threadIdx.x] = 0; + shr_sumff[threadIdx.x] = 0; + } + __syncthreads(); + + // reduce + // unroll completely here (but hardcoded) + if (sample < 5) { + shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5]; + shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5]; + } + __syncthreads(); + + if (sample < 2) { + // will need to subtract for ltx = 3, we double count here + shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3]; + shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3]; + } + __syncthreads(); + + if (sample == 0) { + // exit if the state is done + // note, we do not exit before all __synchtreads are finished + if (state == TimeComputationState::Finished) { + g_timeMax[ch] = 5; + g_timeError[ch] = -999; + return; + } + + // subtract to avoid double counting + const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3]; + const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3]; + + const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0; + const auto sumAA = sumAAsNullHypot[ch]; + const auto sum0 = sum0sNullHypot[ch]; + const auto nullChi2 = chi2sNullHypot[ch]; + if (sumff > 0) { + const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0; + if (chi2AlphaBeta > nullChi2) { + // null hypothesis is better + state = TimeComputationState::Finished; +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n", + ch, + chi2AlphaBeta, + nullChi2, + sumAA, + sumAf, + sumff, + sum0); +#endif + } + + // store to global + g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta; + } else { +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf); +#endif + state = TimeComputationState::Finished; + } + + // store the state to global and finish calcs + g_state[ch] = state; + if (state == TimeComputationState::Finished) { + // store default values into global + g_timeMax[ch] = 5; + g_timeError[ch] = -999; +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d finished state\n", ch); +#endif + return; + } + + const auto ampMaxError = g_ampMaxError[ch]; + const auto test_ratio = ampMaxAlphaBeta / ampMaxError; + const auto accTimeMax = g_accTimeMax[ch]; + const auto accTimeWgt = g_accTimeWgt[ch]; + const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; + const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch]; + // branch to separate large vs small pulses + // see cpu version for more info + if (test_ratio > 5.0 && accTimeWgt > 0) { + const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast(0); + const auto tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast(0); + + if (test_ratio > 10.0) { + g_timeMax[ch] = tMaxRatio; + g_timeError[ch] = tMaxErrorRatio; + +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio); +#endif + } else { + const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + + tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / + 5.0; + const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + + tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / + 5.0; + state = TimeComputationState::Finished; + g_state[ch] = state; + g_timeMax[ch] = timeMax; + g_timeError[ch] = timeError; + +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError); +#endif + } + } else { + state = TimeComputationState::Finished; + g_state[ch] = state; + g_timeMax[ch] = tMaxAlphaBeta; + g_timeError[ch] = tMaxErrorAlphaBeta; + +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta); +#endif + } + } + } + + __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb, + uint16_t const* digis_ee, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + bool* useless_sample_values, + unsigned const int sample_mask, + const int nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int ch = gtx / nsamples; + const int sample = threadIdx.x % nsamples; + const int inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx; + const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb; + + // remove thread for sample 0, oversubscribing is easier than .... + if (ch >= nchannels || sample == 0) + return; + + if (!use_sample(sample_mask, sample)) + return; + + const auto gainIdPrev = ecal::mgpa::gainId(digis[inputGtx - 1]); + const auto gainIdNext = ecal::mgpa::gainId(digis[inputGtx]); + if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) { + sample_values[gtx - 1] = 0; + sample_value_errors[gtx - 1] = 1e+9; + useless_sample_values[gtx - 1] = true; + } + } + + __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids, + bool const* useless_samples, + SampleVector::Scalar const* g_timeMax, + SampleVector::Scalar const* amplitudeFitParametersEB, + SampleVector::Scalar const* amplitudeFitParametersEE, + SampleVector::Scalar* g_amplitudeMax, + const int nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr ScalarType corr4 = 1.; + constexpr ScalarType corr6 = 1.; + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int ch = gtx / nsamples; + const int sample = threadIdx.x % nsamples; + + if (ch >= nchannels) + return; + + const auto did = DetId{dids[ch]}; + const auto* amplitudeFitParameters = + did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE; + + // configure shared mem + extern __shared__ char smem[]; + ScalarType* shr_sum1 = reinterpret_cast(smem); + auto* shr_sumA = shr_sum1 + blockDim.x; + auto* shr_sumF = shr_sumA + blockDim.x; + auto* shr_sumAF = shr_sumF + blockDim.x; + auto* shr_sumFF = shr_sumAF + blockDim.x; + + const auto alpha = amplitudeFitParameters[0]; + const auto beta = amplitudeFitParameters[1]; + const auto timeMax = g_timeMax[ch]; + const auto pedestalLimit = timeMax - (alpha * beta) - 1.0; + const auto sample_value = sample_values[gtx]; + const auto sample_value_error = sample_value_errors[gtx]; + const auto inverr2 = + sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast(0); + const auto termOne = 1 + (sample - timeMax) / (alpha * beta); + const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta) + : static_cast(0.); + + bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) || + (f > 0.4 * corr4 && sample >= timeMax)) && + !useless_samples[gtx]; + + // store into shared mem + shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast(0); + shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast(0); + shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast(0); + shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast(0); + shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast(0); + + // reduction + if (sample <= 4) { + shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5]; + shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5]; + shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5]; + shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5]; + shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5]; + } + __syncthreads(); + + if (sample < 2) { + // note: we double count sample 3 + shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3]; + shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3]; + shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3]; + shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3]; + shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3]; + } + __syncthreads(); + + if (sample == 0) { + const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3]; + const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3]; + const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3]; + const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3]; + const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3]; + + const auto denom = sumFF * sum1 - sumF * sumF; + const auto condForDenom = sum1 > 0 && ecal::abs(denom) > 1.e-20; + const auto amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast(0.); + + // store into global mem + g_amplitudeMax[ch] = amplitudeMax; + } + } + + //#define ECAL_RECO_CUDA_TC_INIT_DEBUG + __global__ void kernel_time_computation_init(uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* mean_x12, + float const* mean_x6, + float const* mean_x1, + float const* gain12Over6, + float const* gain6Over1, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + SampleVector::Scalar* ampMaxError, + bool* useless_sample_values, + char* pedestal_nums, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + unsigned const int sample_maskEB, + unsigned const int sample_maskEE, + int nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int tx = threadIdx.x + blockDim.x * blockIdx.x; + const int ch = tx / nsamples; + const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx; + const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb; + const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + + if (ch < nchannels) { + // indices/inits + const int sample = tx % nsamples; + const int input_ch_start = inputCh * nsamples; + SampleVector::Scalar pedestal = 0.; + int num = 0; + + // configure shared mem + extern __shared__ char smem[]; + ScalarType* shrSampleValues = reinterpret_cast(smem); + ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x; + + // 0 and 1 sample values + const auto adc0 = ecal::mgpa::adc(digis[input_ch_start]); + const auto gainId0 = ecal::mgpa::gainId(digis[input_ch_start]); + const auto adc1 = ecal::mgpa::adc(digis[input_ch_start + 1]); + const auto gainId1 = ecal::mgpa::gainId(digis[input_ch_start + 1]); + const auto did = DetId{dids[inputCh]}; + const auto isBarrel = did.subdetId() == EcalBarrel; + const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE; + const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + + // set pedestal + // TODO this branch is non-divergent for a group of 10 threads + if (gainId0 == 1 && use_sample(sample_mask, 0)) { + pedestal = static_cast(adc0); + num = 1; + + const auto diff = adc1 - adc0; + if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) { + pedestal = (pedestal + static_cast(adc1)) / 2.0; + num = 2; + } + } else { + pedestal = mean_x12[ch]; + } + + // ped subtracted and gain-renormalized samples. + const auto gainId = ecal::mgpa::gainId(digis[inputTx]); + const auto adc = ecal::mgpa::adc(digis[inputTx]); + + bool bad = false; + SampleVector::Scalar sample_value, sample_value_error; + // TODO divergent branch + // TODO: piece below is general both for amplitudes and timing + // potentially there is a way to reduce the amount of code... + if (!use_sample(sample_mask, sample)) { + bad = true; + sample_value = 0; + sample_value_error = 0; + } else if (gainId == 1) { + sample_value = static_cast(adc) - pedestal; + sample_value_error = rms_x12[hashedId]; + } else if (gainId == 2) { + sample_value = (static_cast(adc) - mean_x6[hashedId]) * gain12Over6[hashedId]; + sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId]; + } else if (gainId == 3) { + sample_value = (static_cast(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] * + gain12Over6[hashedId]; + sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId]; + } else { + sample_value = 0; + sample_value_error = 0; + bad = true; + } + + // TODO: make sure we save things correctly when sample is useless + const auto useless_sample = (sample_value_error <= 0) | bad; + useless_sample_values[tx] = useless_sample; + sample_values[tx] = sample_value; + sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error; + + // DEBUG +#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG + if (ch == 0) { + printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n", + sample, + sample_value, + sample_value_error, + useless_sample ? '1' : '0'); + } +#endif + + // store into the shared mem + shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits::min(); + shrSampleValueErrors[threadIdx.x] = sample_value_error; + __syncthreads(); + + // perform the reduction with min + if (sample < 5) { + // note, if equal -> we keep the value with lower sample as for cpu + shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5] + ? shrSampleValueErrors[threadIdx.x + 5] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]); + } + __syncthreads(); + + // a bit of an overkill, but easier than to compare across 3 values + if (sample < 3) { + shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3] + ? shrSampleValueErrors[threadIdx.x + 3] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]); + } + __syncthreads(); + + if (sample < 2) { + shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2] + ? shrSampleValueErrors[threadIdx.x + 2] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]); + } + __syncthreads(); + + if (sample == 0) { + // we only needd the max error + const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1] + ? shrSampleValueErrors[threadIdx.x + 1] + : shrSampleValueErrors[threadIdx.x]; + + // # pedestal samples used + pedestal_nums[ch] = num; + // this is used downstream + ampMaxError[ch] = maxSampleValueError; + + // DEBUG +#ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG + if (ch == 0) { + printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError); + } +#endif + } + } + } + + /// + /// launch context parameters: 1 thread per channel + /// + //#define DEBUG_TIME_CORRECTION + __global__ void kernel_time_correction_and_finalize( + // SampleVector::Scalar const* g_amplitude, + ::ecal::reco::StorageScalarType const* g_amplitude, + uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, + float const* amplitudeBinsEB, + float const* amplitudeBinsEE, + float const* shiftBinsEB, + float const* shiftBinsEE, + SampleVector::Scalar const* g_timeMax, + SampleVector::Scalar const* g_timeError, + float const* g_rms_x12, + float const* timeCalibConstant, + float* g_jitter, + float* g_jitterError, + uint32_t* flags, + const int amplitudeBinsSizeEB, + const int amplitudeBinsSizeEE, + ConfigurationParameters::type const timeConstantTermEB, + ConfigurationParameters::type const timeConstantTermEE, + float const offsetTimeValueEB, + float const offsetTimeValueEE, + ConfigurationParameters::type const timeNconstEB, + ConfigurationParameters::type const timeNconstEE, + ConfigurationParameters::type const amplitudeThresholdEB, + ConfigurationParameters::type const amplitudeThresholdEE, + ConfigurationParameters::type const outOfTimeThreshG12pEB, + ConfigurationParameters::type const outOfTimeThreshG12pEE, + ConfigurationParameters::type const outOfTimeThreshG12mEB, + ConfigurationParameters::type const outOfTimeThreshG12mEE, + ConfigurationParameters::type const outOfTimeThreshG61pEB, + ConfigurationParameters::type const outOfTimeThreshG61pEE, + ConfigurationParameters::type const outOfTimeThreshG61mEB, + ConfigurationParameters::type const outOfTimeThreshG61mEE, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + const int nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx; + const auto* dids = gtx >= offsetForInputs ? dids_ee : dids_eb; + const auto& digis = gtx >= offsetForInputs ? digis_ee : digis_eb; + + // filter out outside of range threads + if (gtx >= nchannels) + return; + + const auto did = DetId{dids[inputGtx]}; + const auto isBarrel = did.subdetId() == EcalBarrel; + const auto hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE; + const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE; + const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE; + const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE; + const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE; + const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE; + const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE; + const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE; + const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE; + const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE; + const auto outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE; + + // load some + const auto amplitude = g_amplitude[gtx]; + const auto rms_x12 = g_rms_x12[hashedId]; + const auto timeCalibConst = timeCalibConstant[hashedId]; + + int myBin = -1; + for (int bin = 0; bin < amplitudeBinsSize; bin++) { + if (amplitude > amplitudeBins[bin]) + myBin = bin; + else + break; + } + + ScalarType correction = 0; + if (myBin == -1) { + correction = shiftBins[0]; + } else if (myBin == amplitudeBinsSize - 1) { + correction = shiftBins[myBin]; + } else { + correction = shiftBins[myBin + 1] - shiftBins[myBin]; + correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]); + correction += shiftBins[myBin]; + } + + // correction * 1./25. + correction = correction * 0.04; + const auto timeMax = g_timeMax[gtx]; + const auto timeError = g_timeError[gtx]; + const auto jitter = timeMax - 5 + correction; + const auto jitterError = + std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25. + +#ifdef DEBUG_TIME_CORRECTION + // if (gtx == 0) { + printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n", + gtx, + timeMax, + timeError, + jitter, + correction); +// } +#endif + + // store back to global + g_jitter[gtx] = jitter; + g_jitterError[gtx] = jitterError; + + // set the flag + // TODO: replace with something more efficient (if required), + // for now just to make it work + if (amplitude > amplitudeThreshold * rms_x12) { + auto threshP = outOfTimeThreshG12p; + auto threshM = outOfTimeThreshG12m; + if (amplitude > 3000.) { + for (int isample = 0; isample < nsamples; isample++) { + int gainid = ecal::mgpa::gainId(digis[nsamples * inputGtx + isample]); + if (gainid != 1) { + threshP = outOfTimeThreshG61p; + threshM = outOfTimeThreshG61m; + break; + } + } + } + + const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue; + const auto nterm = timeNconst * rms_x12 / amplitude; + const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm); + if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM) + flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime; + } + } + + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h new file mode 100644 index 0000000000000..68500ddfe70d2 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h @@ -0,0 +1,182 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h + +#include +#include + +#include + +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_log.h" + +#include "Common.h" +#include "DeclsForKernels.h" +#include "EigenMatrixTypes_gpu.h" + +//#define DEBUG + +//#define ECAL_RECO_CUDA_DEBUG + +namespace ecal { + namespace multifit { + + __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + bool const* useless_sample_values, + SampleVector::Scalar* chi2s, + SampleVector::Scalar* sum0s, + SampleVector::Scalar* sumAAs, + int const nchannels); + // + // launch ctx parameters are + // 45 threads per channel, X channels per block, Y blocks + // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 + // TODO: it might be much beter to use 32 threads per channel instead of 45 + // to simplify the synchronization + // + __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + bool const* useless_sample_values, + char const* pedestal_nums, + ConfigurationParameters::type const* amplitudeFitParametersEB, + ConfigurationParameters::type const* amplitudeFitParametersEE, + ConfigurationParameters::type const* timeFitParametersEB, + ConfigurationParameters::type const* timeFitParametersEE, + SampleVector::Scalar const* sumAAsNullHypot, + SampleVector::Scalar const* sum0sNullHypot, + SampleVector::Scalar* tMaxAlphaBetas, + SampleVector::Scalar* tMaxErrorAlphaBetas, + SampleVector::Scalar* g_accTimeMax, + SampleVector::Scalar* g_accTimeWgt, + TimeComputationState* g_state, + unsigned int const timeFitParameters_sizeEB, + unsigned int const timeFitParameters_sizeEE, + ConfigurationParameters::type const timeFitLimits_firstEB, + ConfigurationParameters::type const timeFitLimits_firstEE, + ConfigurationParameters::type const timeFitLimits_secondEB, + ConfigurationParameters::type const timeFitLimits_secondEE, + int const nchannels, + uint32_t const offsetForInputs); + + /// launch ctx parameters are + /// 10 threads per channel, N channels per block, Y blocks + /// TODO: do we need to keep the state around or can be removed?! + //#define DEBUG_FINDAMPLCHI2_AND_FINISH + __global__ void kernel_time_compute_findamplchi2_and_finish( + SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + bool const* useless_samples, + SampleVector::Scalar const* g_tMaxAlphaBeta, + SampleVector::Scalar const* g_tMaxErrorAlphaBeta, + SampleVector::Scalar const* g_accTimeMax, + SampleVector::Scalar const* g_accTimeWgt, + ConfigurationParameters::type const* amplitudeFitParametersEB, + ConfigurationParameters::type const* amplitudeFitParametersEE, + SampleVector::Scalar const* sumAAsNullHypot, + SampleVector::Scalar const* sum0sNullHypot, + SampleVector::Scalar const* chi2sNullHypot, + TimeComputationState* g_state, + SampleVector::Scalar* g_ampMaxAlphaBeta, + SampleVector::Scalar* g_ampMaxError, + SampleVector::Scalar* g_timeMax, + SampleVector::Scalar* g_timeError, + int const nchannels, + uint32_t const offsetForInputs); + + __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb, + uint16_t const* digis_ee, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + bool* useless_sample_values, + unsigned int const sample_mask, + int const nchannels, + uint32_t const offsetForInputs); + + __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ed, + bool const* useless_samples, + SampleVector::Scalar const* g_timeMax, + SampleVector::Scalar const* amplitudeFitParametersEB, + SampleVector::Scalar const* amplitudeFitParametersEE, + SampleVector::Scalar* g_amplitudeMax, + int const nchannels, + uint32_t const offsetForInputs); + + //#define ECAL_RECO_CUDA_TC_INIT_DEBUG + __global__ void kernel_time_computation_init(uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* mean_x12, + float const* mean_x6, + float const* mean_x1, + float const* gain12Over6, + float const* gain6Over1, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + SampleVector::Scalar* ampMaxError, + bool* useless_sample_values, + char* pedestal_nums, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + unsigned int const sample_maskEB, + unsigned int const sample_maskEE, + int nchannels); + + /// + /// launch context parameters: 1 thread per channel + /// + //#define DEBUG_TIME_CORRECTION + __global__ void kernel_time_correction_and_finalize( + // SampleVector::Scalar const* g_amplitude, + ::ecal::reco::StorageScalarType const* g_amplitude, + uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, + float const* amplitudeBinsEB, + float const* amplitudeBinsEE, + float const* shiftBinsEB, + float const* shiftBinsEE, + SampleVector::Scalar const* g_timeMax, + SampleVector::Scalar const* g_timeError, + float const* g_rms_x12, + float const* timeCalibConstant, + ::ecal::reco::StorageScalarType* g_jitter, + ::ecal::reco::StorageScalarType* g_jitterError, + uint32_t* flags, + int const amplitudeBinsSizeEB, + int const amplitudeBinsSizeEE, + ConfigurationParameters::type const timeConstantTermEB, + ConfigurationParameters::type const timeConstantTermEE, + float const offsetTimeValueEB, + float const offsetTimeValueEE, + ConfigurationParameters::type const timeNconstEB, + ConfigurationParameters::type const timeNconstEE, + ConfigurationParameters::type const amplitudeThresholdEB, + ConfigurationParameters::type const amplitudeThresholdEE, + ConfigurationParameters::type const outOfTimeThreshG12pEB, + ConfigurationParameters::type const outOfTimeThreshG12pEE, + ConfigurationParameters::type const outOfTimeThreshG12mEB, + ConfigurationParameters::type const outOfTimeThreshG12mEE, + ConfigurationParameters::type const outOfTimeThreshG61pEB, + ConfigurationParameters::type const outOfTimeThreshG61pEE, + ConfigurationParameters::type const outOfTimeThreshG61mEB, + ConfigurationParameters::type const outOfTimeThreshG61mEE, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + int const nchannels); + + } // namespace multifit +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h