From e12ea51bdd3bc8c2ba21b4f11fe2222b0146fe2e Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Mon, 30 Mar 2020 12:44:16 +0200 Subject: [PATCH 01/30] raw to digi is adapted to 110x patatrack branch --- CUDADataFormats/EcalDigi/BuildFile.xml | 8 + .../EcalDigi/interface/DigisCollection.h | 29 ++ CUDADataFormats/EcalDigi/src/classes.h | 3 + CUDADataFormats/EcalDigi/src/classes_def.xml | 4 + EventFilter/EcalRawToDigi/BuildFile.xml | 5 + .../EcalRawToDigi/interface/DeclsForKernels.h | 118 +++++ .../interface/ElectronicsIdGPU.h | 93 ++++ .../interface/ElectronicsMappingGPU.h | 45 ++ .../EcalRawToDigi/interface/UnpackGPU.h | 17 + .../EcalRawToDigi/plugins/BuildFile.xml | 6 + .../plugins/EcalCPUDigisProducer.cc | 152 ++++++ .../plugins/EcalRawESProducerGPU.h | 44 ++ .../plugins/EcalRawESProducersGPUDefs.cc | 14 + .../EcalRawToDigi/plugins/EcalRawToDigiGPU.cc | 175 +++++++ .../src/ElectronicsMappingGPU.cc | 62 +++ EventFilter/EcalRawToDigi/src/UnpackGPU.cu | 476 ++++++++++++++++++ 16 files changed, 1251 insertions(+) create mode 100644 CUDADataFormats/EcalDigi/BuildFile.xml create mode 100644 CUDADataFormats/EcalDigi/interface/DigisCollection.h create mode 100644 CUDADataFormats/EcalDigi/src/classes.h create mode 100644 CUDADataFormats/EcalDigi/src/classes_def.xml create mode 100644 EventFilter/EcalRawToDigi/interface/DeclsForKernels.h create mode 100644 EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h create mode 100644 EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h create mode 100644 EventFilter/EcalRawToDigi/interface/UnpackGPU.h create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc create mode 100644 EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc create mode 100644 
EventFilter/EcalRawToDigi/src/UnpackGPU.cu diff --git a/CUDADataFormats/EcalDigi/BuildFile.xml b/CUDADataFormats/EcalDigi/BuildFile.xml new file mode 100644 index 0000000000000..a1838ba91dc91 --- /dev/null +++ b/CUDADataFormats/EcalDigi/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/EcalDigi/interface/DigisCollection.h b/CUDADataFormats/EcalDigi/interface/DigisCollection.h new file mode 100644 index 0000000000000..31134e3ddbd8f --- /dev/null +++ b/CUDADataFormats/EcalDigi/interface/DigisCollection.h @@ -0,0 +1,29 @@ +#ifndef CUDADataFormats_EcalDigi_interface_DigisCollection_h +#define CUDADataFormats_EcalDigi_interface_DigisCollection_h + +namespace ecal { + +// +// this is basically a view +// it does not own the actual memory -> does not reclaim +// +struct DigisCollection { + DigisCollection() = default; + DigisCollection(uint32_t *ids, uint16_t *data, uint32_t ndigis) + : ids{ids}, data{data}, ndigis{ndigis} + {} + DigisCollection(DigisCollection const&) = default; + DigisCollection& operator=(DigisCollection const&) = default; + + DigisCollection(DigisCollection&&) = default; + DigisCollection& operator=(DigisCollection&&) = default; + + // stride is statically known + uint32_t *ids=nullptr; + uint16_t *data=nullptr; + uint32_t ndigis; +}; + +} + +#endif // CUDADataFormats_EcalDigi_interface_DigisCollection_h diff --git a/CUDADataFormats/EcalDigi/src/classes.h b/CUDADataFormats/EcalDigi/src/classes.h new file mode 100644 index 0000000000000..981b7334a8d24 --- /dev/null +++ b/CUDADataFormats/EcalDigi/src/classes.h @@ -0,0 +1,3 @@ +#include "DataFormats/Common/interface/Wrapper.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" diff --git a/CUDADataFormats/EcalDigi/src/classes_def.xml b/CUDADataFormats/EcalDigi/src/classes_def.xml new file mode 100644 index 0000000000000..07beed46d89d0 --- /dev/null +++ b/CUDADataFormats/EcalDigi/src/classes_def.xml @@ -0,0 +1,4 
@@ + + + + diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml index 3bb940370c1f4..61a07973df153 100644 --- a/EventFilter/EcalRawToDigi/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/BuildFile.xml @@ -18,6 +18,11 @@ + + + + + diff --git a/EventFilter/EcalRawToDigi/interface/DeclsForKernels.h b/EventFilter/EcalRawToDigi/interface/DeclsForKernels.h new file mode 100644 index 0000000000000..b9a0e739019ad --- /dev/null +++ b/EventFilter/EcalRawToDigi/interface/DeclsForKernels.h @@ -0,0 +1,118 @@ +#ifndef EventFilter_EcalRawToDigi_interface_DeclsForKernels_h +#define EventFilter_EcalRawToDigi_interface_DeclsForKernels_h + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h" + +#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" + +namespace ecal { namespace raw { + +constexpr auto empty_event_size = EMPTYEVENTSIZE; +constexpr uint32_t nfeds_max = 54; +constexpr uint32_t nbytes_per_fed_max = 10 * 1024; + +struct InputDataCPU { + std::vector> data; + std::vector> offsets; + std::vector> feds; + + void allocate() { + // 10KB per FED (nbytes_per_fed_max) resize + data.resize(nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max); + offsets.resize(nfeds_max, 0); + feds.resize(nfeds_max, 0); + } +}; + +struct ConfigurationParameters { + uint32_t maxChannels; +}; + +struct OutputDataCPU { + // [0] - eb, [1] - ee + std::vector> nchannels; + + void allocate() { + nchannels.resize(2); + } +}; + +struct OutputDataGPU { + uint16_t *samplesEB=nullptr, *samplesEE = nullptr; + uint32_t *idsEB=nullptr, *idsEE = nullptr; + + // FIXME: we should separate max channels parameter for eb and ee + // FIXME: replace hardcoded values + void allocate(ConfigurationParameters const& config) { + cudaCheck( cudaMalloc((void**)&samplesEB, + config.maxChannels * sizeof(uint16_t) * 10) ); + 
cudaCheck( cudaMalloc((void**)&samplesEE, + config.maxChannels * sizeof(uint16_t) * 10) ); + cudaCheck( cudaMalloc((void**)&idsEB, + config.maxChannels * sizeof(uint32_t)) ); + cudaCheck( cudaMalloc((void**)&idsEE, + config.maxChannels * sizeof(uint32_t)) ); + } + + void deallocate(ConfigurationParameters const& config) { + if (samplesEB) { + cudaCheck( cudaFree(samplesEB) ); + cudaCheck( cudaFree(samplesEE) ); + cudaCheck( cudaFree(idsEB) ); + cudaCheck( cudaFree(idsEE) ); + } + } +}; + +struct ScratchDataGPU { + // [0] = EB + // [1] = EE + uint32_t *pChannelsCounter=nullptr; + + void allocate(ConfigurationParameters const& config) { + cudaCheck( cudaMalloc((void**)&pChannelsCounter, + sizeof(uint32_t) * 2) ); + } + + void deallocate(ConfigurationParameters const& config) { + if (pChannelsCounter) { + cudaCheck( cudaFree(pChannelsCounter) ); + } + } +}; + +struct InputDataGPU { + unsigned char *data=nullptr; + uint32_t *offsets=nullptr; + int *feds=nullptr; + + void allocate() { + cudaCheck( cudaMalloc((void**)&data, + sizeof(unsigned char) * nbytes_per_fed_max * nfeds_max) ); + cudaCheck( cudaMalloc((void**)&offsets, + sizeof(uint32_t) * nfeds_max) ); + cudaCheck( cudaMalloc((void**)&feds, + sizeof(int) * nfeds_max) ); + } + + void deallocate() { + if (data) { + cudaCheck( cudaFree(data) ); + cudaCheck( cudaFree(offsets) ); + cudaCheck( cudaFree(feds) ); + } + } +}; + +struct ConditionsProducts { + ElectronicsMappingGPU::Product const& eMappingProduct; +}; + +}} + +#endif // EventFilter_EcalRawToDigi_interface_DeclsForKernels_h diff --git a/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h b/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h new file mode 100644 index 0000000000000..654ac2a42e0fe --- /dev/null +++ b/EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h @@ -0,0 +1,93 @@ +#ifndef EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h +#define EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h + +#include + +#include 
"DataFormats/EcalDetId/interface/EcalSubdetector.h" + +namespace ecal { namespace raw { + +/** \brief Ecal readout channel identification + [31:20] Unused (so far) + [19:13] DCC id + [12:6] tower + [5:3] strip + [2:0] xtal + Index starts from 1 + */ + +class ElectronicsIdGPU { +public: + /** Default constructor -- invalid value */ + constexpr ElectronicsIdGPU() : id_{0xFFFFFFFFu} {} + /** from raw */ + constexpr ElectronicsIdGPU(uint32_t id) : id_{id} {} + /** Constructor from dcc,tower,strip,xtal **/ + constexpr ElectronicsIdGPU( + uint8_t const dccid, uint8_t const towerid, + uint8_t const stripid, uint8_t const xtalid) + : id_{static_cast( + (xtalid & 0x7) | ((stripid & 0x7) << 3) | + ((towerid & 0x7F) << 6) | ((dccid & 0x7F) << 13))} + {} + + constexpr uint32_t operator()() { return id_; } + constexpr uint32_t rawId() const { return id_; } + + /// get the DCC (Ecal Local DCC value not global one) id + constexpr uint8_t dccId() const { return (id_ >> 13) & 0x7F; } + /// get the tower id + constexpr uint8_t towerId() const { return (id_ >> 6) & 0x7F; } + /// get the strip id + constexpr uint8_t stripId() const { return (id_ >> 3) & 0x7; } + /// get the channel id + constexpr uint8_t xtalId() const { return (id_ & 0x7); } + + /// get the subdet + //EcalSubdetector subdet() const; + + /// get a fast, compact, unique index for linear lookups (maximum value = 4194303) + constexpr uint32_t linearIndex() const { return id_ & 0x3FFFFF; } + + /// so far for EndCap only : + //int channelId() const; // xtal id between 1 and 25 + + static constexpr int kTowersInPhi = 4; // see EBDetId + static constexpr int kCrystalsInPhi = 20; // see EBDetId + + static constexpr uint8_t MAX_DCCID = 54; //To be updated with correct and final number + static constexpr uint8_t MIN_DCCID = 1; + static constexpr uint8_t MAX_TOWERID = 70; + static constexpr uint8_t MIN_TOWERID = 1; + static constexpr uint8_t MAX_STRIPID = 5; + static constexpr uint8_t MIN_STRIPID = 1; + static constexpr uint8_t 
MAX_CHANNELID = 25; + static constexpr uint8_t MIN_CHANNELID = 1; + static constexpr uint8_t MAX_XTALID = 5; + static constexpr uint8_t MIN_XTALID = 1; + + static constexpr int MIN_DCCID_EEM = 1; + static constexpr int MAX_DCCID_EEM = 9; + static constexpr int MIN_DCCID_EBM = 10; + static constexpr int MAX_DCCID_EBM = 27; + static constexpr int MIN_DCCID_EBP = 28; + static constexpr int MAX_DCCID_EBP = 45; + static constexpr int MIN_DCCID_EEP = 46; + static constexpr int MAX_DCCID_EEP = 54; + + static constexpr int DCCID_PHI0_EBM = 10; + static constexpr int DCCID_PHI0_EBP = 28; + + static constexpr int kDCCChannelBoundary = 17; + static constexpr int DCC_EBM = 10; // id of the DCC in EB- which contains phi=0 deg. + static constexpr int DCC_EBP = 28; // id of the DCC in EB+ which contains phi=0 deg. + static constexpr int DCC_EEM = 1; // id of the DCC in EE- which contains phi=0 deg. + static constexpr int DCC_EEP = 46; // id of the DCC in EE+ which contains phi=0 deg. + +private: + uint32_t id_; +}; + +}} + +#endif // EventFilter_EcalRawToDigi_interface_ElectronicsIdGPU_h diff --git a/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h b/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h new file mode 100644 index 0000000000000..91dacbd883473 --- /dev/null +++ b/EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h @@ -0,0 +1,45 @@ +#ifndef EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h +#define EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h + +#include "CondFormats/EcalObjects/interface/EcalMappingElectronics.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +namespace ecal { namespace raw { + +class ElectronicsMappingGPU { +public: + struct Product { + ~Product(); + uint32_t *eid2did; + }; + +#ifndef __CUDACC__ + + // build the host-side eid -> did lookup table from the electronics mapping + ElectronicsMappingGPU(EcalMappingElectronics const&); + + // will call 
deallocation for Product thru ~Product + ~ElectronicsMappingGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // + static std::string name() { return std::string{"ecalElectronicsMappingGPU"}; } + +private: + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> eid2did_; + + cms::cuda::ESProduct product_; +#endif +}; + +}} + +#endif // EventFilter_EcalRawToDigi_interface_ElectronicsMappingGPU_h diff --git a/EventFilter/EcalRawToDigi/interface/UnpackGPU.h b/EventFilter/EcalRawToDigi/interface/UnpackGPU.h new file mode 100644 index 0000000000000..8c80354699488 --- /dev/null +++ b/EventFilter/EcalRawToDigi/interface/UnpackGPU.h @@ -0,0 +1,17 @@ +#ifndef EventFilter_EcalRawToDigi_interface_UnpackGPU_h +#define EventFilter_EcalRawToDigi_interface_UnpackGPU_h + +#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h" + +namespace ecal { namespace raw { + +// FIXME: bundle up uint32_t values +void entryPoint( + InputDataCPU const&, InputDataGPU&, + OutputDataGPU&, ScratchDataGPU&, + OutputDataCPU&, ConditionsProducts const&, + cudaStream_t, uint32_t const, uint32_t const); + +}} + +#endif // EventFilter_EcalRawToDigi_interface_UnpackGPU_h diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml index c2bfbb6adef14..296a6b2461f8c 100644 --- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml @@ -14,6 +14,12 @@ + + + + + + diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc new file mode 100644 index 0000000000000..6f488053b204b --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc @@ -0,0 +1,152 @@ +#include + +// framework +#include "FWCore/Framework/interface/stream/EDProducer.h" +//#include 
"HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" +//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" + +// algorithm specific + +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" +#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" + +#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h" + +#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" + +#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h" +#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" + +class EcalCPUDigisProducer + : public edm::stream::EDProducer +{ +public: + explicit EcalCPUDigisProducer(edm::ParameterSet const& ps); + ~EcalCPUDigisProducer() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, + edm::EventSetup const&, + edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + +private: + edm::EDGetTokenT> digisInEBToken_, + digisInEEToken_; + edm::EDPutTokenT digisOutEBToken_; + edm::EDPutTokenT digisOutEEToken_; + + // FIXME better way to pass pointers from acquire to produce? 
+ std::vector> idsebtmp, idseetmp; + std::vector> dataebtmp, dataeetmp; +}; + +void EcalCPUDigisProducer::fillDescriptions( + edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("digisInLabelEB", + edm::InputTag{"ecalRawToDigiGPU", "ebDigisGPU"}); + desc.add("digisInLabelEE", + edm::InputTag{"ecalRawToDigiGPU", "eeDigisGPU"}); + desc.add("digisOutLabelEB", "ebDigis"); + desc.add("digisOutLabelEE", "eeDigis"); + + std::string label = "ecalCPUDigisProducer"; + confDesc.add(label, desc); +} + +EcalCPUDigisProducer::EcalCPUDigisProducer( + const edm::ParameterSet& ps) + : digisInEBToken_{consumes>( + ps.getParameter("digisInLabelEB"))} + , digisInEEToken_{consumes>( + ps.getParameter("digisInLabelEE"))} + , digisOutEBToken_{produces( + ps.getParameter("digisOutLabelEB"))} + , digisOutEEToken_{produces( + ps.getParameter("digisOutLabelEE"))} +{} + +EcalCPUDigisProducer::~EcalCPUDigisProducer() {} + +void EcalCPUDigisProducer::acquire( + edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder taskHolder) +{ + // retrieve data/ctx + auto const& ebdigisProduct = event.get(digisInEBToken_); + auto const& eedigisProduct = event.get(digisInEEToken_); + cms::cuda::ScopedContextAcquire ctx{ebdigisProduct, std::move(taskHolder)}; + auto const& ebdigis = ctx.get(ebdigisProduct); + auto const& eedigis = ctx.get(eedigisProduct); + + // resize out tmp buffers + // FIXME remove hardcoded values + idsebtmp.resize(ebdigis.ndigis); + dataebtmp.resize(ebdigis.ndigis * 10); + idseetmp.resize(eedigis.ndigis); + dataeetmp.resize(eedigis.ndigis * 10); + + // enqueue transfers + cudaCheck( cudaMemcpyAsync(dataebtmp.data(), + ebdigis.data, + dataebtmp.size() * sizeof(uint16_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(dataeetmp.data(), + eedigis.data, + dataeetmp.size() * sizeof(uint16_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(idsebtmp.data(), + 
ebdigis.ids, + idsebtmp.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(idseetmp.data(), + eedigis.ids, + idseetmp.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); +} + +void EcalCPUDigisProducer::produce( + edm::Event& event, + edm::EventSetup const& setup) +{ + // output collections + auto digisEB = std::make_unique(); + auto digisEE = std::make_unique(); + digisEB->resize(idsebtmp.size()); + digisEE->resize(idseetmp.size()); + + // cast constness away + // use pointers to buffers instead of move operator= semantics + // cause we have different allocators in there... + auto *dataEB = const_cast(digisEB->data().data()); + auto *dataEE = const_cast(digisEE->data().data()); + auto *idsEB = const_cast(digisEB->ids().data()); + auto *idsEE = const_cast(digisEE->ids().data()); + + // copy data + std::memcpy(dataEB, dataebtmp.data(), dataebtmp.size() * sizeof(uint16_t)); + std::memcpy(dataEE, dataeetmp.data(), dataeetmp.size() * sizeof(uint16_t)); + std::memcpy(idsEB, idsebtmp.data(), idsebtmp.size() * sizeof(uint32_t)); + std::memcpy(idsEE, idseetmp.data(), idseetmp.size() * sizeof(uint32_t)); + + event.put(digisOutEBToken_, std::move(digisEB)); + event.put(digisOutEEToken_, std::move(digisEE)); +} + +DEFINE_FWK_MODULE(EcalCPUDigisProducer); diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h new file mode 100644 index 0000000000000..2aa5e3bc8fe89 --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducerGPU.h @@ -0,0 +1,44 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalRawESProducerGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalRawESProducerGPU_h + +#include "FWCore/Framework/interface/ESProducer.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Utilities/interface/typelookup.h" +#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h" +#include 
"FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ModuleFactory.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include + +template +class EcalRawESProducerGPU : public edm::ESProducer { +public: + explicit EcalRawESProducerGPU(edm::ParameterSet const& ps) { + auto const label = ps.getParameter("label"); + auto name = ps.getParameter("ComponentName"); + auto cc = setWhatProduced(this, name); + cc.setConsumes(token_, edm::ESInputTag{"", label}); + } + + std::unique_ptr produce(Record const& record) { + // retrieve conditions in old format + auto sourceProduct = record.getTransientHandle(token_); + + return std::make_unique(*sourceProduct); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + std::string label = Target::name() + "ESProducer"; + desc.add("ComponentName", ""); + desc.add("label", "")->setComment("Product Label"); + confDesc.add(label, desc); + } + +private: + edm::ESGetToken token_; +}; + +#endif diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc new file mode 100644 index 0000000000000..6538cb0f32816 --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc @@ -0,0 +1,14 @@ +#include "EcalRawESProducerGPU.h" + +#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h" + +#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" + +#include + +using EcalElectronicsMappingGPUESProducer = EcalRawESProducerGPU< + ecal::raw::ElectronicsMappingGPU, + EcalMappingElectronics, + EcalMappingElectronicsRcd>; + +DEFINE_FWK_EVENTSETUP_MODULE(EcalElectronicsMappingGPUESProducer); diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc new file mode 100644 index 0000000000000..3198017117cb6 --- /dev/null +++ 
b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc @@ -0,0 +1,175 @@ +#include + +// framework +#include "FWCore/Framework/interface/stream/EDProducer.h" +//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" +//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" + +// algorithm specific + +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" +#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" + +#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h" + +#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" + +#include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h" +#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" + +class EcalRawToDigiGPU + : public edm::stream::EDProducer +{ +public: + explicit EcalRawToDigiGPU(edm::ParameterSet const& ps); + ~EcalRawToDigiGPU() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, + edm::EventSetup const&, + edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + +private: + edm::EDGetTokenT rawDataToken_; + edm::EDPutTokenT> digisEBToken_, + digisEEToken_; + + cms::cuda::ContextState cudaState_; + + std::vector fedsToUnpack_; + + ecal::raw::ConfigurationParameters config_; + // FIXME move this to use raii + ecal::raw::InputDataCPU inputCPU_; + ecal::raw::InputDataGPU inputGPU_; + ecal::raw::OutputDataGPU outputGPU_; + ecal::raw::ScratchDataGPU scratchGPU_; + ecal::raw::OutputDataCPU 
outputCPU_; +}; + +void EcalRawToDigiGPU::fillDescriptions( + edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("InputLabel", edm::InputTag("rawDataCollector")); + std::vector feds(54); + for (uint32_t i=0; i<54; ++i) + feds[i] = i+601; + desc.add>("FEDs", feds); + desc.add("maxChannels", 20000); + desc.add("digisLabelEB", "ebDigisGPU"); + desc.add("digisLabelEE", "eeDigisGPU"); + + std::string label = "ecalRawToDigiGPU"; + confDesc.add(label, desc); +} + +EcalRawToDigiGPU::EcalRawToDigiGPU( + const edm::ParameterSet& ps) + : rawDataToken_{consumes(ps.getParameter( + "InputLabel"))} + , digisEBToken_{produces>( + ps.getParameter("digisLabelEB"))} + , digisEEToken_{produces>( + ps.getParameter("digisLabelEE"))} + , fedsToUnpack_{ps.getParameter>("FEDs")} +{ + config_.maxChannels = ps.getParameter("maxChannels"); + + inputCPU_.allocate(); + inputGPU_.allocate(); + outputGPU_.allocate(config_); + scratchGPU_.allocate(config_); + outputCPU_.allocate(); +} + +EcalRawToDigiGPU::~EcalRawToDigiGPU() { + inputGPU_.deallocate(); + outputGPU_.deallocate(config_); + scratchGPU_.deallocate(config_); +} + +void EcalRawToDigiGPU::acquire( + edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) +{ + // raii + cms::cuda::ScopedContextAcquire ctx{ + event.streamID(), std::move(holder), cudaState_}; + + // conditions + edm::ESHandle eMappingHandle; + setup.get().get(eMappingHandle); + auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream()); + + // bundle up conditions + ecal::raw::ConditionsProducts conditions{eMappingProduct}; + + // event data + edm::Handle rawDataHandle; + event.getByToken(rawDataToken_, rawDataHandle); + + // iterate over feds + // TODO: another idea + // - loop over all feds to unpack and enqueue cuda memcpy + // - accumulate the sizes + // - after the loop launch cuda memcpy for sizes + // - enqueue the kernel + uint32_t currentCummOffset = 0; + uint32_t 
counter = 0; + for (auto const& fed : fedsToUnpack_) { + //std::cout << "fed: " << fed << std::endl; + auto const& data = rawDataHandle->FEDData(fed); + auto const nbytes = data.size(); + + // skip empty feds + if (nbytes < ecal::raw::empty_event_size) + continue; + + // copy raw data into plain buffer + std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes); + // set the offset in bytes from the start + inputCPU_.offsets[counter] = currentCummOffset; + inputCPU_.feds[counter] = fed; + + // this is the current offset into the vector + currentCummOffset += nbytes; + ++counter; + } + + ecal::raw::entryPoint( + inputCPU_, inputGPU_, outputGPU_, scratchGPU_, outputCPU_, + conditions, ctx.stream(), counter, currentCummOffset); +} + +void EcalRawToDigiGPU::produce( + edm::Event& event, + edm::EventSetup const& setup) +{ + cms::cuda::ScopedContextProduce ctx{cudaState_}; + + // get the number of channels + auto const nchannelsEB = outputCPU_.nchannels[0]; + auto const nchannelsEE = outputCPU_.nchannels[1]; + + ecal::DigisCollection digisEB{outputGPU_.idsEB, + outputGPU_.samplesEB, nchannelsEB}; + ecal::DigisCollection digisEE{outputGPU_.idsEE, + outputGPU_.samplesEE, nchannelsEE}; + + ctx.emplace(event, digisEBToken_, std::move(digisEB)); + ctx.emplace(event, digisEEToken_, std::move(digisEE)); +} + +DEFINE_FWK_MODULE(EcalRawToDigiGPU); diff --git a/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc new file mode 100644 index 0000000000000..c09a963b62a1d --- /dev/null +++ b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc @@ -0,0 +1,62 @@ +#include "EventFilter/EcalRawToDigi/interface/ElectronicsMappingGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "DataFormats/EcalDetId/interface/EcalElectronicsId.h" + +namespace ecal { namespace raw { + +// TODO: 0x3FFFFF * 4B ~= 16MB +// tmp solution for 
linear mapping of eid -> did +ElectronicsMappingGPU::ElectronicsMappingGPU(EcalMappingElectronics const& mapping) + : eid2did_(0x3FFFFF) +{ + + // fill in eb + // TODO: EB vector is actually empty + auto const& barrelValues = mapping.barrelItems(); + for (unsigned int i=0; ieid2did_.size() * sizeof(uint32_t)) ); + + // transfer + cudaCheck( cudaMemcpyAsync(product.eid2did, + this->eid2did_.data(), + this->eid2did_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; +} + +}} + +TYPELOOKUP_DATA_REG(ecal::raw::ElectronicsMappingGPU); diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu new file mode 100644 index 0000000000000..8c9f05535b70d --- /dev/null +++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu @@ -0,0 +1,476 @@ +#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" +#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h" + +namespace ecal { namespace raw { + +__forceinline__ __device__ +void print_raw_buffer( + uint8_t const* const buffer, + uint32_t const nbytes, uint32_t const nbytes_per_row = 20) { + for (uint32_t i=0; i0) + printf("\n"); + printf("%02X ", buffer[i]); + } +} + +__forceinline__ __device__ +void print_first3bits(uint64_t const* buffer, uint32_t size) { + for (uint32_t i=0; i> 61) & 0x1; + uint8_t const b62 = (buffer[i] >> 62) & 0x1; + uint8_t const b63 = (buffer[i] >> 63) & 0x1; + printf("[word: %u] %u%u%u\n", i, + b63, b62, b61); + } +} + +__forceinline__ __device__ +bool is_barrel(uint8_t dccid) { + return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && + dccid <= ElectronicsIdGPU::MAX_DCCID_EBP; +} + +__forceinline__ __device__ +uint8_t fed2dcc(int fed) { return static_cast(fed - 600); } + +__forceinline__ __device__ +int zside_for_eb(ElectronicsIdGPU const& eid) { + int dcc = eid.dccId(); + return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && + dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) + ? 
-1 + : 1; + /* + if ((dcc >= MIN_DCCID_EBP && dcc <= MAX_DCCID_EBP)) + return +1; + */ +} + +__forceinline__ __device__ +bool is_synced_towerblock( + uint16_t const dccbx, + uint16_t const bx, + uint16_t const dccl1, + uint16_t const l1) { + bool const bxsync = (bx==0 && dccbx==3564) || (bx==dccbx && dccbx!=3564); + bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff)); + return bxsync && l1sync; +} + +__forceinline__ __device__ +bool right_tower_for_eb(int tower) { + // for EB, two types of tower (LVRB top/bottom) + if ((tower > 12 && tower < 21) || + (tower > 28 && tower < 37) || + (tower > 44 && tower < 53) || + (tower > 60 && tower < 69)) + return true; + else + return false; +} + +__forceinline__ __device__ +uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) { + // as in Geometry/EcalMaping/.../EcalElectronicsMapping + auto const dcc = eid.dccId(); + auto const tower = eid.towerId(); + auto const strip = eid.stripId(); + auto const xtal = eid.xtalId(); + + int smid = 0; + int iphi = 0; + bool EBPlus = (zside_for_eb(eid) > 0); + bool EBMinus = !EBPlus; + + if (zside_for_eb(eid) < 0) { + smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM; + iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi; + iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi); + } else { + smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP; + iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi; + iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1); + } + + bool RightTower = right_tower_for_eb(tower); + int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1; + if (RightTower) { + ieta += (strip - 1); + if (strip % 2 == 1) { + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - (xtal - 1)) + 1; + } else { + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) + 1; + } + } else { + ieta += 4 - (strip - 1); + if (strip % 2 == 1) { + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) 
+ 1; + } else { + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - (xtal - 1)) + 1; + } + } + + if (zside_for_eb(eid) < 0) + ieta = -ieta; + + DetId did{DetId::Ecal, EcalBarrel}; + return did.rawId() | + ((ieta > 0) + ? (0x10000 | (ieta << 9)) + : ((-ieta) << 9)) | (iphi & 0x1FF); +} + +__forceinline__ __device__ +int adc(uint16_t sample) { return sample & 0xfff; } +__forceinline__ __device__ +int gainId(uint16_t sample) { return (sample>>12) & 0x3; } + +template +__global__ +void kernel_unpack_test( + unsigned char const* __restrict__ data, + uint32_t const* __restrict__ offsets, + int const* __restrict__ feds, + uint16_t *samplesEB, + uint16_t *samplesEE, + uint32_t *idsEB, + uint32_t *idsEE, + uint32_t *pChannelsCounterEBEE, + uint32_t const* eid2did, + uint32_t const nbytesTotal) { + // indices + auto const ifed = blockIdx.x; + + // FIXME: use only the very first fed + //if (ifed!=10) return; + + // offset in bytes + auto const offset = offsets[ifed]; + // fed id + auto const fed = feds[ifed]; + auto const isBarrel = is_barrel(static_cast(fed - 600)); + // size + auto const size = ifed==gridDim.x-1 ? nbytesTotal - offset : offsets[ifed+1] - offset; + auto *samples = isBarrel ? samplesEB : samplesEE; + auto *ids = isBarrel ? idsEB : idsEE; + auto *pChannelsCounter = isBarrel + ? 
&pChannelsCounterEBEE[0] + : &pChannelsCounterEBEE[1]; + + // FIXME: debugging + //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size); + + // offset to the right raw buffer + uint64_t const* buffer = reinterpret_cast(data + offset); + + // dump first 3 bits for each 64-bit word + //print_first3bits(buffer, size / 8); + + // + // fed header + // + //print_raw_buffer(reinterpret_cast(buffer), 8); + //printf("\n"); + auto const fed_header = buffer[0]; + uint32_t fed_id = (fed_header >> 8) & 0xfff; + uint32_t bx = (fed_header >> 20) & 0xfff; + uint32_t lv1 = (fed_header >> 32) & 0xffffff; + uint8_t trigger_type = (fed_header >> 56) & 0xf; + uint8_t const bid_fed_header = (fed_header >> 60) & 0xf; + //printf("fed = %d fed_id = %u bx = %u lv1 = %u tt=%hhu bid = 0x%u\n", + // fed, fed_id, bx, lv1, trigger_type, bid_fed_header); + + // + // dcc header: w1 + // + //print_raw_buffer(reinterpret_cast(buffer + 1), 8); + //printf("\n"); + auto const dcc_header = buffer[1]; + uint32_t event_length = dcc_header & 0xffffff; + uint8_t dcc_errors = (dcc_header >> 24) & 0xff; + uint32_t run_number = (dcc_header >> 32) & 0xffffff; + uint8_t const word_dcc = (dcc_header >> 56) & 0x3f; + uint8_t const bid_dcc_header = (dcc_header >> 62) & 0x3; + //printf("fed = %d size = %u event_length = %u dcc_errors = %u run_number = %u word_dcc = 0x%u bid_dcc_header = 0x%u\n", + // fed, size, 8*event_length, static_cast(dcc_errors), run_number, static_cast(word_dcc), static_cast(bid_dcc_header)); + + // + // dcc header w2 + // + //print_raw_buffer(reinterpret_cast(buffer + 2), 8); + //printf("\n"); + auto const w2 = buffer[2]; + uint32_t const run_type = w2 & 0xffffffff; + uint16_t const det_trigger_type = (w2 >> 32) & 0xffff; + uint8_t w2_dcc = (w2 >> 56) & 0x3f; + uint8_t w2_bid_dcc = (w2 >> 62) & 0x3; + //printf("run_type = %u det_trigger_type = %u w2_dcc = %u w2_bid_dcc = %u\n", + // run_type, det_trigger_type, w2_dcc, w2_bid_dcc); + + // + // dcc header w3 + // + auto 
const w3 = buffer[3]; + //print_raw_buffer(reinterpret_cast(&w3), 8); + //printf("\n"); + uint32_t const orbit_number = w3 & 0xffffffff; + uint8_t const sr = (w3 >> 32) & 0x1; + uint8_t const zs = (w3 >> 33) & 0x1; + uint8_t const tzs = (w3 >> 34) & 0x1; + uint8_t const sr_chstatus = (w3 >> 36) & 0xf; + uint8_t const tcc_chstatus1 = (w3 >> 40) & 0xf; + uint8_t const tcc_chstatus2 = (w3 >> 44) & 0xf; + uint8_t const tcc_chstatus3 = (w3 >> 48) & 0xf; + uint8_t const tcc_chstatus4 = (w3 >> 52) & 0xf; + uint8_t const w3_dcc = (w3 >> 56) & 0x3f; + uint8_t const w3_bid_dcc = (w3 >> 62) & 0x3; + //printf("orbit_number = %u sr = %u zs = %u tzs = %u sr_chstatus = %u\n", + // orbit_number, static_cast(sr), static_cast(zs), + // static_cast(tzs), static_cast(sr_chstatus)); + //printf("tcc_chstatus1 = %u tcc_chstatus2 = %u tcc_chstatus3 = %u tcc_chstatus4 = %u\n", + // static_cast(tcc_chstatus1), static_cast(tcc_chstatus2), + // static_cast(tcc_chstatus3), static_cast(tcc_chstatus4)); + + // + // w4 - w8 (including 5 64-bit words) + // + /* + for (uint32_t i=0; i<5; i++) { + auto const wi = buffer[4 + i]; + for (uint32_t i=0; i<14; i++) { + uint8_t value_i = (wi >> i*4) & 0xf; + printf("fe_chstatus_%u = %u ", i, static_cast(value_i)); + } + uint8_t wi_dcc = (wi >> 56) & 0x3f; + uint8_t wi_bid_dcc = (wi >> 62) & 0x3; + printf("wi_dcc = %u wi_bid-dcc = %u\n", + static_cast(wi_dcc), static_cast(wi_bid_dcc)); + printf("\n"); + } + */ + + // + // TCC block + // + { + auto const w = buffer[9]; + //print_raw_buffer(reinterpret_cast(&w), 8); + //printf("\n"); + uint8_t const tccid = w & 0xff; + uint8_t const bxlocal = (w >> 16) & 0xff; + uint8_t const e0 = (w >> 17) & 0x1; + uint8_t const w_bfield_0 = (w >> 29) & 0x7; + uint16_t const lv1local = (w >> 32) & 0xfff; + uint8_t const e1 = (w >> 44) & 0x1; + uint8_t const ntt = (w >> 48) & 0x7f; + uint8_t const ntimesamples = (w >> 55) & 0xf; + uint8_t const le0 = (w >> 59) & 0x1; + uint8_t const le1 = (w >> 60) & 0x1; + uint8_t const 
w_bfield_1 = (w >> 61) & 0x7; + //printf("tccid = %u bxlocal = %u e0 = %u w_bitfield_0 = %u lv1local = %u\n", + // tccid, bxlocal, e0, w_bfield_0, lv1local); + //printf("e1 = %u ntt = %u ntimesamples = %u le0 = %u le1 = %u w_bfield_1 = %u\n", + // e1, ntt, ntimesamples, le0, le1, w_bfield_1); + } + + // 9 for fed + dcc header + // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block + // 6 for SR block size + //print_first3bits(buffer, size / 8); + //auto const* tower_block_start = buffer + 9 + 36 + 6; + //print_first3bits(tower_block_start, size / 8 - 10 - 36 - 6); + + // + // print Tower block headers + // + uint8_t ntccblockwords = isBarrel ? 18 : 36; + auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6; + auto const* trailer = buffer + (size / 8 - 1); + auto const* current_tower_block = tower_blocks_start; + while (current_tower_block != trailer) { + auto const w = *current_tower_block; + uint8_t ttid = w & 0xff; + uint8_t ntimesamples = (w >> 8) & 0x7f; + uint16_t bxlocal = (w >> 16) & 0xfff; + uint8_t e0 = (w >> 28) & 0x1; + uint8_t w_bfield_0 = (w >> 30) & 0x3; + uint16_t lv1local = (w >> 32) & 0xfff; + uint8_t e1 = (w >> 44) & 0x1; + uint16_t block_length = (w >> 48) & 0x1ff; + uint16_t w_bfield_1 = (w >> 62) & 0x3; + + // + uint16_t const dccbx = bx & 0xfff; + uint16_t const dccl1 = lv1 & 0xfff; + //printf("dccbx = %u bxlocal = %u dccl1 = %u l1local = %u\n", + // dccbx, bxlocal, dccl1, lv1local); + if (!is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) { + current_tower_block += block_length; + continue; + } + + //printf("ttid = %u ntimesamples = %u\ bxlocal = %u e0 = %u w_bfield_0 = %u\n", + // ttid, ntimesamples, bxlocal, e0, w_bfield_0); + //printf("lv1local = %u e1 = %u block_length = %u w_bfield-1 = %u\n", + // lv1local, e1, block_length, w_bfield_1); + + // go thru all the channels + // get the next channel coordinates + uint32_t nchannels = (block_length - 1) / 3; + + // 1 threads per channel in this block + for (uint32_t ich=0; ich 
leave the loop + if (i_to_access>=nchannels) break; + + // inc the channel's counter and get the pos where to store + auto const wdata = current_tower_block[1 + i_to_access*3]; + uint8_t const stripid = wdata & 0x7; + uint8_t const xtalid = (wdata >> 4) & 0x7; + ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid}; + auto const didraw = isBarrel + ? compute_ebdetid(eid) + : eid2did[eid.linearIndex()]; + // FIXME: what kind of channels are these guys + if (didraw == 0) + continue; + + // get samples + uint16_t sampleValues[10]; + sampleValues[0] = (wdata >> 16) & 0x3fff; + sampleValues[1] = (wdata >> 32) & 0x3fff; + sampleValues[2] = (wdata >> 48) & 0x3fff; + auto const wdata1 = current_tower_block[2+i_to_access*3]; + sampleValues[3] = wdata1 & 0x3fff; + sampleValues[4] = (wdata1 >> 16) & 0x3fff; + sampleValues[5] = (wdata1 >> 32) & 0x3fff; + sampleValues[6] = (wdata1 >> 48) & 0x3fff; + auto const wdata2 = current_tower_block[3+i_to_access*3]; + sampleValues[7] = wdata2 & 0x3fff; + sampleValues[8] = (wdata2 >> 16) & 0x3fff; + sampleValues[9] = (wdata2 >> 32) & 0x3fff; + //printf("stripid = %u xtalid = %u\n", stripid, xtalid); + + // check gain + bool isSaturation = true; + short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1}; + for (uint32_t si=0; si<10; si++) { + if (gainId(sampleValues[si]) == 0) { + firstGainZeroSampID = si; + firstGainZeroSampADC = adc(sampleValues[si]); + break; + } + } + if (firstGainZeroSampID!=-1) { + unsigned int plateauEnd = std::min(10u ,(unsigned int)(firstGainZeroSampID+5)); + for (unsigned int s=firstGainZeroSampID; s gainId(sampleValues[si])) && + numGain<5) gainSwitchError=true; + if (gainId(sampleValues[si-1]) == gainId(sampleValues[si])) numGain++; + else numGain=1; + } + if (gainSwitchError) + continue; + } + + auto const pos = atomicAdd(pChannelsCounter, 1); + + // store to global + ids[pos] = didraw; + samples[pos*10] = sampleValues[0]; + samples[pos*10 + 1] = sampleValues[1]; + samples[pos*10 + 2] = sampleValues[2]; + 
samples[pos*10 + 3] = sampleValues[3]; + samples[pos*10 + 4] = sampleValues[4]; + samples[pos*10 + 5] = sampleValues[5]; + samples[pos*10 + 6] = sampleValues[6]; + samples[pos*10 + 7] = sampleValues[7]; + samples[pos*10 + 8] = sampleValues[8]; + samples[pos*10 + 9] = sampleValues[9]; + } + + current_tower_block += block_length; + } +} + +void entryPoint( + InputDataCPU const& inputCPU, + InputDataGPU& inputGPU, + OutputDataGPU& outputGPU, + ScratchDataGPU& scratchGPU, + OutputDataCPU& outputCPU, + ConditionsProducts const& conditions, + cudaStream_t cudaStream, + uint32_t const nfedsWithData, + uint32_t const nbytesTotal) { + // transfer + cudaCheck( cudaMemcpyAsync(inputGPU.data, + inputCPU.data.data(), + nbytesTotal * sizeof(unsigned char), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(inputGPU.offsets, + inputCPU.offsets.data(), + nfedsWithData * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemsetAsync(scratchGPU.pChannelsCounter, + 0, + sizeof(uint32_t) * 2, // EB + EE + cudaStream) ); + cudaCheck( cudaMemcpyAsync(inputGPU.feds, + inputCPU.feds.data(), + nfedsWithData * sizeof(int), + cudaMemcpyHostToDevice, + cudaStream) ); + + kernel_unpack_test<32><<>>( + inputGPU.data, + inputGPU.offsets, + inputGPU.feds, + outputGPU.samplesEB, + outputGPU.samplesEE, + outputGPU.idsEB, + outputGPU.idsEE, + scratchGPU.pChannelsCounter, + conditions.eMappingProduct.eid2did, + nbytesTotal + ); + cudaCheck( cudaGetLastError() ); + + // transfer the counters for how many eb and ee channels we got + cudaCheck( cudaMemcpyAsync(outputCPU.nchannels.data(), + scratchGPU.pChannelsCounter, + sizeof(uint32_t) * 2, + cudaMemcpyDeviceToHost, + cudaStream) ); +} + +}} From dd8cd82dbfbe66ebb6f4016590fb57d848a49e14 Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Mon, 30 Mar 2020 15:37:21 +0200 Subject: [PATCH 02/30] adding validation source --- EventFilter/EcalRawToDigi/bin/BuildFile.xml | 7 + 
.../makeEcalRaw2DigiGpuValidationPlots.cpp | 224 ++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 EventFilter/EcalRawToDigi/bin/BuildFile.xml create mode 100644 EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp diff --git a/EventFilter/EcalRawToDigi/bin/BuildFile.xml b/EventFilter/EcalRawToDigi/bin/BuildFile.xml new file mode 100644 index 0000000000000..792fe438d8799 --- /dev/null +++ b/EventFilter/EcalRawToDigi/bin/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp new file mode 100644 index 0000000000000..9fc9ec26e3714 --- /dev/null +++ b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" + +int main(int argc, char *argv[]) { + if (argc<3) { + std::cout << "run with: ./ \n"; + exit(0); + } + + // branches to use + edm::Wrapper *wgpuEB=nullptr, *wcpuEB=nullptr; + edm::Wrapper *wgpuEE=nullptr, *wcpuEE=nullptr; + + std::string inFileName{argv[1]}; + std::string outFileName{argv[2]}; + + // prep output + TFile rfout{outFileName.c_str(), "recreate"}; + + int const nbins = 400; + float const last = 4096.; + auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last); + auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last); + auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last); + auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last); + + auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4); + auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4); + auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4); + auto hGainEECPU = new TH1D("hGainEECPU", 
"hGainEECPU", 4, 0, 4); + + auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU", + nbins, 0, last, nbins, 0, last); + auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU", + nbins, 0, last, nbins, 0, last); + auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU", + 4, 0, 4, 4, 0, 4); + auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU", + 4, 0, 4, 4, 0, 4); + + // prep input + TFile rfin{inFileName.c_str()}; + TTree *rt = (TTree*)rfin.Get("Events"); + rt->SetBranchAddress("EBDigiCollection_ecalCPUDigisProducer_ebDigis_RECO.", + &wgpuEB); + rt->SetBranchAddress("EEDigiCollection_ecalCPUDigisProducer_eeDigis_RECO.", + &wgpuEE); + rt->SetBranchAddress("EBDigiCollection_ecalDigis_ebDigis_RECO.", + &wcpuEB); + rt->SetBranchAddress("EEDigiCollection_ecalDigis_eeDigis_RECO.", + &wcpuEE); + + // accumulate + auto const nentries = rt->GetEntries(); + std::cout << ">>> nentries = " << nentries << std::endl; + for (int ie=0; ieGetEntry(ie); + + auto const ngpuebs = wgpuEB->bareProduct().size(); + auto const ncpuebs = wcpuEB->bareProduct().size(); + auto const ngpuees = wgpuEE->bareProduct().size(); + auto const ncpuees = wcpuEE->bareProduct().size(); + + if (ngpuebs!=ncpuebs or ngpuees!=ncpuees) { + std::cerr << "*** mismatch in ndigis: " + << "ie = " << ie + << " ngpuebs = " << ngpuebs + << " ncpuebs = " << ncpuebs + << " ngpuees = " << ngpuees + << " ncpuees = " << ncpuees + << std::endl; + + // this is a must for now + //assert(ngpuebs==ncpuebs); + //assert(ngpuees==ncpuees); + } + + // assume identical sizes + auto const& idsgpuEB = wgpuEB->bareProduct().ids(); + auto const& datagpuEB = wgpuEB->bareProduct().data(); + auto const& idscpuEB = wcpuEB->bareProduct().ids(); + auto const& datacpuEB = wcpuEB->bareProduct().data(); + for (uint32_t ieb=0; iebFill(sampleGPU.adc()); + hGainEBGPU->Fill(sampleGPU.gainId()); + hADCEBCPU->Fill(sampleCPU.adc()); + hGainEBCPU->Fill(sampleCPU.gainId()); + 
hADCEBGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc()); + hGainEBGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId()); + } + } + + auto const& idsgpuEE = wgpuEE->bareProduct().ids(); + auto const& datagpuEE = wgpuEE->bareProduct().data(); + auto const& idscpuEE = wcpuEE->bareProduct().ids(); + auto const& datacpuEE = wcpuEE->bareProduct().data(); + for (uint32_t iee=0; ieeFill(sampleGPU.adc()); + hGainEEGPU->Fill(sampleGPU.gainId()); + hADCEECPU->Fill(sampleCPU.adc()); + hGainEECPU->Fill(sampleCPU.gainId()); + hADCEEGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc()); + hGainEEGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId()); + } + } + } + + { + TCanvas c{"plots", "plots", 4200, 6200}; + c.Divide(2, 4); + c.cd(1); + { + gPad->SetLogy(); + hADCEBCPU->SetLineColor(kBlack); + hADCEBCPU->SetLineWidth(1.); + hADCEBCPU->Draw(""); + hADCEBGPU->SetLineColor(kBlue); + hADCEBGPU->SetLineWidth(1.); + hADCEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hADCEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(2); + { + gPad->SetLogy(); + hADCEECPU->SetLineColor(kBlack); + hADCEECPU->SetLineWidth(1.); + hADCEECPU->Draw(""); + hADCEEGPU->SetLineColor(kBlue); + hADCEEGPU->SetLineWidth(1.); + hADCEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hADCEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(3); + { + gPad->SetLogy(); + hGainEBCPU->SetLineColor(kBlack); + hGainEBCPU->SetLineWidth(1.); + hGainEBCPU->Draw(""); + hGainEBGPU->SetLineColor(kBlue); + hGainEBGPU->SetLineWidth(1.); + hGainEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hGainEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(4); + { + 
gPad->SetLogy(); + hGainEECPU->SetLineColor(kBlack); + hGainEECPU->SetLineWidth(1.); + hGainEECPU->Draw(""); + hGainEEGPU->SetLineColor(kBlue); + hGainEEGPU->SetLineWidth(1.); + hGainEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hGainEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(5); + hADCEBGPUvsCPU->Draw("colz"); + c.cd(6); + hADCEEGPUvsCPU->Draw("colz"); + c.cd(7); + hGainEBGPUvsCPU->Draw("colz"); + c.cd(8); + hGainEEGPUvsCPU->Draw("colz"); + c.SaveAs("plots.pdf"); + } + + rfin.Close(); + rfout.Write(); + rfout.Close(); +} From 0d118e117ed6f7414f4436139a59310f18eab02b Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Mon, 30 Mar 2020 15:44:49 +0200 Subject: [PATCH 03/30] cuda data formats ecal rechit fixes for 110x --- CUDADataFormats/EcalRecHitSoA/BuildFile.xml | 2 + .../interface/EcalUncalibratedRecHit_soa.h | 77 +++++++++++-------- .../EcalRecHitSoA/interface/RecoTypes.h | 10 +-- CUDADataFormats/EcalRecHitSoA/src/classes.h | 1 + .../EcalRecHitSoA/src/classes_def.xml | 18 ++++- 5 files changed, 68 insertions(+), 40 deletions(-) diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml index 794d2bf7abead..927a7a57a86a7 100644 --- a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml +++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml @@ -1,6 +1,8 @@ + + diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h index e11c13ebdf4c2..d43f77315476d 100644 --- a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h @@ -11,29 +11,39 @@ namespace ecal { - namespace Tag { +namespace Tag { - struct soa {}; - struct ptr {}; +struct soa {}; +struct ptr {}; - } // namespace Tag +} - template - struct type_wrapper { - 
//#ifndef ECAL_MULTIFIT_DONOT_USE_PINNED_MEM - // using type = std::vector>; - //#else - using type = std::vector; - //#endif - }; +namespace Detail { - template - struct type_wrapper { +// empty base +template +struct Base {}; + +// add number of values for ptr case +template<> +struct Base<::ecal::Tag::ptr> { + uint32_t size; +}; + +} + +template +struct type_wrapper { + using type = std::vector>; +}; + +template +struct type_wrapper { using type = T*; - }; +}; - template - struct UncalibratedRecHit { +template +struct UncalibratedRecHit : public Detail::Base { UncalibratedRecHit() = default; UncalibratedRecHit(const UncalibratedRecHit&) = default; UncalibratedRecHit& operator=(const UncalibratedRecHit&) = default; @@ -43,8 +53,8 @@ namespace ecal { // TODO: std::array causes root's dictionary problems typename type_wrapper::type amplitudesAll; - // typename type_wrapper, L>::type amplitudesAll; +// typename type_wrapper, L>::type amplitudesAll; typename type_wrapper::type amplitude; typename type_wrapper::type chi2; typename type_wrapper::type pedestal; @@ -53,21 +63,22 @@ namespace ecal { typename type_wrapper::type did; typename type_wrapper::type flags; - template - typename std::enable_if::value, void>::type resize(size_t size) { - amplitudesAll.resize(size * EcalDataFrame::MAXSAMPLES); - amplitude.resize(size); - pedestal.resize(size); - chi2.resize(size); - did.resize(size); - flags.resize(size); - jitter.resize(size); - jitterError.resize(size); + template + typename std::enable_if::value, void>::type + resize(size_t size) { + amplitudesAll.resize(size * EcalDataFrame::MAXSAMPLES); + amplitude.resize(size); + pedestal.resize(size); + chi2.resize(size); + did.resize(size); + flags.resize(size); + jitter.resize(size); + jitterError.resize(size); } - }; +}; - using SoAUncalibratedRecHitCollection = UncalibratedRecHit; +using SoAUncalibratedRecHitCollection = UncalibratedRecHit; -} // namespace ecal +} -#endif // 
RecoLocalCalo_EcalRecAlgos_interface_EcalUncalibratedRecHit_soa_h +#endif // RecoLocalCalo_EcalRecAlgos_interface_EcalUncalibratedRecHit_soa_h diff --git a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h index 5667a9225f29d..cf8571feb01ae 100644 --- a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h +++ b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h @@ -1,13 +1,11 @@ #ifndef CUDADataFormats_EcalRecHitSoA_interface_RecoTypes #define CUDADataFormats_EcalRecHitSoA_interface_RecoTypes -namespace ecal { - namespace reco { +namespace ecal { namespace reco { - using ComputationScalarType = float; - using StorageScalarType = float; +using ComputationScalarType = float; +using StorageScalarType = float; - } // namespace reco -} // namespace ecal +}} #endif diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes.h b/CUDADataFormats/EcalRecHitSoA/src/classes.h index 8ad6b8d684b9a..3cab9957e62b4 100644 --- a/CUDADataFormats/EcalRecHitSoA/src/classes.h +++ b/CUDADataFormats/EcalRecHitSoA/src/classes.h @@ -1,2 +1,3 @@ #include "DataFormats/Common/interface/Wrapper.h" +#include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml index 461460835a723..68056d21ad4c1 100644 --- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml +++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml @@ -1,5 +1,21 @@ + + + + + + + + + + + - + From 894d06a45295947d8ab106a4a0f3428e63e96995 Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Mon, 30 Mar 2020 16:17:27 +0200 Subject: [PATCH 04/30] ecal reco algos adapted for 110x --- RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml | 2 +- .../bin/makeEcalGpuValidationPlots.cpp | 232 --- ...eEcalMultifitResultsGpuValidationPlots.cpp | 261 +++ .../EcalRecAlgos/interface/DeclsForKernels.h | 420 ++-- 
.../interface/EcalGainRatiosGPU.h | 37 +- .../EcalRecAlgos/interface/EcalPedestalsGPU.h | 48 +- .../interface/EcalPulseCovariancesGPU.h | 35 +- .../interface/EcalPulseShapesGPU.h | 35 +- .../interface/EcalSamplesCorrelationGPU.h | 49 +- .../interface/EcalTimeBiasCorrectionsGPU.h | 47 +- .../interface/EcalTimeCalibConstantsGPU.h | 39 +- .../EcalUncalibRecHitMultiFitAlgo_gpu_new.h | 20 +- .../interface/EigenMatrixTypes_gpu.h | 73 +- .../src/AmplitudeComputationCommonKernels.cu | 819 ++++---- .../src/AmplitudeComputationCommonKernels.h | 151 +- .../src/AmplitudeComputationKernels.cu | 425 +++++ .../src/AmplitudeComputationKernels.h | 27 + .../src/AmplitudeComputationKernelsV1.cu | 372 ---- .../src/AmplitudeComputationKernelsV1.h | 50 - .../EcalRecAlgos/src/EcalGainRatiosGPU.cc | 83 +- .../EcalRecAlgos/src/EcalPedestalsGPU.cc | 167 +- .../src/EcalPulseCovariancesGPU.cc | 68 +- .../EcalRecAlgos/src/EcalPulseShapesGPU.cc | 68 +- .../src/EcalSamplesCorrelationGPU.cc | 143 +- .../src/EcalTimeBiasCorrectionsGPU.cc | 111 +- .../src/EcalTimeCalibConstantsGPU.cc | 65 +- .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu | 366 ++-- .../EcalRecAlgos/src/KernelHelpers.cu | 156 +- .../EcalRecAlgos/src/KernelHelpers.h | 464 ++++- .../src/TimeComputationKernels.cu | 1688 +++++++++-------- .../EcalRecAlgos/src/TimeComputationKernels.h | 209 +- .../EcalRecAlgos/src/inplace_fnnls.cu | 198 +- .../EcalRecAlgos/src/inplace_fnnls.h | 28 +- 33 files changed, 3914 insertions(+), 3042 deletions(-) delete mode 100644 RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp create mode 100644 RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp create mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu create mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h diff 
--git a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml index bb20a5ac3e6da..bf61d052856ad 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml +++ b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml @@ -1,4 +1,4 @@ - + diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp deleted file mode 100644 index 9691a07fc5e0a..0000000000000 --- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalGpuValidationPlots.cpp +++ /dev/null @@ -1,232 +0,0 @@ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "DataFormats/Common/interface/Wrapper.h" -#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" -#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h" -#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" - -int main(int argc, char *argv[]) { - if (argc < 3) { - std::cout << "run with: ./validateGPU \n"; - exit(0); - } - - edm::Wrapper> *wgpuEB = nullptr; - edm::Wrapper> *wgpuEE = nullptr; - edm::Wrapper *wcpuEB = nullptr; - edm::Wrapper *wcpuEE = nullptr; - - std::string fileName = argv[1]; - std::string outFileName = argv[2]; - - // output - TFile rfout{outFileName.c_str(), "recreate"}; - - int nbins = 300; - float last = 3000.; - - int nbins_chi2 = 1000; - float last_chi2 = 1000.; - - int nbins_delta = 201; // use an odd number to center around 0 - float delta = 0.2; - - auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last); - auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last); - auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last); - auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last); - - auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2); - 
auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2); - auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2); - auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2); - - auto hSOIAmplitudesEBGPUvsCPU = - new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last); - auto hSOIAmplitudesEEGPUvsCPU = - new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); - auto hSOIAmplitudesEBdeltavsCPU = - new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hSOIAmplitudesEEdeltavsCPU = - new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - - auto hChi2EBGPUvsCPU = - new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); - auto hChi2EEGPUvsCPU = - new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); - auto hChi2EBdeltavsCPU = - new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - auto hChi2EEdeltavsCPU = - new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - - // input - std::cout << "validating file " << fileName << std::endl; - TFile rf{fileName.c_str()}; - TTree *rt = (TTree *)rf.Get("Events"); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalUncalibRecHitProducerGPU_EcalUncalibRecHitsEB_RECO.", - &wgpuEB); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalUncalibRecHitProducerGPU_EcalUncalibRecHitsEE_RECO.", - &wgpuEE); - rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB); - rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE); - - constexpr 
float eps_diff = 1e-3; - - // accumulate - auto const nentries = rt->GetEntries(); - std::cout << "#events to validate over: " << nentries << std::endl; - for (int ie = 0; ie < nentries; ++ie) { - rt->GetEntry(ie); - - const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"}; - auto cpu_eb_size = wcpuEB->bareProduct().size(); - auto cpu_ee_size = wcpuEE->bareProduct().size(); - auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size(); - auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size(); - if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { - std::cerr << ie << ordinal[ie % 10] << " entry:\n" - << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size - << " (gpu)\n" - << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size - << " (gpu)" << std::endl; - continue; - } - - assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size()); - assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size()); - auto const neb = wcpuEB->bareProduct().size(); - auto const nee = wcpuEE->bareProduct().size(); - - for (uint32_t i = 0; i < neb; ++i) { - auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i]; - auto const soi_amp_cpu = wcpuEB->bareProduct()[i].amplitude(); - auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; - auto const chi2_cpu = wcpuEB->bareProduct()[i].chi2(); - - hSOIAmplitudesEBGPU->Fill(soi_amp_gpu); - hSOIAmplitudesEBCPU->Fill(soi_amp_cpu); - hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); - hChi2EBGPU->Fill(chi2_gpu); - hChi2EBCPU->Fill(chi2_cpu); - hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); - - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or - std::isnan(chi2_gpu)) { - printf("EB eventid = %d chid = %d 
amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, - i, - soi_amp_gpu, - soi_amp_cpu, - chi2_gpu, - chi2_cpu); - if (std::isnan(chi2_gpu)) - printf("*** nan ***\n"); - } - } - - for (uint32_t i = 0; i < nee; ++i) { - auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i]; - auto const soi_amp_cpu = wcpuEE->bareProduct()[i].amplitude(); - auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; - auto const chi2_cpu = wcpuEE->bareProduct()[i].chi2(); - - hSOIAmplitudesEEGPU->Fill(soi_amp_gpu); - hSOIAmplitudesEECPU->Fill(soi_amp_cpu); - hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); - hChi2EEGPU->Fill(chi2_gpu); - hChi2EECPU->Fill(chi2_cpu); - hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); - - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or - std::isnan(chi2_gpu)) { - printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, - static_cast(neb + i), - soi_amp_gpu, - soi_amp_cpu, - chi2_gpu, - chi2_cpu); - if (std::isnan(chi2_gpu)) - printf("*** nan ***\n"); - } - } - } - - { - TCanvas c("plots", "plots", 4200, 6200); - c.Divide(2, 3); - - c.cd(1); - gPad->SetLogy(); - hSOIAmplitudesEBCPU->SetLineColor(kBlack); - hSOIAmplitudesEBCPU->SetLineWidth(1.); - hSOIAmplitudesEBCPU->Draw(""); - hSOIAmplitudesEBGPU->SetLineColor(kBlue); - hSOIAmplitudesEBGPU->SetLineWidth(1.); - hSOIAmplitudesEBGPU->Draw("SAME"); - c.cd(2); - gPad->SetLogy(); - hSOIAmplitudesEECPU->SetLineColor(kBlack); - hSOIAmplitudesEECPU->SetLineWidth(1.); - hSOIAmplitudesEECPU->Draw(""); - hSOIAmplitudesEEGPU->SetLineColor(kBlue); - hSOIAmplitudesEEGPU->SetLineWidth(1.); - hSOIAmplitudesEEGPU->Draw("SAME"); - c.cd(3); - hSOIAmplitudesEBGPUvsCPU->Draw("COLZ"); - c.cd(4); - hSOIAmplitudesEEGPUvsCPU->Draw("COLZ"); - c.cd(5); - 
hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); - c.cd(6); - hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); - - c.SaveAs("ecal-amplitudes.pdf"); - - c.cd(1); - gPad->SetLogy(); - hChi2EBCPU->SetLineColor(kBlack); - hChi2EBCPU->SetLineWidth(1.); - hChi2EBCPU->Draw(""); - hChi2EBGPU->SetLineColor(kBlue); - hChi2EBGPU->SetLineWidth(1.); - hChi2EBGPU->Draw("SAME"); - c.cd(2); - gPad->SetLogy(); - hChi2EECPU->SetLineColor(kBlack); - hChi2EECPU->SetLineWidth(1.); - hChi2EECPU->Draw(""); - hChi2EEGPU->SetLineColor(kBlue); - hChi2EEGPU->SetLineWidth(1.); - hChi2EEGPU->Draw("SAME"); - c.cd(3); - hChi2EBGPUvsCPU->Draw("COLZ"); - c.cd(4); - hChi2EEGPUvsCPU->Draw("COLZ"); - c.cd(5); - hChi2EBdeltavsCPU->Draw("COLZ"); - c.cd(6); - hChi2EEdeltavsCPU->Draw("COLZ"); - - c.SaveAs("ecal-chi2.pdf"); - } - - rf.Close(); - rfout.Write(); - rfout.Close(); - - return 0; -} diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp new file mode 100644 index 0000000000000..a336de13b9e7d --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp @@ -0,0 +1,261 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" + +int main(int argc, char *argv[]) { + if (argc<3) { + std::cout << "run with: ./validateGPU \n"; + exit(0); + } + + edm::Wrapper> *wgpuEB=nullptr; + edm::Wrapper> *wgpuEE=nullptr; + edm::Wrapper *wcpuEB = nullptr; + edm::Wrapper *wcpuEE = nullptr; + + std::string fileName = argv[1]; + std::string outFileName = argv[2]; + + // output + TFile rfout{outFileName.c_str(), "recreate"}; + + int nbins = 300; + float last = 3000.; + + 
int nbins_chi2 = 1000; + float last_chi2 = 1000.; + + int nbins_delta = 201; // use an odd number to center around 0 + float delta = 0.2; + + auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last); + auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last); + auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last); + auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last); + + auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2); + auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2); + auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2); + auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2); + + auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last); + auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); + auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hSOIAmplitudesEEdeltavsCPU = new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + + auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); + auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); + auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + + // input + std::cout << 
"validating file " << fileName << std::endl; + TFile rf{fileName.c_str()}; + TTree *rt = (TTree*)rf.Get("Events"); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE); + rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB); + rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE); + + constexpr float eps_diff = 1e-3; + + // accumulate + auto const nentries = rt->GetEntries(); + std::cout << "#events to validate over: " << nentries << std::endl; + for (int ie=0; ieGetEntry(ie); + + const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" }; + auto cpu_eb_size = wcpuEB->bareProduct().size(); + auto cpu_ee_size = wcpuEE->bareProduct().size(); + auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size(); + auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size(); + if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { + std::cerr << ie << ordinal[ie % 10] << " entry:\n" + << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n" + << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl; + continue; + } + + assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size()); + assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size()); + auto const neb = wcpuEB->bareProduct().size(); + auto const nee = wcpuEE->bareProduct().size(); + + for (uint32_t i=0; ibareProduct().did[i]; + auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i]; + auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); + if (cpu_iter == wcpuEB->bareProduct().end()) { + std::cerr << ie << 
ordinal[ie % 10] << " entry\n" + << " Did not find a DetId " << did_gpu + << " in a CPU collection\n"; + continue; + } + auto const soi_amp_cpu = cpu_iter->amplitude(); + auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; + auto const chi2_cpu = cpu_iter->chi2(); + + hSOIAmplitudesEBGPU->Fill(soi_amp_gpu); + hSOIAmplitudesEBCPU->Fill(soi_amp_cpu); + hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); + hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); + hChi2EBGPU->Fill(chi2_gpu); + hChi2EBCPU->Fill(chi2_cpu); + hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); + hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + + if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or + (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)) + { + printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", + ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); + if (std::isnan(chi2_gpu)) + printf("*** nan ***\n"); + } + } + + for (uint32_t i=0; ibareProduct().did[i]; + auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i]; + auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); + if (cpu_iter == wcpuEE->bareProduct().end()) { + std::cerr << ie << ordinal[ie % 10] << " entry\n" + << " did not find a DetId " << did_gpu + << " in a CPU collection\n"; + continue; + } + auto const soi_amp_cpu = cpu_iter->amplitude(); + auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; + auto const chi2_cpu = cpu_iter->chi2(); + + hSOIAmplitudesEEGPU->Fill(soi_amp_gpu); + hSOIAmplitudesEECPU->Fill(soi_amp_cpu); + hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); + hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); + hChi2EEGPU->Fill(chi2_gpu); + hChi2EECPU->Fill(chi2_cpu); + hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); + hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + + if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or + (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or 
std::isnan(chi2_gpu)) + { + printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", + ie, static_cast(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); + if (std::isnan(chi2_gpu)) + printf("*** nan ***\n"); + } + } + } + + { + TCanvas c("plots", "plots", 4200, 6200); + c.Divide(2, 3); + + c.cd(1); + { + gPad->SetLogy(); + hSOIAmplitudesEBCPU->SetLineColor(kBlack); + hSOIAmplitudesEBCPU->SetLineWidth(1.); + hSOIAmplitudesEBCPU->Draw(""); + hSOIAmplitudesEBGPU->SetLineColor(kBlue); + hSOIAmplitudesEBGPU->SetLineWidth(1.); + hSOIAmplitudesEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hSOIAmplitudesEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(2); + { + gPad->SetLogy(); + hSOIAmplitudesEECPU->SetLineColor(kBlack); + hSOIAmplitudesEECPU->SetLineWidth(1.); + hSOIAmplitudesEECPU->Draw(""); + hSOIAmplitudesEEGPU->SetLineColor(kBlue); + hSOIAmplitudesEEGPU->SetLineWidth(1.); + hSOIAmplitudesEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hSOIAmplitudesEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(3); + hSOIAmplitudesEBGPUvsCPU->Draw("COLZ"); + c.cd(4); + hSOIAmplitudesEEGPUvsCPU->Draw("COLZ"); + c.cd(5); + hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); + c.cd(6); + hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); + + c.SaveAs("ecal-amplitudes.pdf"); + + c.cd(1); + { + gPad->SetLogy(); + hChi2EBCPU->SetLineColor(kBlack); + hChi2EBCPU->SetLineWidth(1.); + hChi2EBCPU->Draw(""); + hChi2EBGPU->SetLineColor(kBlue); + hChi2EBGPU->SetLineWidth(1.); + hChi2EBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(2); + { + 
gPad->SetLogy(); + hChi2EECPU->SetLineColor(kBlack); + hChi2EECPU->SetLineWidth(1.); + hChi2EECPU->Draw(""); + hChi2EEGPU->SetLineColor(kBlue); + hChi2EEGPU->SetLineWidth(1.); + hChi2EEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + c.cd(3); + hChi2EBGPUvsCPU->Draw("COLZ"); + c.cd(4); + hChi2EEGPUvsCPU->Draw("COLZ"); + c.cd(5); + hChi2EBdeltavsCPU->Draw("COLZ"); + c.cd(6); + hChi2EEdeltavsCPU->Draw("COLZ"); + + c.SaveAs("ecal-chi2.pdf"); + } + + rf.Close(); + rfout.Write(); + rfout.Close(); + + return 0; +} diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h index b997906006a22..5ff32c0bc2259 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h @@ -26,6 +26,8 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h" +#include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" + struct EcalPulseShape; class EcalSampleMask; class EcalTimeBiasCorrections; @@ -36,220 +38,238 @@ class EcalSamplesCorrelation; class EBDigiCollection; class EEDigiCollection; -namespace ecal { - namespace multifit { - - enum class TimeComputationState : char { NotFinished = 0, Finished = 1 }; - enum class MinimizationState : char { - NotFinished = 0, - Finished = 1, - Precomputed = 2, - }; - - // event input data on cpu, just const refs - struct EventInputDataCPU { - EBDigiCollection const& ebDigis; - EEDigiCollection const& eeDigis; - }; - - // - struct EventInputDataGPU { - uint16_t* digis; - uint32_t* ids; - - void allocate(uint32_t size) { - cudaCheck(cudaMalloc((void**)&digis, sizeof(uint16_t) * size * EcalDataFrame::MAXSAMPLES)); - cudaCheck(cudaMalloc((void**)&ids, 
sizeof(uint32_t) * size)); - } - - void deallocate() { - cudaCheck(cudaFree(digis)); - cudaCheck(cudaFree(ids)); - } - }; - - // parameters have a fixed type - // Can we go by with single precision - struct ConfigurationParameters { - using type = double; - // device ptrs - type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr; - - uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE; - // device ptrs - type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr; - - type timeFitLimitsFirstEB, timeFitLimitsFirstEE; - type timeFitLimitsSecondEB, timeFitLimitsSecondEE; - - type timeConstantTermEB, timeConstantTermEE; - - type timeNconstEB, timeNconstEE; - - type amplitudeThreshEE, amplitudeThreshEB; - - type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB; - type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE; - type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE; - type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB; - - std::array kernelMinimizeThreads; - - bool shouldRunTimingComputation; - }; - - struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> { - void allocate(ConfigurationParameters const& configParameters, uint32_t size) { - cudaCheck(cudaMalloc((void**)&litudesAll, size * sizeof(SampleVector))); - cudaCheck(cudaMalloc((void**)&litude, size * sizeof(::ecal::reco::StorageScalarType))); - cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType))); - cudaCheck(cudaMalloc((void**)&pedestal, size * sizeof(::ecal::reco::StorageScalarType))); +namespace ecal { namespace multifit { + +enum class TimeComputationState : char { + NotFinished = 0, + Finished = 1 +}; +enum class MinimizationState : char { + NotFinished = 0, + Finished = 1, + Precomputed = 2, +}; + +// +struct EventInputDataGPU { + ecal::DigisCollection const& ebDigis; + ecal::DigisCollection const& eeDigis; +}; + +// parameters have a fixed type +// Can we go by with single precision +struct ConfigurationParameters 
{ + using type = double; + // device ptrs + type *amplitudeFitParametersEB=nullptr, *amplitudeFitParametersEE=nullptr; + + uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE; + // device ptrs + type *timeFitParametersEB=nullptr, *timeFitParametersEE=nullptr; + + type timeFitLimitsFirstEB, timeFitLimitsFirstEE; + type timeFitLimitsSecondEB, timeFitLimitsSecondEE; + + type timeConstantTermEB, timeConstantTermEE; + + type timeNconstEB, timeNconstEE; + + type amplitudeThreshEE, amplitudeThreshEB; + + type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB; + type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE; + type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE; + type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB; + + std::array kernelMinimizeThreads; + + bool shouldRunTimingComputation; +}; + +struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> +{ + void allocate(ConfigurationParameters const& configParameters, uint32_t size) { + cudaCheck( cudaMalloc((void**)&litudesAll, + size * sizeof(SampleVector)) ); + cudaCheck( cudaMalloc((void**)&litude, + size * sizeof(::ecal::reco::StorageScalarType)) ); + cudaCheck( cudaMalloc((void**)&chi2, + size * sizeof(::ecal::reco::StorageScalarType)) ); + cudaCheck( cudaMalloc((void**)&pedestal, + size * sizeof(::ecal::reco::StorageScalarType)) ); if (configParameters.shouldRunTimingComputation) { - cudaCheck(cudaMalloc((void**)&jitter, size * sizeof(::ecal::reco::StorageScalarType))); - cudaCheck(cudaMalloc((void**)&jitterError, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck( cudaMalloc((void**)&jitter, + size * sizeof(::ecal::reco::StorageScalarType)) ); + cudaCheck( cudaMalloc((void**)&jitterError, + size * sizeof(::ecal::reco::StorageScalarType)) ); } - cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t))); - cudaCheck(cudaMalloc((void**)&flags, size * sizeof(uint32_t))); - } - - void deallocate(ConfigurationParameters const& configParameters) { - 
cudaCheck(cudaFree(amplitudesAll)); - cudaCheck(cudaFree(amplitude)); - cudaCheck(cudaFree(chi2)); - cudaCheck(cudaFree(pedestal)); + cudaCheck( cudaMalloc((void**)&did, + size * sizeof(uint32_t)) ); + cudaCheck( cudaMalloc((void**)&flags, + size * sizeof(uint32_t)) ); + } + + void deallocate(ConfigurationParameters const& configParameters) { + cudaCheck( cudaFree(amplitudesAll) ); + cudaCheck( cudaFree(amplitude) ); + cudaCheck( cudaFree(chi2) ); + cudaCheck( cudaFree(pedestal) ); if (configParameters.shouldRunTimingComputation) { - cudaCheck(cudaFree(jitter)); - cudaCheck(cudaFree(jitterError)); + cudaCheck( cudaFree(jitter) ); + cudaCheck( cudaFree(jitterError) ); } - cudaCheck(cudaFree(did)); - cudaCheck(cudaFree(flags)); - } - }; - - struct EventDataForScratchGPU { - SampleVector* samples = nullptr; - SampleGainVector* gainsNoise = nullptr; - - SampleMatrix* noisecov = nullptr; - PulseMatrixType* pulse_matrix = nullptr; - FullSampleMatrix* pulse_covariances = nullptr; - BXVectorType* activeBXs = nullptr; - char* acState = nullptr; - - bool *hasSwitchToGain6 = nullptr, *hasSwitchToGain1 = nullptr, *isSaturated = nullptr; - - SampleVector::Scalar *sample_values, *sample_value_errors; - bool* useless_sample_values; - SampleVector::Scalar* chi2sNullHypot; - SampleVector::Scalar* sum0sNullHypot; - SampleVector::Scalar* sumAAsNullHypot; - char* pedestal_nums; - SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas; - SampleVector::Scalar *accTimeMax, *accTimeWgt; - SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError; - SampleVector::Scalar *timeMax, *timeError; - TimeComputationState* tcState; - - void allocate(ConfigurationParameters const& configParameters, uint32_t size) { - cudaCheck(cudaMalloc((void**)&samples, size * sizeof(SampleVector))); - cudaCheck(cudaMalloc((void**)&gainsNoise, size * sizeof(SampleGainVector))); - - cudaCheck(cudaMalloc((void**)&pulse_covariances, size * sizeof(FullSampleMatrix))); - cudaCheck(cudaMalloc((void**)&noisecov, size * 
sizeof(SampleMatrix))); - cudaCheck(cudaMalloc((void**)&pulse_matrix, size * sizeof(PulseMatrixType))); - cudaCheck(cudaMalloc((void**)&activeBXs, size * sizeof(BXVectorType))); - cudaCheck(cudaMalloc((void**)&acState, size * sizeof(char))); - - cudaCheck(cudaMalloc((void**)&hasSwitchToGain6, size * sizeof(bool))); - cudaCheck(cudaMalloc((void**)&hasSwitchToGain1, size * sizeof(bool))); - cudaCheck(cudaMalloc((void**)&isSaturated, size * sizeof(bool))); + cudaCheck( cudaFree(did) ); + cudaCheck( cudaFree(flags) ); + } +}; + +struct EventDataForScratchGPU { + SampleVector *samples = nullptr; + SampleGainVector *gainsNoise = nullptr; + + SampleMatrix* noisecov = nullptr; + PulseMatrixType *pulse_matrix = nullptr; + BXVectorType *activeBXs = nullptr; + char *acState = nullptr; + + bool *hasSwitchToGain6=nullptr, + *hasSwitchToGain1=nullptr, + *isSaturated=nullptr; + + SampleVector::Scalar *sample_values, *sample_value_errors; + bool *useless_sample_values; + SampleVector::Scalar* chi2sNullHypot; + SampleVector::Scalar* sum0sNullHypot; + SampleVector::Scalar* sumAAsNullHypot; + char* pedestal_nums; + SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas; + SampleVector::Scalar *accTimeMax, *accTimeWgt; + SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError; + SampleVector::Scalar *timeMax, *timeError; + TimeComputationState *tcState; + + void allocate(ConfigurationParameters const& configParameters, uint32_t size) { + cudaCheck( cudaMalloc((void**)&samples, + size * sizeof(SampleVector)) ); + cudaCheck( cudaMalloc((void**)&gainsNoise, + size * sizeof(SampleGainVector)) ); + + cudaCheck( cudaMalloc((void**)&noisecov, + size * sizeof(SampleMatrix)) ); + cudaCheck( cudaMalloc((void**)&pulse_matrix, + size * sizeof(PulseMatrixType)) ); + cudaCheck( cudaMalloc((void**)&activeBXs, + size * sizeof(BXVectorType)) ); + cudaCheck( cudaMalloc((void**)&acState, + size * sizeof(char)) ); + + cudaCheck( cudaMalloc((void**)&hasSwitchToGain6, + size * sizeof(bool)) ); + cudaCheck( 
cudaMalloc((void**)&hasSwitchToGain1, + size * sizeof(bool)) ); + cudaCheck( cudaMalloc((void**)&isSaturated, + size * sizeof(bool)) ); if (configParameters.shouldRunTimingComputation) { - cudaCheck(cudaMalloc((void**)&sample_values, size * sizeof(SampleVector))); - cudaCheck(cudaMalloc((void**)&sample_value_errors, size * sizeof(SampleVector))); - cudaCheck(cudaMalloc((void**)&useless_sample_values, size * sizeof(bool) * EcalDataFrame::MAXSAMPLES)); - cudaCheck(cudaMalloc((void**)&chi2sNullHypot, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&sum0sNullHypot, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&sumAAsNullHypot, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&pedestal_nums, size * sizeof(char))); - - cudaCheck(cudaMalloc((void**)&tMaxAlphaBetas, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&tMaxErrorAlphaBetas, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&accTimeMax, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&accTimeWgt, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&MaxAlphaBeta, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&MaxError, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&timeMax, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&timeError, size * sizeof(SampleVector::Scalar))); - cudaCheck(cudaMalloc((void**)&tcState, size * sizeof(TimeComputationState))); + cudaCheck( cudaMalloc((void**)&sample_values, + size * sizeof(SampleVector)) ); + cudaCheck( cudaMalloc((void**)&sample_value_errors, + size * sizeof(SampleVector)) ); + cudaCheck( cudaMalloc((void**)&useless_sample_values, + size * sizeof(bool) * EcalDataFrame::MAXSAMPLES) ); + cudaCheck( cudaMalloc((void**)&chi2sNullHypot, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&sum0sNullHypot, + size * 
sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&sumAAsNullHypot, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&pedestal_nums, + size * sizeof(char)) ); + + cudaCheck( cudaMalloc((void**)&tMaxAlphaBetas, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&tMaxErrorAlphaBetas, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&accTimeMax, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&accTimeWgt, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&MaxAlphaBeta, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&MaxError, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&timeMax, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&timeError, + size * sizeof(SampleVector::Scalar)) ); + cudaCheck( cudaMalloc((void**)&tcState, + size * sizeof(TimeComputationState)) ); } - } + } - void deallocate(ConfigurationParameters const& configParameters) { - cudaCheck(cudaFree(samples)); - cudaCheck(cudaFree(gainsNoise)); + void deallocate(ConfigurationParameters const& configParameters) { + cudaCheck( cudaFree(samples) ); + cudaCheck( cudaFree(gainsNoise) ); - cudaCheck(cudaFree(pulse_covariances)); - cudaCheck(cudaFree(noisecov)); - cudaCheck(cudaFree(pulse_matrix)); - cudaCheck(cudaFree(activeBXs)); - cudaCheck(cudaFree(acState)); + cudaCheck( cudaFree(noisecov) ); + cudaCheck( cudaFree(pulse_matrix) ); + cudaCheck( cudaFree(activeBXs) ); + cudaCheck( cudaFree(acState) ); - cudaCheck(cudaFree(hasSwitchToGain6)); - cudaCheck(cudaFree(hasSwitchToGain1)); - cudaCheck(cudaFree(isSaturated)); + cudaCheck( cudaFree(hasSwitchToGain6) ); + cudaCheck( cudaFree(hasSwitchToGain1) ); + cudaCheck( cudaFree(isSaturated) ); if (configParameters.shouldRunTimingComputation) { - cudaCheck(cudaFree(sample_values)); - cudaCheck(cudaFree(sample_value_errors)); - 
cudaCheck(cudaFree(useless_sample_values)); - cudaCheck(cudaFree(chi2sNullHypot)); - cudaCheck(cudaFree(sum0sNullHypot)); - cudaCheck(cudaFree(sumAAsNullHypot)); - cudaCheck(cudaFree(pedestal_nums)); - - cudaCheck(cudaFree(tMaxAlphaBetas)); - cudaCheck(cudaFree(tMaxErrorAlphaBetas)); - cudaCheck(cudaFree(accTimeMax)); - cudaCheck(cudaFree(accTimeWgt)); - cudaCheck(cudaFree(ampMaxAlphaBeta)); - cudaCheck(cudaFree(ampMaxError)); - cudaCheck(cudaFree(timeMax)); - cudaCheck(cudaFree(timeError)); - cudaCheck(cudaFree(tcState)); + cudaCheck( cudaFree(sample_values) ); + cudaCheck( cudaFree(sample_value_errors) ); + cudaCheck( cudaFree(useless_sample_values) ); + cudaCheck( cudaFree(chi2sNullHypot) ); + cudaCheck( cudaFree(sum0sNullHypot) ); + cudaCheck( cudaFree(sumAAsNullHypot) ); + cudaCheck( cudaFree(pedestal_nums) ); + + cudaCheck( cudaFree(tMaxAlphaBetas) ); + cudaCheck( cudaFree(tMaxErrorAlphaBetas) ); + cudaCheck( cudaFree(accTimeMax) ); + cudaCheck( cudaFree(accTimeWgt) ); + cudaCheck( cudaFree(ampMaxAlphaBeta) ); + cudaCheck( cudaFree(ampMaxError) ); + cudaCheck( cudaFree(timeMax) ); + cudaCheck( cudaFree(timeError) ); + cudaCheck( cudaFree(tcState) ); } - } - }; - - // const refs products to conditions - struct ConditionsProducts { - EcalPedestalsGPU::Product const& pedestals; - EcalGainRatiosGPU::Product const& gainRatios; - EcalPulseShapesGPU::Product const& pulseShapes; - EcalPulseCovariancesGPU::Product const& pulseCovariances; - EcalSamplesCorrelationGPU::Product const& samplesCorrelation; - EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections; - EcalTimeCalibConstantsGPU::Product const& timeCalibConstants; - EcalSampleMask const& sampleMask; - EcalTimeOffsetConstant const& timeOffsetConstant; - uint32_t offsetForHashes; - }; - - //*/ - - struct xyz { - int x, y, z; - }; - - struct conf_data { - xyz threads; - bool runV1; - cudaStream_t cuStream; - }; - - } // namespace multifit -} // namespace ecal + } +}; + +// const refs products to conditions 
+struct ConditionsProducts { + EcalPedestalsGPU::Product const& pedestals; + EcalGainRatiosGPU::Product const& gainRatios; + EcalPulseShapesGPU::Product const& pulseShapes; + EcalPulseCovariancesGPU::Product const& pulseCovariances; + EcalSamplesCorrelationGPU::Product const& samplesCorrelation; + EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections; + EcalTimeCalibConstantsGPU::Product const& timeCalibConstants; + EcalSampleMask const& sampleMask; + EcalTimeOffsetConstant const& timeOffsetConstant; + uint32_t offsetForHashes; +}; + +//*/ + +struct xyz { + int x,y,z; +}; + +struct conf_data { + xyz threads; + bool runV1; + cudaStream_t cuStream; +}; + +}} #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h index e268e5d3d5c13..674695e472ec1 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h @@ -8,38 +8,37 @@ #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif -#include - class EcalGainRatiosGPU { public: - struct Product { - ~Product(); - float *gain12Over6 = nullptr, *gain6Over1 = nullptr; - }; + struct Product { + ~Product(); + float *gain12Over6=nullptr, *gain6Over1=nullptr; + }; #ifndef __CUDACC__ - // rearrange pedestals - EcalGainRatiosGPU(EcalGainRatios const&); + // rearrange pedestals + EcalGainRatiosGPU(EcalGainRatios const&); - // will call dealloation for Product thru ~Product - ~EcalGainRatiosGPU() = default; + // will call dealloation for Product thru ~Product + ~EcalGainRatiosGPU() = default; - // get device pointers - Product const& getProduct(cudaStream_t) const; + // get device pointers + Product const& getProduct(cudaStream_t) const; - // - static std::string name() { return std::string{"ecalGainRatiosGPU"}; } + // + static std::string name() { return std::string{"ecalGainRatiosGPU"}; } private: - // in the future, we need to arrange so to avoid this copy on 
the host - // store eb first then ee - std::vector> gain12Over6_; - std::vector> gain6Over1_; + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> gain12Over6_; + std::vector> gain6Over1_; - cms::cuda::ESProduct product_; + cms::cuda::ESProduct product_; #endif }; + #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h index 420697dea6bda..419b7273afa6d 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h @@ -8,41 +8,39 @@ #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif -#include - class EcalPedestalsGPU { public: - struct Product { - ~Product(); - float *mean_x12 = nullptr, *mean_x6 = nullptr, *mean_x1 = nullptr; - float *rms_x12 = nullptr, *rms_x6 = nullptr, *rms_x1 = nullptr; - }; + struct Product { + ~Product(); + float *mean_x12=nullptr, *mean_x6=nullptr, *mean_x1=nullptr; + float *rms_x12=nullptr, *rms_x6=nullptr, *rms_x1=nullptr; + }; #ifndef __CUDACC__ - // rearrange pedestals - EcalPedestalsGPU(EcalPedestals const &); + // rearrange pedestals + EcalPedestalsGPU(EcalPedestals const&); - // will call dealloation for Product thru ~Product - ~EcalPedestalsGPU() = default; + // will call dealloation for Product thru ~Product + ~EcalPedestalsGPU() = default; - // get device pointers - Product const &getProduct(cudaStream_t) const; + // get device pointers + Product const& getProduct(cudaStream_t) const; - // - static std::string name() { return std::string{"ecalPedestalsGPU"}; } + // + static std::string name() { return std::string{"ecalPedestalsGPU"}; } private: - // in the future, we need to arrange so to avoid this copy on the host - // store eb first then ee - std::vector> mean_x12_; - std::vector> rms_x12_; - std::vector> mean_x6_; - std::vector> rms_x6_; - std::vector> mean_x1_; - std::vector> rms_x1_; - - 
cms::cuda::ESProduct product_; + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> mean_x12_; + std::vector> rms_x12_; + std::vector> mean_x6_; + std::vector> rms_x6_; + std::vector> mean_x1_; + std::vector> rms_x1_; + + cms::cuda::ESProduct product_; #endif }; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h index b5b9271f6e65e..0a3df41e8b85e 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseCovariancesGPU.h @@ -8,35 +8,34 @@ #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif -#include - class EcalPulseCovariancesGPU { public: - struct Product { - ~Product(); - EcalPulseCovariance* values = nullptr; - }; + struct Product { + ~Product(); + EcalPulseCovariance *values=nullptr; + }; #ifndef __CUDACC__ - // rearrange pedestals - EcalPulseCovariancesGPU(EcalPulseCovariances const&); + // rearrange pedestals + EcalPulseCovariancesGPU(EcalPulseCovariances const&); - // will call dealloation for Product thru ~Product - ~EcalPulseCovariancesGPU() = default; + // will call dealloation for Product thru ~Product + ~EcalPulseCovariancesGPU() = default; - // get device pointers - Product const& getProduct(cudaStream_t) const; + // get device pointers + Product const& getProduct(cudaStream_t) const; - // - static std::string name() { return std::string{"ecalPulseCovariancesGPU"}; } + // + static std::string name() { return std::string{"ecalPulseCovariancesGPU"}; } private: - // reuse original vectors (although with default allocator) - std::vector const& valuesEB_; - std::vector const& valuesEE_; + // reuse original vectors (although with default allocator) + std::vector const& valuesEB_; + std::vector const& valuesEE_; - cms::cuda::ESProduct product_; + cms::cuda::ESProduct product_; #endif }; + #endif diff --git 
a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h index 88893b626ce05..4fddcf24aac32 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalPulseShapesGPU.h @@ -8,35 +8,34 @@ #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif -#include - class EcalPulseShapesGPU { public: - struct Product { - ~Product(); - EcalPulseShape* values = nullptr; - }; + struct Product { + ~Product(); + EcalPulseShape *values=nullptr; + }; #ifndef __CUDACC__ - // rearrange pedestals - EcalPulseShapesGPU(EcalPulseShapes const&); + // rearrange pedestals + EcalPulseShapesGPU(EcalPulseShapes const&); - // will call dealloation for Product thru ~Product - ~EcalPulseShapesGPU() = default; + // will call dealloation for Product thru ~Product + ~EcalPulseShapesGPU() = default; - // get device pointers - Product const& getProduct(cudaStream_t) const; + // get device pointers + Product const& getProduct(cudaStream_t) const; - // - static std::string name() { return std::string{"ecalPulseShapesGPU"}; } + // + static std::string name() { return std::string{"ecalPulseShapesGPU"}; } private: - // reuse original vectors (although with default allocator) - std::vector const& valuesEB_; - std::vector const& valuesEE_; + // reuse original vectors (although with default allocator) + std::vector const& valuesEB_; + std::vector const& valuesEE_; - cms::cuda::ESProduct product_; + cms::cuda::ESProduct product_; #endif }; + #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h index dac1ee041bfc5..3ae409a18e74c 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalSamplesCorrelationGPU.h @@ -8,39 +8,42 @@ #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif -#include - class 
EcalSamplesCorrelationGPU { public: - struct Product { - ~Product(); - double *EBG12SamplesCorrelation = nullptr, *EBG6SamplesCorrelation = nullptr, *EBG1SamplesCorrelation = nullptr; - double *EEG12SamplesCorrelation = nullptr, *EEG6SamplesCorrelation = nullptr, *EEG1SamplesCorrelation = nullptr; - }; + struct Product { + ~Product(); + double *EBG12SamplesCorrelation=nullptr, + *EBG6SamplesCorrelation=nullptr, + *EBG1SamplesCorrelation=nullptr; + double *EEG12SamplesCorrelation=nullptr, + *EEG6SamplesCorrelation=nullptr, + *EEG1SamplesCorrelation=nullptr; + }; #ifndef __CUDACC__ - // rearrange pedestals - EcalSamplesCorrelationGPU(EcalSamplesCorrelation const&); + // rearrange pedestals + EcalSamplesCorrelationGPU(EcalSamplesCorrelation const&); - // will call dealloation for Product thru ~Product - ~EcalSamplesCorrelationGPU() = default; + // will call dealloation for Product thru ~Product + ~EcalSamplesCorrelationGPU() = default; - // get device pointers - Product const& getProduct(cudaStream_t) const; + // get device pointers + Product const& getProduct(cudaStream_t) const; - // - static std::string name() { return std::string{"ecalSamplesCorrelationGPU"}; } + // + static std::string name() { return std::string{"ecalSamplesCorrelationGPU"}; } private: - std::vector const& EBG12SamplesCorrelation_; - std::vector const& EBG6SamplesCorrelation_; - std::vector const& EBG1SamplesCorrelation_; - std::vector const& EEG12SamplesCorrelation_; - std::vector const& EEG6SamplesCorrelation_; - std::vector const& EEG1SamplesCorrelation_; - - cms::cuda::ESProduct product_; + std::vector const& EBG12SamplesCorrelation_; + std::vector const& EBG6SamplesCorrelation_; + std::vector const& EBG1SamplesCorrelation_; + std::vector const& EEG12SamplesCorrelation_; + std::vector const& EEG6SamplesCorrelation_; + std::vector const& EEG1SamplesCorrelation_; + + cms::cuda::ESProduct product_; #endif }; + #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h 
b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h index 70af33b52f216..cbabea3351eb8 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h @@ -8,44 +8,45 @@ #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif -#include - class EcalTimeBiasCorrectionsGPU { public: - struct Product { - ~Product(); - float *EBTimeCorrAmplitudeBins, *EBTimeCorrShiftBins; - float *EETimeCorrAmplitudeBins, *EETimeCorrShiftBins; - int EBTimeCorrAmplitudeBinsSize, EETimeCorrAmplitudeBinsSize; - }; + struct Product { + ~Product(); + float *EBTimeCorrAmplitudeBins, *EBTimeCorrShiftBins; + float *EETimeCorrAmplitudeBins, *EETimeCorrShiftBins; + int EBTimeCorrAmplitudeBinsSize, EETimeCorrAmplitudeBinsSize; + }; - // rearrange pedestals - EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const&); + // rearrange pedestals + EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const&); #ifndef __CUDACC__ - // will call dealloation for Product thru ~Product - ~EcalTimeBiasCorrectionsGPU() = default; + // will call dealloation for Product thru ~Product + ~EcalTimeBiasCorrectionsGPU() = default; - // get device pointers - Product const& getProduct(cudaStream_t) const; + // get device pointers + Product const& getProduct(cudaStream_t) const; - // - static std::string name() { return std::string{"ecalTimeBiasCorrectionsGPU"}; } + // + static std::string name() { return std::string{"ecalTimeBiasCorrectionsGPU"}; } #endif - std::vector const& EBTimeCorrAmplitudeBins() const { return EBTimeCorrAmplitudeBins_; } - std::vector const& EETimeCorrAmplitudeBins() const { return EETimeCorrAmplitudeBins_; } + std::vector const& EBTimeCorrAmplitudeBins() const + { return EBTimeCorrAmplitudeBins_; } + std::vector const& EETimeCorrAmplitudeBins() const + { return EETimeCorrAmplitudeBins_; } private: - std::vector const& EBTimeCorrAmplitudeBins_; - std::vector const& EBTimeCorrShiftBins_; 
- std::vector const& EETimeCorrAmplitudeBins_; - std::vector const& EETimeCorrShiftBins_; + std::vector const& EBTimeCorrAmplitudeBins_; + std::vector const& EBTimeCorrShiftBins_; + std::vector const& EETimeCorrAmplitudeBins_; + std::vector const& EETimeCorrShiftBins_; #ifndef __CUDACC__ - cms::cuda::ESProduct product_; + cms::cuda::ESProduct product_; #endif }; + #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h index fd640e7c989b3..f82f4d5a0530f 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h @@ -8,38 +8,37 @@ #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif -#include - class EcalTimeCalibConstantsGPU { public: - struct Product { - ~Product(); - float* values = nullptr; - }; + struct Product { + ~Product(); + float *values=nullptr; + }; #ifndef __CUDACC__ - // rearrange pedestals - EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const&); + // rearrange pedestals + EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const&); - // will call dealloation for Product thru ~Product - ~EcalTimeCalibConstantsGPU() = default; + // will call dealloation for Product thru ~Product + ~EcalTimeCalibConstantsGPU() = default; - // get device pointers - Product const& getProduct(cudaStream_t) const; + // get device pointers + Product const& getProduct(cudaStream_t) const; - // TODO: do this centrally - // get offset for hashes. equals number of barrel items - uint32_t getOffset() const { return valuesEB_.size(); } + // TODO: do this centrally + // get offset for hashes. 
equals number of barrel items + uint32_t getOffset() const { return valuesEB_.size(); } - // - static std::string name() { return std::string{"ecalTimeCalibConstantsGPU"}; } + // + static std::string name() { return std::string{"ecalTimeCalibConstantsGPU"}; } private: - std::vector const& valuesEB_; - std::vector const& valuesEE_; + std::vector const& valuesEB_; + std::vector const& valuesEE_; - cms::cuda::ESProduct product_; + cms::cuda::ESProduct product_; #endif }; + #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h index 424a6e612c2c1..04193663f1e37 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h @@ -3,22 +3,18 @@ #include -#include +#include #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h" -namespace ecal { - namespace multifit { +namespace ecal { namespace multifit { - void entryPoint(EventInputDataCPU const&, - EventInputDataGPU&, - EventOutputDataGPU&, - EventDataForScratchGPU&, - ConditionsProducts const&, - ConfigurationParameters const&, - cudaStream_t); +void entryPoint( + EventInputDataGPU const&, + EventOutputDataGPU&, EventDataForScratchGPU&, + ConditionsProducts const&, ConfigurationParameters const&, + cudaStream_t); - } -} // namespace ecal +}} #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h index b162f9b1c9784..d769f65ed0735 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h @@ -6,43 +6,40 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" -namespace ecal { - namespace multifit { - - constexpr int SampleVectorSize = 10; - constexpr int FullSampleVectorSize = 19; - constexpr int 
PulseVectorSize = 12; - constexpr int NGains = 3; - - using data_type = ::ecal::reco::ComputationScalarType; - - typedef Eigen::Matrix PulseMatrixType; - typedef Eigen::Matrix BXVectorType; - using SampleMatrixD = Eigen::Matrix; - - typedef Eigen::Matrix SampleVector; - typedef Eigen::Matrix FullSampleVector; - typedef Eigen::Matrix PulseVector; - typedef Eigen::Matrix BXVector; - typedef Eigen::Matrix SampleGainVector; - typedef Eigen::Matrix SampleMatrix; - typedef Eigen::Matrix FullSampleMatrix; - typedef Eigen::Matrix PulseMatrix; - typedef Eigen::Matrix - SamplePulseMatrix; - typedef Eigen::LLT SampleDecompLLT; - typedef Eigen::LLT SampleDecompLLTD; - typedef Eigen::LLT PulseDecompLLT; - typedef Eigen::LDLT PulseDecompLDLT; - - typedef Eigen::Matrix SingleMatrix; - typedef Eigen::Matrix SingleVector; - - typedef std::array SampleMatrixGainArray; - - using PermutationMatrix = Eigen::PermutationMatrix; - - } // namespace multifit -} // namespace ecal +namespace ecal { namespace multifit { + +constexpr int SampleVectorSize = 10; +constexpr int FullSampleVectorSize = 19; +constexpr int PulseVectorSize = 12; +constexpr int NGains = 3; + +using data_type = ::ecal::reco::ComputationScalarType; + +typedef Eigen::Matrix PulseMatrixType; +typedef Eigen::Matrix BXVectorType; +using SampleMatrixD = Eigen::Matrix; + +typedef Eigen::Matrix SampleVector; +typedef Eigen::Matrix FullSampleVector; +typedef Eigen::Matrix PulseVector; +typedef Eigen::Matrix BXVector; +typedef Eigen::Matrix SampleGainVector; +typedef Eigen::Matrix SampleMatrix; +typedef Eigen::Matrix FullSampleMatrix; +typedef Eigen::Matrix PulseMatrix; +typedef Eigen::Matrix SamplePulseMatrix; +typedef Eigen::LLT SampleDecompLLT; +typedef Eigen::LLT SampleDecompLLTD; +typedef Eigen::LLT PulseDecompLLT; +typedef Eigen::LDLT PulseDecompLDLT; + +typedef Eigen::Matrix SingleMatrix; +typedef Eigen::Matrix SingleVector; + +typedef std::array SampleMatrixGainArray; + +using PermutationMatrix = Eigen::PermutationMatrix; 
+ +}} #endif diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu index bc2b1300123dd..83a3e2b39ed0b 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu @@ -16,68 +16,95 @@ #include "inplace_fnnls.h" #include "KernelHelpers.h" -namespace ecal { - namespace multifit { - - /// - /// assume kernel launch configuration is - /// (MAXSAMPLES * nchannels, blocks) - /// TODO: is there a point to split this kernel further to separate reductions - /// - __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in, - uint16_t const* digis_in, - uint32_t const* dids, - SampleVector* amplitudes, - SampleVector* amplitudesForMinimization, - SampleGainVector* gainsNoise, - float const* mean_x1, - float const* mean_x12, - float const* rms_x12, - float const* mean_x6, - float const* gain6Over1, - float const* gain12Over6, - bool* hasSwitchToGain6, - bool* hasSwitchToGain1, - bool* isSaturated, - ::ecal::reco::StorageScalarType* energies, - ::ecal::reco::StorageScalarType* chi2, - ::ecal::reco::StorageScalarType* g_pedestal, - uint32_t* flags, - char* acState, - BXVectorType* bxs, - uint32_t const offsetForHashes, - bool const gainSwitchUseMaxSampleEB, - bool const gainSwitchUseMaxSampleEE, - int const nchannels) { - constexpr bool dynamicPedestal = false; //---- default to false, ok - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - constexpr int sample_max = 5; - constexpr int full_pulse_max = 9; - int const tx = threadIdx.x + blockIdx.x * blockDim.x; - int const nchannels_per_block = blockDim.x / nsamples; - int const total_threads = nchannels * nsamples; - int const ch = tx / nsamples; - int const sample = threadIdx.x % nsamples; - - if (ch < nchannels) { +namespace ecal { namespace multifit { + +/// +/// assume kernel launch configuration is +/// (MAXSAMPLES 
* nchannels, blocks) +/// +__global__ +void kernel_prep_1d_and_initialize( + EcalPulseShape const* shapes_in, + uint16_t const* digis_in_eb, + uint32_t const* dids_eb, + uint16_t const* digis_in_ee, + uint32_t const* dids_ee, + SampleVector* amplitudes, + SampleVector* amplitudesForMinimization, + SampleGainVector* gainsNoise, + float const* mean_x1, + float const* mean_x12, + float const* rms_x12, + float const* mean_x6, + float const* gain6Over1, + float const* gain12Over6, + bool* hasSwitchToGain6, + bool* hasSwitchToGain1, + bool* isSaturated, + ::ecal::reco::StorageScalarType* energies, + ::ecal::reco::StorageScalarType* chi2, + ::ecal::reco::StorageScalarType* g_pedestal, + uint32_t *dids_out, + uint32_t *flags, + char* acState, + BXVectorType *bxs, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + bool const gainSwitchUseMaxSampleEB, + bool const gainSwitchUseMaxSampleEE, + int const nchannels) { + constexpr bool dynamicPedestal = false; //---- default to false, ok + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + constexpr int sample_max = 5; + constexpr int full_pulse_max = 9; + int const tx = threadIdx.x + blockIdx.x*blockDim.x; + int const nchannels_per_block = blockDim.x / nsamples; + int const total_threads = nchannels * nsamples; + int const ch = tx / nsamples; + // for accessing input arrays + int const inputCh = ch >= offsetForInputs + ? ch - offsetForInputs + : ch; + int const inputTx = ch >= offsetForInputs + ? tx - offsetForInputs*10 + : tx; + // eb is first and then ee + auto const* digis_in = ch >= offsetForInputs + ? digis_in_ee + : digis_in_eb; + auto const* dids = ch >= offsetForInputs + ? 
dids_ee + : dids_eb; + int const sample = threadIdx.x % nsamples; + + if (ch < nchannels) { // array of 10 x channels per block // TODO: any other way of doing simple reduction // assume bool is 1 byte, should be quite safe extern __shared__ char shared_mem[]; - bool* shr_hasSwitchToGain6 = reinterpret_cast(shared_mem); - bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples; - bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples; - bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples; - bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples; - char* shr_counts = reinterpret_cast(shr_hasSwitchToGain0_tmp) + nchannels_per_block * nsamples; + bool* shr_hasSwitchToGain6 = reinterpret_cast( + shared_mem); + bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + + nchannels_per_block*nsamples; + bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + + nchannels_per_block*nsamples; + bool* shr_isSaturated = shr_hasSwitchToGain0 + + nchannels_per_block*nsamples; + bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + + nchannels_per_block*nsamples; + char* shr_counts = reinterpret_cast( + shr_hasSwitchToGain0_tmp) + nchannels_per_block*nsamples; // // indices // - auto const did = DetId{dids[ch]}; + auto const did = DetId{dids[inputCh]}; auto const isBarrel = did.subdetId() == EcalBarrel; // TODO offset for ee, 0 for eb - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel + ? 
hashedIndexEB(did.rawId()) + : offsetForHashes + hashedIndexEE(did.rawId()); + // // pulse shape template @@ -86,15 +113,15 @@ namespace ecal { isample+=nsamples) shapes_out[ch](isample + 7) = shapes_in[hashedId].pdfval[isample]; */ - + // will be used in the future for setting state auto const rmsForChecking = rms_x12[hashedId]; // // amplitudes // - int const adc = ecal::mgpa::adc(digis_in[tx]); - int const gainId = ecal::mgpa::gainId(digis_in[tx]); + int const adc = ecal::mgpa::adc(digis_in[inputTx]); + int const gainId = ecal::mgpa::gainId(digis_in[inputTx]); SampleVector::Scalar amplitude = 0.; SampleVector::Scalar pedestal = 0.; SampleVector::Scalar gainratio = 0.; @@ -106,12 +133,13 @@ namespace ecal { shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x]; shr_counts[threadIdx.x] = 0; __syncthreads(); - + // non-divergent branch (except for the last 4 threads) - if (threadIdx.x <= blockDim.x - 5) { -#pragma unroll - for (int i = 0; i < 5; i++) - shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i]; + if (threadIdx.x<=blockDim.x-5) { + #pragma unroll + for (int i=0; i<5; i++) + shr_counts[threadIdx.x] += + shr_hasSwitchToGain0[threadIdx.x+i]; } shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5; @@ -120,89 +148,102 @@ namespace ecal { // TODO // if (sample < 5) { - shr_hasSwitchToGain6[threadIdx.x] = - shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5]; - shr_hasSwitchToGain1[threadIdx.x] = - shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5]; - - // duplication of hasSwitchToGain0 in order not to - // introduce another syncthreads - shr_hasSwitchToGain0_tmp[threadIdx.x] = - shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5]; + shr_hasSwitchToGain6[threadIdx.x] = + shr_hasSwitchToGain6[threadIdx.x] || + shr_hasSwitchToGain6[threadIdx.x + 5]; + shr_hasSwitchToGain1[threadIdx.x] = + shr_hasSwitchToGain1[threadIdx.x] || + 
shr_hasSwitchToGain1[threadIdx.x + 5]; + + // duplication of hasSwitchToGain0 in order not to + // introduce another syncthreads + shr_hasSwitchToGain0_tmp[threadIdx.x] = + shr_hasSwitchToGain0_tmp[threadIdx.x] || + shr_hasSwitchToGain0_tmp[threadIdx.x+5]; } __syncthreads(); - - if (sample < 2) { - // note, both threads per channel take value [3] twice to avoid another if - shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] || - shr_hasSwitchToGain6[threadIdx.x + 2] || - shr_hasSwitchToGain6[threadIdx.x + 3]; - shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] || - shr_hasSwitchToGain1[threadIdx.x + 2] || - shr_hasSwitchToGain1[threadIdx.x + 3]; - - shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] || - shr_hasSwitchToGain0_tmp[threadIdx.x + 2] || - shr_hasSwitchToGain0_tmp[threadIdx.x + 3]; - - // sample < 2 -> first 2 threads of each channel will be used here - // => 0 -> will compare 3 and 4 and put into 0 - // => 1 -> will compare 4 and 5 and put into 1 - shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4]; + + if (sample<2) { + // note, both threads per channel take value [3] twice to avoid another if + shr_hasSwitchToGain6[threadIdx.x] = + shr_hasSwitchToGain6[threadIdx.x] || + shr_hasSwitchToGain6[threadIdx.x+2] || + shr_hasSwitchToGain6[threadIdx.x+3]; + shr_hasSwitchToGain1[threadIdx.x] = + shr_hasSwitchToGain1[threadIdx.x] || + shr_hasSwitchToGain1[threadIdx.x+2] || + shr_hasSwitchToGain1[threadIdx.x+3]; + + shr_hasSwitchToGain0_tmp[threadIdx.x] = + shr_hasSwitchToGain0_tmp[threadIdx.x] || + shr_hasSwitchToGain0_tmp[threadIdx.x+2] || + shr_hasSwitchToGain0_tmp[threadIdx.x+3]; + + // sample < 2 -> first 2 threads of each channel will be used here + // => 0 -> will compare 3 and 4 and put into 0 + // => 1 -> will compare 4 and 5 and put into 1 + shr_isSaturated[threadIdx.x] = + shr_isSaturated[threadIdx.x+3] || shr_isSaturated[threadIdx.x+4]; } 
__syncthreads(); bool check_hasSwitchToGain0 = false; - if (sample == 0) { - shr_hasSwitchToGain6[threadIdx.x] = - shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1]; - shr_hasSwitchToGain1[threadIdx.x] = - shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1]; - shr_hasSwitchToGain0_tmp[threadIdx.x] = - shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1]; - - hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x]; - hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x]; - - // set only for the threadIdx.x corresponding to sample==0 - check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x]; - - shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1]; - isSaturated[ch] = shr_isSaturated[threadIdx.x + 3]; + if (sample==0) { + shr_hasSwitchToGain6[threadIdx.x] = + shr_hasSwitchToGain6[threadIdx.x] || + shr_hasSwitchToGain6[threadIdx.x+1]; + shr_hasSwitchToGain1[threadIdx.x] = + shr_hasSwitchToGain1[threadIdx.x] || + shr_hasSwitchToGain1[threadIdx.x+1]; + shr_hasSwitchToGain0_tmp[threadIdx.x] = + shr_hasSwitchToGain0_tmp[threadIdx.x] || + shr_hasSwitchToGain0_tmp[threadIdx.x+1]; + + hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x]; + hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x]; + + // set only for the threadIdx.x corresponding to sample==0 + check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x]; + + shr_isSaturated[threadIdx.x+3] = + shr_isSaturated[threadIdx.x] || + shr_isSaturated[threadIdx.x+1]; + isSaturated[ch] = shr_isSaturated[threadIdx.x+3]; } // TODO: w/o this sync, there is a race // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample // check if we can remove it __syncthreads(); - + // TODO: divergent branch - if (gainId == 0 || gainId == 3) { - pedestal = mean_x1[hashedId]; - gainratio = gain6Over1[hashedId] * gain12Over6[hashedId]; - gainsNoise[ch](sample) = 2; - } else if 
(gainId == 1) { - pedestal = mean_x12[hashedId]; - gainratio = 1.; - gainsNoise[ch](sample) = 0; - } else if (gainId == 2) { - pedestal = mean_x6[hashedId]; - gainratio = gain12Over6[hashedId]; - gainsNoise[ch](sample) = 1; + if (gainId==0 || gainId==3) { + pedestal = mean_x1[hashedId]; + gainratio = gain6Over1[hashedId] * gain12Over6[hashedId]; + gainsNoise[ch](sample) = 2; + } else if (gainId==1) { + pedestal = mean_x12[hashedId]; + gainratio = 1.; + gainsNoise[ch](sample) = 0; + } else if (gainId==2) { + pedestal = mean_x6[hashedId]; + gainratio = gain12Over6[hashedId]; + gainsNoise[ch](sample) = 1; } - + // TODO: compile time constant -> branch should be non-divergent if (dynamicPedestal) - amplitude = static_cast(adc) * gainratio; + amplitude = static_cast(adc) * gainratio; else - amplitude = (static_cast(adc) - pedestal) * gainratio; + amplitude = (static_cast(adc) - pedestal) * gainratio; amplitudes[ch][sample] = amplitude; #ifdef ECAL_RECO_CUDA_DEBUG - printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio); - if (adc == 0) - printf("adc is zero\n"); + printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, + pedestal, gainratio); + if (adc==0) + printf("adc is zero\n"); #endif // @@ -211,289 +252,325 @@ namespace ecal { amplitudesForMinimization[ch](sample) = 0; bxs[ch](sample) = sample - 5; - // select the thread for the max sample + // select the thread for the max sample //---> hardcoded above to be 5th sample, ok if (sample == sample_max) { - // - // initialization - // - acState[ch] = static_cast(MinimizationState::NotFinished); - energies[ch] = 0; - chi2[ch] = 0; - g_pedestal[ch] = 0; - uint32_t flag = 0; - - // start of this channel in shared mem - int const chStart = threadIdx.x - sample_max; - // thread for the max sample in shared mem - int const threadMax = threadIdx.x; - auto const gainSwitchUseMaxSample = isBarrel ? 
gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE; - - // this flag setting is applied to all of the cases - if (shr_hasSwitchToGain6[chStart]) - flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6; - if (shr_hasSwitchToGain1[chStart]) - flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1; - - // this corresponds to cpu branching on lastSampleBeforeSaturation - // likely false - if (check_hasSwitchToGain0) { - // assign for the case some sample having gainId == 0 - //energies[ch] = amplitudes[ch][sample_max]; - energies[ch] = amplitude; - - // check if samples before sample_max have true - bool saturated_before_max = false; -#pragma unroll - for (char ii = 0; ii < 5; ii++) - saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii]; - - // if saturation is in the max sample and not in the first 5 - if (!saturated_before_max && shr_hasSwitchToGain0[threadMax]) - energies[ch] = 49140; // 4095 * 12 - //---- AM FIXME : no pedestal subtraction??? - //It should be "(4095. - pedestal) * gainratio" - - // set state flag to terminate further processing of this channel - acState[ch] = static_cast(MinimizationState::Precomputed); - flag |= 0x1 << EcalUncalibratedRecHit::kSaturated; + // + // initialization + // + acState[ch] = static_cast(MinimizationState::NotFinished); + energies[ch] = 0; + chi2[ch] = 0; + g_pedestal[ch] = 0; + uint32_t flag = 0; + dids_out[ch] = did.rawId(); + + // start of this channel in shared mem + int const chStart = threadIdx.x - sample_max; + // thread for the max sample in shared mem + int const threadMax = threadIdx.x; + auto const gainSwitchUseMaxSample = isBarrel + ? 
gainSwitchUseMaxSampleEB + : gainSwitchUseMaxSampleEE; + + // this flag setting is applied to all of the cases + if (shr_hasSwitchToGain6[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6; + if (shr_hasSwitchToGain1[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1; + + // this corresponds to cpu branching on lastSampleBeforeSaturation + // likely false + if (check_hasSwitchToGain0) { + // assign for the case some sample having gainId == 0 + //energies[ch] = amplitudes[ch][sample_max]; + energies[ch] = amplitude; + + // check if samples before sample_max have true + bool saturated_before_max = false; + #pragma unroll + for (char ii=0; ii<5; ii++) + saturated_before_max = saturated_before_max || + shr_hasSwitchToGain0[chStart + ii]; + + // if saturation is in the max sample and not in the first 5 + if (!saturated_before_max && + shr_hasSwitchToGain0[threadMax]) + energies[ch] = 49140; // 4095 * 12 + //---- AM FIXME : no pedestal subtraction??? + //It should be "(4095. 
- pedestal) * gainratio" + + // set state flag to terminate further processing of this channel + acState[ch] = static_cast(MinimizationState::Precomputed); + flag |= 0x1 << EcalUncalibratedRecHit::kSaturated; + flags[ch] = flag; + return; + } + + // according to cpu version +// auto max_amplitude = amplitudes[ch][sample_max]; + auto const max_amplitude = amplitude; + // according to cpu version + auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max-7]; + // note, no syncing as the same thread will be accessing here + bool hasGainSwitch = shr_hasSwitchToGain6[chStart] + || shr_hasSwitchToGain1[chStart] + || shr_isSaturated[chStart+3]; + + // pedestal is final unconditionally + g_pedestal[ch] = pedestal; + if (hasGainSwitch && gainSwitchUseMaxSample) { + // thread for sample=0 will access the right guys + energies[ch] = max_amplitude / shape_value; + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[ch] = flag; + return; + } + + // this happens cause sometimes rms_x12 is 0... 
+ // needs to be checkec why this is the case + // general case here is that noisecov is a Zero matrix + if (rmsForChecking == 0) { + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[ch] = flag; + return; + } + + // for the case when no shortcuts were taken flags[ch] = flag; - return; - } - - // according to cpu version - // auto max_amplitude = amplitudes[ch][sample_max]; - auto const max_amplitude = amplitude; - // according to cpu version - auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7]; - // note, no syncing as the same thread will be accessing here - bool hasGainSwitch = - shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3]; - - // pedestal is final unconditionally - g_pedestal[ch] = pedestal; - if (hasGainSwitch && gainSwitchUseMaxSample) { - // thread for sample=0 will access the right guys - energies[ch] = max_amplitude / shape_value; - acState[ch] = static_cast(MinimizationState::Precomputed); - flags[ch] = flag; - return; - } - - // this happens cause sometimes rms_x12 is 0... 
- // needs to be checkec why this is the case - // general case here is that noisecov is a Zero matrix - if (rmsForChecking == 0) { - acState[ch] = static_cast(MinimizationState::Precomputed); - flags[ch] = flag; - return; - } - - // for the case when no shortcuts were taken - flags[ch] = flag; } - } } +} - /// - /// assume kernel launch configuration is - /// ([MAXSAMPLES, MAXSAMPLES], nchannels) - /// - __global__ void kernel_prep_2d(EcalPulseCovariance const* pulse_cov_in, - FullSampleMatrix* pulse_cov_out, - SampleGainVector const* gainNoise, - uint32_t const* dids, - float const* rms_x12, - float const* rms_x6, - float const* rms_x1, - float const* gain12Over6, - float const* gain6Over1, - double const* G12SamplesCorrelationEB, - double const* G6SamplesCorrelationEB, - double const* G1SamplesCorrelationEB, - double const* G12SamplesCorrelationEE, - double const* G6SamplesCorrelationEE, - double const* G1SamplesCorrelationEE, - SampleMatrix* noisecov, - PulseMatrixType* pulse_matrix, - EcalPulseShape const* pulse_shape, - bool const* hasSwitchToGain6, - bool const* hasSwitchToGain1, - bool const* isSaturated, - uint32_t const offsetForHashes) { - int ch = blockIdx.x; - int tx = threadIdx.x; - int ty = threadIdx.y; - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - constexpr float addPedestalUncertainty = 0.f; - constexpr bool dynamicPedestal = false; - constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true - constexpr int template_samples = EcalPulseShape::TEMPLATESAMPLES; - - bool tmp0 = hasSwitchToGain6[ch]; - bool tmp1 = hasSwitchToGain1[ch]; - auto const did = DetId{dids[ch]}; - auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); - auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE; - auto const* G6SamplesCorrelation = isBarrel ? 
G6SamplesCorrelationEB : G6SamplesCorrelationEE; - auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE; - bool tmp2 = isSaturated[ch]; - bool hasGainSwitch = tmp0 || tmp1 || tmp2; - auto const vidx = ecal::abs(ty - tx); - - // only ty == 0 and 1 will go for a second iteration - for (int iy = ty; iy < template_samples; iy += nsamples) - for (int ix = tx; ix < template_samples; ix += nsamples) - pulse_cov_out[ch](iy + 7, ix + 7) = pulse_cov_in[hashedId].covval[iy][ix]; - - // non-divergent branch for all threads per block - if (hasGainSwitch) { +/// +/// assume kernel launch configuration is +/// ([MAXSAMPLES, MAXSAMPLES], nchannels) +/// +__global__ +void kernel_prep_2d(SampleGainVector const* gainNoise, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* gain12Over6, + float const* gain6Over1, + double const* G12SamplesCorrelationEB, + double const* G6SamplesCorrelationEB, + double const* G1SamplesCorrelationEB, + double const* G12SamplesCorrelationEE, + double const* G6SamplesCorrelationEE, + double const* G1SamplesCorrelationEE, + SampleMatrix* noisecov, + PulseMatrixType* pulse_matrix, + EcalPulseShape const* pulse_shape, + bool const* hasSwitchToGain6, + bool const* hasSwitchToGain1, + bool const* isSaturated, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs) { + int const ch = blockIdx.x; + int const tx = threadIdx.x; + int const ty = threadIdx.y; + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + constexpr float addPedestalUncertainty = 0.f; + constexpr bool dynamicPedestal = false; + constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true + constexpr int template_samples = EcalPulseShape::TEMPLATESAMPLES; + + // to access input arrays (ids and digis only) + int const inputCh = ch >= offsetForInputs + ? ch - offsetForInputs + : ch; + auto const* dids = ch >= offsetForInputs + ? 
dids_ee + : dids_eb; + + bool tmp0 = hasSwitchToGain6[ch]; + bool tmp1 = hasSwitchToGain1[ch]; + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel + ? hashedIndexEB(did.rawId()) + : offsetForHashes + hashedIndexEE(did.rawId()); + auto const G12SamplesCorrelation = isBarrel + ? G12SamplesCorrelationEB + : G12SamplesCorrelationEE; + auto const* G6SamplesCorrelation = isBarrel + ? G6SamplesCorrelationEB + : G6SamplesCorrelationEE; + auto const* G1SamplesCorrelation = isBarrel + ? G1SamplesCorrelationEB + : G1SamplesCorrelationEE; + bool tmp2 = isSaturated[ch]; + bool hasGainSwitch = tmp0 || tmp1 || tmp2; + auto const vidx = ecal::abs(ty - tx); + + // non-divergent branch for all threads per block + if (hasGainSwitch) { // TODO: did not include simplified noise model float noise_value = 0; // non-divergent branch - all threads per block - // TODO: all of these constants indicate that - // that these parts could be splitted into completely different + // TODO: all of these constants indicate that + // that these parts could be splitted into completely different // kernels and run one of them only depending on the config if (simplifiedNoiseModelForGainSwitch) { - int isample_max = 5; // according to cpu defs - int gainidx = gainNoise[ch][isample_max]; - - // non-divergent branches - if (gainidx == 0) - //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx); - noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx]; - if (gainidx == 1) - // noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch] - // *noisecorrs[1](ty, tx); - noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] * - G6SamplesCorrelation[vidx]; - if (gainidx == 2) - // noise_value = gain12Over6[ch]*gain12Over6[ch] - // * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch] - // * noisecorrs[2](ty, tx); - noise_value = gain12Over6[hashedId] * 
gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] * - rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx]; - if (!dynamicPedestal && addPedestalUncertainty > 0.f) - noise_value += addPedestalUncertainty * addPedestalUncertainty; + int isample_max = 5; // according to cpu defs + int gainidx = gainNoise[ch][isample_max]; + + // non-divergent branches + if (gainidx==0) + //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx); + noise_value = rms_x12[hashedId]*rms_x12[hashedId] + * G12SamplesCorrelation[vidx]; + if (gainidx==1) +// noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch] +// *noisecorrs[1](ty, tx); + noise_value = gain12Over6[hashedId]*gain12Over6[hashedId] + * rms_x6[hashedId]*rms_x6[hashedId] + * G6SamplesCorrelation[vidx]; + if (gainidx==2) +// noise_value = gain12Over6[ch]*gain12Over6[ch] +// * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch] +// * noisecorrs[2](ty, tx); + noise_value = gain12Over6[hashedId]*gain12Over6[hashedId] + * gain6Over1[hashedId]*gain6Over1[hashedId] + * rms_x1[hashedId]*rms_x1[hashedId] + * G1SamplesCorrelation[vidx]; + if (!dynamicPedestal && addPedestalUncertainty>0.f) + noise_value += addPedestalUncertainty*addPedestalUncertainty; } else { - int gainidx = 0; - char mask = gainidx; - int pedestal = gainNoise[ch][ty] == mask ? 1 : 0; - // noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch] - // *pedestal*noisecorrs[0](ty, tx); - noise_value += - /* gainratio is 1*/ rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx]; - // non-divergent branch - if (!dynamicPedestal && addPedestalUncertainty > 0.f) { - noise_value += /* gainratio is 1 */ - addPedestalUncertainty * addPedestalUncertainty * pedestal; - } - - // - gainidx = 1; - mask = gainidx; - pedestal = gainNoise[ch][ty] == mask ? 
1 : 0; - // noise_value += gain12Over6[ch]*gain12Over6[ch] - // *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx); - noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] * - pedestal * G6SamplesCorrelation[vidx]; - // non-divergent branch - if (!dynamicPedestal && addPedestalUncertainty > 0.f) { - noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty * - addPedestalUncertainty * pedestal; - } - - // - gainidx = 2; - mask = gainidx; - pedestal = gainNoise[ch][ty] == mask ? 1 : 0; - float tmp = gain6Over1[hashedId] * gain12Over6[hashedId]; - // noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch] - // *pedestal*noisecorrs[2](ty, tx); - noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx]; - // non-divergent branch - if (!dynamicPedestal && addPedestalUncertainty > 0.f) { - noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal; - } + int gainidx=0; + char mask = gainidx; + int pedestal = gainNoise[ch][ty] == mask ? 1 : 0; +// noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch] +// *pedestal*noisecorrs[0](ty, tx); + noise_value += /* gainratio is 1*/ rms_x12[hashedId]*rms_x12[hashedId] + * pedestal* G12SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty>0.f) { + noise_value += /* gainratio is 1 */ + addPedestalUncertainty*addPedestalUncertainty*pedestal; + } + + // + gainidx=1; + mask = gainidx; + pedestal = gainNoise[ch][ty] == mask ? 
1 : 0; +// noise_value += gain12Over6[ch]*gain12Over6[ch] +// *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx); + noise_value += gain12Over6[hashedId]*gain12Over6[hashedId] + *rms_x6[hashedId]*rms_x6[hashedId]*pedestal + * G6SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty>0.f) { + noise_value += gain12Over6[hashedId]*gain12Over6[hashedId] + *addPedestalUncertainty*addPedestalUncertainty + *pedestal; + } + + // + gainidx=2; + mask = gainidx; + pedestal = gainNoise[ch][ty] == mask ? 1 : 0; + float tmp = gain6Over1[hashedId] * gain12Over6[hashedId]; +// noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch] +// *pedestal*noisecorrs[2](ty, tx); + noise_value += tmp*tmp * rms_x1[hashedId]*rms_x1[hashedId] + *pedestal* G1SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty>0.f) { + noise_value += tmp*tmp * addPedestalUncertainty*addPedestalUncertainty + * pedestal; + } } noisecov[ch](ty, tx) = noise_value; - } else { + } else { auto rms = rms_x12[hashedId]; - float noise_value = rms * rms * G12SamplesCorrelation[vidx]; - if (!dynamicPedestal && addPedestalUncertainty > 0.f) { - //---- add fully correlated component to noise covariance to inflate pedestal uncertainty - noise_value += addPedestalUncertainty * addPedestalUncertainty; + float noise_value = rms*rms * G12SamplesCorrelation[vidx]; + if (!dynamicPedestal && addPedestalUncertainty>0.f) { + //---- add fully correlated component to noise covariance to inflate pedestal uncertainty + noise_value += addPedestalUncertainty*addPedestalUncertainty; } noisecov[ch](ty, tx) = noise_value; - } - - // pulse matrix - // int const bx = tx - 5; // -5 -4 -3 ... 3 4 - // int bx = (*bxs)(tx); - // int const offset = 7 - 3 - bx; - int const posToAccess = 9 - tx + ty; // see cpu for reference - float const value = posToAccess >= 7 ? 
pulse_shape[hashedId].pdfval[posToAccess - 7] : 0; - pulse_matrix[ch](ty, tx) = value; } - __global__ void kernel_permute_results(SampleVector* amplitudes, - BXVectorType const* activeBXs, - ::ecal::reco::StorageScalarType* energies, - char const* acState, - int const nchannels) { - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const tx = threadIdx.x + blockIdx.x * blockDim.x; - int const ch = tx / nsamples; - int const iii = tx % nsamples; // this is to address activeBXs - - if (ch >= nchannels) + // pulse matrix +// int const bx = tx - 5; // -5 -4 -3 ... 3 4 +// int bx = (*bxs)(tx); +// int const offset = 7 - 3 - bx; + int const posToAccess = 9 - tx + ty; // see cpu for reference + float const value = posToAccess>=7 + ? pulse_shape[hashedId].pdfval[posToAccess-7] + : 0; + pulse_matrix[ch](ty, tx) = value; +} + +__global__ +void kernel_permute_results( + SampleVector *amplitudes, + BXVectorType const*activeBXs, + ::ecal::reco::StorageScalarType *energies, + char const* acState, + int const nchannels) { + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const tx = threadIdx.x + blockIdx.x * blockDim.x; + int const ch = tx / nsamples; + int const iii = tx % nsamples; // this is to address activeBXs + + if (ch >= nchannels) return; + + // channels that have amplitude precomputed do not need results to be permuted + auto const state = static_cast(acState[ch]); + if (static_cast(acState[ch]) == + MinimizationState::Precomputed) return; - // channels that have amplitude precomputed do not need results to be permuted - auto const state = static_cast(acState[ch]); - if (static_cast(acState[ch]) == MinimizationState::Precomputed) - return; + // configure shared memory and cp into it + extern __shared__ char smem[]; + SampleVector::Scalar* values = reinterpret_cast( + smem); + values[threadIdx.x] = amplitudes[ch](iii); + __syncthreads(); - // configure shared memory and cp into it - extern 
__shared__ char smem[]; - SampleVector::Scalar* values = reinterpret_cast(smem); - values[threadIdx.x] = amplitudes[ch](iii); - __syncthreads(); + // get the sample for this bx + auto const sample = static_cast(activeBXs[ch](iii)) + 5; - // get the sample for this bx - auto const sample = static_cast(activeBXs[ch](iii)) + 5; + // store back to global + amplitudes[ch](sample) = values[threadIdx.x]; - // store back to global - amplitudes[ch](sample) = values[threadIdx.x]; - - // store sample 5 separately - // only for the case when minimization was performed - // not for cases with precomputed amplitudes - if (sample == 5) + // store sample 5 separately + // only for the case when minimization was performed + // not for cases with precomputed amplitudes + if (sample == 5) energies[ch] = values[threadIdx.x]; - } +} /// /// Build an Ecal RecHit. /// TODO: Use SoA data structures on the host directly -/// the reason for removing this from minimize kernel is to isolate the minimize + +/// the reason for removing this from minimize kernel is to isolate the minimize + /// again, building an aos rec hit involves strides... 
-> bad memory access pattern /// #ifdef RUN_BUILD_AOS_RECHIT - __global__ void kernel_build_rechit( - float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < nchannels) { - rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], 0, 0, chi2s[idx], 0}; - } +__global__ +void kernel_build_rechit( + float const* energies, + float const* chi2s, + uint32_t* dids, + EcalUncalibratedRecHit* rechits, + int nchannels) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < nchannels) { + rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], + 0, 0, chi2s[idx], 0}; } +} #endif - } // namespace multifit -} // namespace ecal +}} diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h index 6a3bc9ac43795..4b01e056fe0a8 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.h @@ -6,89 +6,100 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/Common.h" class EcalPulseShape; -// this flag setting is applied to all of the cases + // this flag setting is applied to all of the cases class EcalPulseCovariance; class EcalUncalibratedRecHit; -namespace ecal { - namespace multifit { +namespace ecal { namespace multifit { - /// - /// assume kernel launch configuration is - /// (MAXSAMPLES * nchannels, blocks) - /// TODO: is there a point to split this kernel further to separate reductions - /// - __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in, - uint16_t const* digis_in, - uint32_t const* dids, - SampleVector* amplitudes, - SampleVector* amplitudesForMinimization, - SampleGainVector* gainsNoise, - float const* mean_x1, - float const* mean_x12, - float const* rms_x12, - float const* mean_x6, - float const* gain6Over1, - float const* 
gain12Over6, - bool* hasSwitchToGain6, - bool* hasSwitchToGain1, - bool* isSaturated, - ::ecal::reco::StorageScalarType* energies, - ::ecal::reco::StorageScalarType* chi2, - ::ecal::reco::StorageScalarType* pedestal, - uint32_t* flags, - char* acState, - BXVectorType* bxs, - uint32_t offsetForHashes, - bool const gainSwitchUseMaxSampleEB, - bool const gainSwitchUseMaxSampleEE, - int const nchannels); +/// +/// assume kernel launch configuration is +/// (MAXSAMPLES * nchannels, blocks) +/// TODO: is there a point to split this kernel further to separate reductions +/// +__global__ +void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in, + uint16_t const* digis_in_eb, + uint32_t const* dids_eb, + uint16_t const* digis_in_ee, + uint32_t const* dids_ee, + SampleVector* amplitudes, + SampleVector* amplitudesForMinimization, + SampleGainVector* gainsNoise, + float const* mean_x1, + float const* mean_x12, + float const* rms_x12, + float const* mean_x6, + float const* gain6Over1, + float const* gain12Over6, + bool* hasSwitchToGain6, + bool* hasSwitchToGain1, + bool* isSaturated, + ::ecal::reco::StorageScalarType* energies, + ::ecal::reco::StorageScalarType* chi2, + ::ecal::reco::StorageScalarType* pedestal, + uint32_t *dids_out, + uint32_t *flags, + char* acState, + BXVectorType *bxs, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + bool const gainSwitchUseMaxSampleEB, + bool const gainSwitchUseMaxSampleEE, + int const nchannels); - /// - /// assume kernel launch configuration is - /// ([MAXSAMPLES, MAXSAMPLES], nchannels) - /// - __global__ void kernel_prep_2d(EcalPulseCovariance const* pulse_cov_in, - FullSampleMatrix* pulse_cov_out, - SampleGainVector const* gainNoise, - uint32_t const* dids, - float const* rms_x12, - float const* rms_x6, - float const* rms_x1, - float const* gain12Over6, - float const* gain6Over1, - double const* G12SamplesCorrelationEB, - double const* G6SamplesCorrelationEB, - double const* G1SamplesCorrelationEB, - 
double const* G12SamplesCorrelationEE, - double const* G6SamplesCorrelationEE, - double const* G1SamplesCorrelationEE, - SampleMatrix* noisecov, - PulseMatrixType* pulse_matrix, - EcalPulseShape const* pulse_shape, - bool const* hasSwitchToGain6, - bool const* hasSwitchToGain1, - bool const* isSaturated, - uint32_t const offsetForHashes); +/// +/// assume kernel launch configuration is +/// ([MAXSAMPLES, MAXSAMPLES], nchannels) +/// +__global__ +void kernel_prep_2d(SampleGainVector const* gainNoise, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* gain12Over6, + float const* gain6Over1, + double const* G12SamplesCorrelationEB, + double const* G6SamplesCorrelationEB, + double const* G1SamplesCorrelationEB, + double const* G12SamplesCorrelationEE, + double const* G6SamplesCorrelationEE, + double const* G1SamplesCorrelationEE, + SampleMatrix* noisecov, + PulseMatrixType* pulse_matrix, + EcalPulseShape const* pulse_shape, + bool const* hasSwitchToGain6, + bool const* hasSwitchToGain1, + bool const* isSaturated, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs); - __global__ void kernel_permute_results(SampleVector* amplitudes, - BXVectorType const* activeBXs, - ::ecal::reco::StorageScalarType* energies, - char const* acState, - int const nchannels); +__global__ +void kernel_permute_results( + SampleVector *amplitudes, + BXVectorType const* activeBXs, + ::ecal::reco::StorageScalarType *energies, + char const* acState, + int const nchannels); /// /// Build an Ecal RecHit. /// TODO: Use SoA data structures on the host directly -/// the reason for removing this from minimize kernel is to isolate the minimize + +/// the reason for removing this from minimize kernel is to isolate the minimize + /// again, building an aos rec hit involves strides... 
-> bad memory access pattern /// #ifdef RUN_BUILD_AOS_RECHIT - __global__ void kernel_build_rechit( - float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels); +__global__ +void kernel_build_rechit( + float const* energies, + float const* chi2s, + uint32_t* dids, + EcalUncalibratedRecHit* rechits, + int nchannels); #endif - } // namespace multifit -} // namespace ecal +}} -#endif // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationCommonKernels +#endif // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationCommonKernels diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu new file mode 100644 index 0000000000000..fb6b396089151 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu @@ -0,0 +1,425 @@ +#include +#include + +#include "cuda.h" + +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_log.h" + +#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" +#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" + +#include "inplace_fnnls.h" +#include "KernelHelpers.h" +#include "AmplitudeComputationKernels.h" +#include "AmplitudeComputationCommonKernels.h" + +namespace ecal { namespace multifit { + +void eigen_solve_submatrix(SampleMatrix& mat, + SampleVector& invec, + SampleVector& outvec, unsigned NP) { + using namespace Eigen; + switch( NP ) { // pulse matrix is always square. 
+ case 10: { + Matrix temp = mat.topLeftCorner<10,10>(); + outvec.head<10>() = temp.ldlt().solve(invec.head<10>()); + break; + } + case 9: { + Matrix temp = mat.topLeftCorner<9,9>(); + outvec.head<9>() = temp.ldlt().solve(invec.head<9>()); + break; + } + case 8: { + Matrix temp = mat.topLeftCorner<8,8>(); + outvec.head<8>() = temp.ldlt().solve(invec.head<8>()); + break; + } + case 7: { + Matrix temp = mat.topLeftCorner<7,7>(); + outvec.head<7>() = temp.ldlt().solve(invec.head<7>()); + break; + } + case 6: { + Matrix temp = mat.topLeftCorner<6,6>(); + outvec.head<6>() = temp.ldlt().solve(invec.head<6>()); + break; + } + case 5: { + Matrix temp = mat.topLeftCorner<5,5>(); + outvec.head<5>() = temp.ldlt().solve(invec.head<5>()); + break; + } + case 4: { + Matrix temp = mat.topLeftCorner<4,4>(); + outvec.head<4>() = temp.ldlt().solve(invec.head<4>()); + break; + } + case 3: { + Matrix temp = mat.topLeftCorner<3,3>(); + outvec.head<3>() = temp.ldlt().solve(invec.head<3>()); + break; + } + case 2: { + Matrix temp = mat.topLeftCorner<2,2>(); + outvec.head<2>() = temp.ldlt().solve(invec.head<2>()); + break; + } + case 1: { + Matrix temp = mat.topLeftCorner<1,1>(); + outvec.head<1>() = temp.ldlt().solve(invec.head<1>()); + break; + } + default: + return; + } +} + +template +__device__ __forceinline__ +bool update_covariance( + EcalPulseCovariance const& pulse_covariance, + MatrixType& inverse_cov, + SampleVector const& amplitudes) { + constexpr int nsamples = SampleVector::RowsAtCompileTime; + constexpr int npulses = BXVectorType::RowsAtCompileTime; + + #pragma unroll + for (unsigned int ipulse=0; ipulse ipulse - firstOffset + int bx = ipulse - 5; + int first_sample_t = std::max(0, bx+3); + int offset = -3 - bx; + + auto const value_sq = amplitude * amplitude; + + unsigned int nsample_pulse = nsamples - first_sample_t; + + for (int col=first_sample_t; col solution vector, what we are fitting for +/// - samples -> raw detector responses +/// - passive constraint - satisfied 
constraint +/// - active constraint - unsatisfied (yet) constraint +/// +__global__ +void kernel_minimize( + uint32_t const* dids_eb, + uint32_t const* dids_ee, + SampleMatrix const* __restrict__ noisecov, + EcalPulseCovariance const* __restrict__ pulse_covariance, + BXVectorType *bxs, + SampleVector const* __restrict__ samples, + SampleVector* amplitudes, + PulseMatrixType const* __restrict__ pulse_matrix, + ::ecal::reco::StorageScalarType* chi2s, + ::ecal::reco::StorageScalarType* energies, + char *acState, + int nchannels, + int max_iterations, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs) { + // FIXME: ecal has 10 samples and 10 pulses.... + // but this needs to be properly treated and renamed everywhere + constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime; + constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime; + static_assert(NSAMPLES == NPULSES); + + using DataType = SampleVector::Scalar; + + extern __shared__ char shrmem[]; + DataType *shrMatrixLForFnnlsStorage = + reinterpret_cast(shrmem) + MapSymM::total * threadIdx.x; + DataType *shrAtAStorage = + reinterpret_cast(shrmem) + MapSymM::total * ( + threadIdx.x + blockDim.x); + + // FIXME: remove eitehr idx or ch -> they are teh same thing + int idx = threadIdx.x + blockDim.x*blockIdx.x; + auto const ch = idx; + if (idx < nchannels) { + if (static_cast(acState[idx]) == + MinimizationState::Precomputed) + return; + + // get the hash + int const inputCh = ch >= offsetForInputs + ? ch - offsetForInputs + : ch; + auto const* dids = ch >= offsetForInputs + ? dids_ee + : dids_eb; + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel + ? 
hashedIndexEB(did.rawId()) + : offsetForHashes + hashedIndexEE(did.rawId()); + + // inits + int iter = 0; + int npassive = 0; + + ColumnVector pulseOffsets; + #pragma unroll + for (int i=0; i resultAmplitudes; + #pragma unroll + for (int counter=0; counter= max_iterations) + break; + + //inverse_cov = noisecov[idx]; + //DataType covMatrixStorage[MapSymM::total]; + DataType* covMatrixStorage = shrMatrixLForFnnlsStorage; + MapSymM covMatrix{covMatrixStorage}; + int counter = 0; + #pragma unroll + for (int col=0; col::total]; + MapSymM matrixL{matrixLStorage}; + compute_decomposition_unrolled(matrixL, covMatrix); + + // L * A = P + ColMajorMatrix A; + solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL); + + // L b = s + float reg_b[NSAMPLES]; + solve_forward_subst_vector(reg_b, samples[idx], matrixL); + + // FIXME: shared mem + //DataType AtAStorage[MapSymM::total]; + MapSymM AtA{shrAtAStorage}; + //SampleMatrix AtA; + SampleVector Atb; + #pragma unroll + for (int icol=0; icol::total]; + MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage}; + + fnnls( + AtA, + Atb, + //amplitudes[idx], + resultAmplitudes, + npassive, + pulseOffsets, + matrixLForFnnls, + 1e-11, + 500 + ); + + { + DataType accum[NSAMPLES]; + // load accum + #pragma unroll + for (int counter=0; counter totalChannels + ? 
1 + : (totalChannels + threads_min - 1) / threads_min; + uint32_t const offsetForHashes = conditions.offsetForHashes; + uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; + auto const nbytesShared = 2 * threads_min * + MapSymM::total * sizeof(DataType); + kernel_minimize<<>>( + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + scratch.noisecov, + conditions.pulseCovariances.values, + scratch.activeBXs, + scratch.samples, + (SampleVector*)eventOutputGPU.amplitudesAll, + scratch.pulse_matrix, + eventOutputGPU.chi2, + eventOutputGPU.amplitude, + scratch.acState, + totalChannels, + 50, + offsetForHashes, + offsetForInputs); + cudaCheck(cudaGetLastError()); +} + +} + +}} diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h new file mode 100644 index 0000000000000..f54fef09b1f17 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.h @@ -0,0 +1,27 @@ +#ifndef RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernels +#define RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernels + +#include "RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h" + +class EcalPulseShape; +class EcalPulseCovariance; +class EcalUncalibratedRecHit; + +namespace ecal { namespace multifit { + +namespace v1 { + +void minimization_procedure( + EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + cudaStream_t cudaStream); + +} + +}} + +#endif // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernelsV1 diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu deleted file mode 100644 index 
880e729c2c72d..0000000000000 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.cu +++ /dev/null @@ -1,372 +0,0 @@ -#include -#include - -#include "cuda.h" - -#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" -#include "DataFormats/Math/interface/approx_exp.h" -#include "DataFormats/Math/interface/approx_log.h" - -#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" -#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h" -#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" - -#include "inplace_fnnls.h" -#include "AmplitudeComputationKernelsV1.h" -#include "AmplitudeComputationCommonKernels.h" - -namespace ecal { - namespace multifit { - - void eigen_solve_submatrix(SampleMatrix& mat, SampleVector& invec, SampleVector& outvec, unsigned NP) { - using namespace Eigen; - switch (NP) { // pulse matrix is always square. - case 10: { - Matrix temp = mat.topLeftCorner<10, 10>(); - outvec.head<10>() = temp.ldlt().solve(invec.head<10>()); - break; - } - case 9: { - Matrix temp = mat.topLeftCorner<9, 9>(); - outvec.head<9>() = temp.ldlt().solve(invec.head<9>()); - break; - } - case 8: { - Matrix temp = mat.topLeftCorner<8, 8>(); - outvec.head<8>() = temp.ldlt().solve(invec.head<8>()); - break; - } - case 7: { - Matrix temp = mat.topLeftCorner<7, 7>(); - outvec.head<7>() = temp.ldlt().solve(invec.head<7>()); - break; - } - case 6: { - Matrix temp = mat.topLeftCorner<6, 6>(); - outvec.head<6>() = temp.ldlt().solve(invec.head<6>()); - break; - } - case 5: { - Matrix temp = mat.topLeftCorner<5, 5>(); - outvec.head<5>() = temp.ldlt().solve(invec.head<5>()); - break; - } - case 4: { - Matrix temp = mat.topLeftCorner<4, 4>(); - outvec.head<4>() = temp.ldlt().solve(invec.head<4>()); - break; - } - case 3: { - Matrix temp = mat.topLeftCorner<3, 3>(); - outvec.head<3>() = temp.ldlt().solve(invec.head<3>()); - break; - } - case 2: { - Matrix temp = mat.topLeftCorner<2, 2>(); - outvec.head<2>() = 
temp.ldlt().solve(invec.head<2>()); - break; - } - case 1: { - Matrix temp = mat.topLeftCorner<1, 1>(); - outvec.head<1>() = temp.ldlt().solve(invec.head<1>()); - break; - } - default: - return; - } - } - -#define PRINT_MATRIX_10x10(M) \ - printf( \ - "%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f " \ - "%f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f " \ - "%f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n%f %f %f %f %f %f %f %f %f %f\n", \ - M(0, 0), \ - M(1, 0), \ - M(2, 0), \ - M(3, 0), \ - M(4, 0), \ - M(5, 0), \ - M(6, 0), \ - M(7, 0), \ - M(8, 0), \ - M(9, 0), \ - M(0, 1), \ - M(1, 1), \ - M(2, 1), \ - M(3, 1), \ - M(4, 1), \ - M(5, 1), \ - M(6, 1), \ - M(7, 1), \ - M(8, 1), \ - M(9, 1), \ - M(0, 2), \ - M(1, 2), \ - M(2, 2), \ - M(3, 2), \ - M(4, 2), \ - M(5, 2), \ - M(6, 2), \ - M(7, 2), \ - M(8, 2), \ - M(9, 2), \ - M(0, 3), \ - M(1, 3), \ - M(2, 3), \ - M(3, 3), \ - M(4, 3), \ - M(5, 3), \ - M(6, 3), \ - M(7, 3), \ - M(8, 3), \ - M(9, 3), \ - M(0, 4), \ - M(1, 4), \ - M(2, 4), \ - M(3, 4), \ - M(4, 4), \ - M(5, 4), \ - M(6, 4), \ - M(7, 4), \ - M(8, 4), \ - M(9, 4), \ - M(0, 5), \ - M(1, 5), \ - M(2, 5), \ - M(3, 5), \ - M(4, 5), \ - M(5, 5), \ - M(6, 5), \ - M(7, 5), \ - M(8, 5), \ - M(9, 5), \ - M(0, 6), \ - M(1, 6), \ - M(2, 6), \ - M(3, 6), \ - M(4, 6), \ - M(5, 6), \ - M(6, 6), \ - M(7, 6), \ - M(8, 6), \ - M(9, 6), \ - M(0, 7), \ - M(1, 7), \ - M(2, 7), \ - M(3, 7), \ - M(4, 7), \ - M(5, 7), \ - M(6, 7), \ - M(7, 7), \ - M(8, 7), \ - M(9, 7), \ - M(0, 8), \ - M(1, 8), \ - M(2, 8), \ - M(3, 8), \ - M(4, 8), \ - M(5, 8), \ - M(6, 8), \ - M(7, 8), \ - M(8, 8), \ - M(9, 8), \ - M(0, 9), \ - M(1, 9), \ - M(2, 9), \ - M(3, 9), \ - M(4, 9), \ - M(5, 9), \ - M(6, 9), \ - M(7, 9), \ - M(8, 9), \ - M(9, 9)) - - __device__ __forceinline__ bool update_covariance(SampleMatrix const& noisecov, - FullSampleMatrix const& 
full_pulse_cov, - SampleMatrix& inverse_cov, - BXVectorType const& bxs, - SampleDecompLLT& covariance_decomposition, - SampleVector const& amplitudes) { - constexpr int nsamples = SampleVector::RowsAtCompileTime; - constexpr int npulses = BXVectorType::RowsAtCompileTime; - - inverse_cov = noisecov; - - for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) { - if (amplitudes.coeff(ipulse) == 0) - continue; - - int bx = bxs.coeff(ipulse); - int first_sample_t = std::max(0, bx + 3); - int offset = 7 - 3 - bx; - - auto const value = amplitudes.coeff(ipulse); - auto const value_sq = value * value; - - unsigned int nsample_pulse = nsamples - first_sample_t; - inverse_cov.block(first_sample_t, first_sample_t, nsample_pulse, nsample_pulse) += - value_sq * - full_pulse_cov.block(first_sample_t + offset, first_sample_t + offset, nsample_pulse, nsample_pulse); - } - - return true; - } - - __device__ __forceinline__ SampleVector::Scalar compute_chi2(SampleDecompLLT& covariance_decomposition, - PulseMatrixType const& pulse_matrix, - SampleVector const& amplitudes, - SampleVector const& samples) { - return covariance_decomposition.matrixL().solve(pulse_matrix * amplitudes - samples).squaredNorm(); - } - - /// - /// launch ctx parameters are (nchannels / block, blocks) - /// TODO: trivial impl for now, there must be a way to improve - /// - /// Conventions: - /// - amplitudes -> solution vector, what we are fitting for - /// - samples -> raw detector responses - /// - passive constraint - satisfied constraint - /// - active constraint - unsatisfied (yet) constraint - /// - __global__ void kernel_minimize(SampleMatrix const* noisecov, - FullSampleMatrix const* full_pulse_cov, - BXVectorType* bxs, - SampleVector const* samples, - SampleVector* amplitudes, - PulseMatrixType* pulse_matrix, - ::ecal::reco::StorageScalarType* chi2s, - char* acState, - int nchannels, - int max_iterations) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < nchannels) { - if 
(static_cast(acState[idx]) == MinimizationState::Precomputed) - return; - - // inits - int iter = 0; - int npassive = 0; - - // inits - SampleDecompLLT covariance_decomposition; - SampleMatrix inverse_cov; - SampleVector::Scalar chi2 = 0, chi2_now = 0; - -#ifdef ECAL_MULTIFIT_KERNEL_MINIMIZE_V1 -// PRINT_MATRIX_10x10(noisecov[idx]); -#endif - - // loop until ocnverge - while (true) { - if (iter >= max_iterations) - break; - - update_covariance( - noisecov[idx], full_pulse_cov[idx], inverse_cov, bxs[idx], covariance_decomposition, amplitudes[idx]); - - // compute actual covariance decomposition - covariance_decomposition.compute(inverse_cov); - - // prepare input matrices for fnnls - SampleMatrix A = covariance_decomposition.matrixL().solve(pulse_matrix[idx]); - SampleVector b = covariance_decomposition.matrixL().solve(samples[idx]); - - inplace_fnnls(A, b, amplitudes[idx], npassive, bxs[idx], pulse_matrix[idx]); - - chi2_now = compute_chi2(covariance_decomposition, pulse_matrix[idx], amplitudes[idx], samples[idx]); - auto deltachi2 = chi2_now - chi2; - -#ifdef ECAL_MULTIFIT_KERNEL_MINIMIZE_V1 - if (iter > 10) { - printf("idx = %d iter = %d chi2 = %f chi2old = %f\n", idx, iter, chi2_now, chi2); - - printf("noisecov(0, i): %f %f %f %f %f %f %f %f %f %f\n", - noisecov[idx](0, 0), - noisecov[idx](0, 1), - noisecov[idx](0, 2), - noisecov[idx](0, 3), - noisecov[idx](0, 4), - noisecov[idx](0, 5), - noisecov[idx](0, 6), - noisecov[idx](0, 7), - noisecov[idx](0, 8), - noisecov[idx](0, 9)); - - printf("ampls: %f %f %f %f %f %f %f %f %f %f\n", - amplitudes[idx](0), - amplitudes[idx](1), - amplitudes[idx](2), - amplitudes[idx](3), - amplitudes[idx](4), - amplitudes[idx](5), - amplitudes[idx](6), - amplitudes[idx](7), - amplitudes[idx](8), - amplitudes[idx](9)); - } -#endif - - chi2 = chi2_now; - - if (ecal::abs(deltachi2) < 1e-3) - break; - - //---- AM: TEST - //---- it was 3 lines above, now here as in the CPU version - ++iter; - } - - // the rest will be set later - 
chi2s[idx] = chi2; - } - } - - namespace v1 { - - void minimization_procedure(EventInputDataCPU const& eventInputCPU, - EventInputDataGPU& eventInputGPU, - EventOutputDataGPU& eventOutputGPU, - EventDataForScratchGPU& scratch, - ConditionsProducts const& conditions, - ConfigurationParameters const& configParameters, - cudaStream_t cudaStream) { - unsigned int totalChannels = eventInputCPU.ebDigis.size() + eventInputCPU.eeDigis.size(); - // unsigned int threads_min = conf.threads.x; - // TODO: configure from python - unsigned int threads_min = configParameters.kernelMinimizeThreads[0]; - unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min; - kernel_minimize<<>>(scratch.noisecov, - scratch.pulse_covariances, - scratch.activeBXs, - scratch.samples, - (SampleVector*)eventOutputGPU.amplitudesAll, - scratch.pulse_matrix, - eventOutputGPU.chi2, - scratch.acState, - totalChannels, - 50); - cudaCheck(cudaGetLastError()); - - // - // permute computed amplitudes - // and assign the final uncalibared energy value - // - unsigned int threadsPermute = 32 * EcalDataFrame::MAXSAMPLES; // 32 * 10 - unsigned int blocksPermute = - threadsPermute > 10 * totalChannels ? 
1 : (10 * totalChannels + threadsPermute - 1) / threadsPermute; - int bytesPermute = threadsPermute * sizeof(SampleVector::Scalar); - kernel_permute_results<<>>( - (SampleVector*)eventOutputGPU.amplitudesAll, - scratch.activeBXs, - eventOutputGPU.amplitude, - scratch.acState, - totalChannels); - cudaCheck(cudaGetLastError()); - } - - } // namespace v1 - - } // namespace multifit -} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h deleted file mode 100644 index f3c075e2a2e38..0000000000000 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernelsV1.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernelsV1 -#define RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernelsV1 - -#include "RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h" - -class EcalPulseShape; -class EcalPulseCovariance; -class EcalUncalibratedRecHit; - -namespace ecal { - namespace multifit { - - namespace v1 { - - void minimization_procedure(EventInputDataCPU const& eventInputCPU, - EventInputDataGPU& eventInputGPU, - EventOutputDataGPU& eventOutputGPU, - EventDataForScratchGPU& scratch, - ConditionsProducts const& conditions, - ConfigurationParameters const& configParameters, - cudaStream_t cudaStream); - - } - - /// - /// TODO: trivial impl for now, there must be a way to improve - /// - /// Conventions: - /// - amplitudes -> solution vector, what we are fitting for - /// - samples -> raw detector responses - /// - passive constraint - satisfied constraint - /// - active constraint - unsatisfied (yet) constraint - /// - __global__ void kernel_minimize(SampleMatrix const* noisecov, - FullSampleMatrix const* full_pulse_cov, - BXVectorType* bxs, - SampleVector const* samples, - SampleVector* amplitudes, - 
PulseMatrixType* pulse_matrix, - ::ecal::reco::StorageScalarType* chi2s, - char* acState, - int nchannels, - int max_iterations); - - } // namespace multifit -} // namespace ecal - -#endif // RecoLocalCalo_EcalRecAlgos_src_AmplitudeComputationKernelsV1 diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc index d5980d8a757aa..bcb199b133c0d 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc @@ -3,50 +3,57 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values) - : gain12Over6_(values.size()), gain6Over1_(values.size()) { - // fill in eb - auto const& barrelValues = values.barrelItems(); - for (unsigned int i = 0; i < barrelValues.size(); i++) { - gain12Over6_[i] = barrelValues[i].gain12Over6(); - gain6Over1_[i] = barrelValues[i].gain6Over1(); - } - - // fill in ee - auto const& endcapValues = values.endcapItems(); - auto const offset = barrelValues.size(); - for (unsigned int i = 0; i < endcapValues.size(); i++) { - gain12Over6_[offset + i] = endcapValues[i].gain12Over6(); - gain6Over1_[offset + i] = endcapValues[i].gain6Over1(); - } +EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values) + : gain12Over6_(values.size()) + , gain6Over1_(values.size()) +{ + // fill in eb + auto const& barrelValues = values.barrelItems(); + for (unsigned int i=0; igain12Over6_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.gain6Over1, this->gain6Over1_.size() * sizeof(float))); - // transfer - cudaCheck(cudaMemcpyAsync(product.gain12Over6, - this->gain12Over6_.data(), - this->gain12Over6_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.gain6Over1, - this->gain6Over1_.data(), - this->gain6Over1_.size() * sizeof(float), - 
cudaMemcpyHostToDevice, - cudaStream)); - }); +EcalGainRatiosGPU::Product const& EcalGainRatiosGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalGainRatiosGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.gain12Over6, + this->gain12Over6_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.gain6Over1, + this->gain6Over1_.size() * sizeof(float)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.gain12Over6, + this->gain12Over6_.data(), + this->gain12Over6_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.gain6Over1, + this->gain6Over1_.data(), + this->gain6Over1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalGainRatiosGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc index 9e3284cd9c7c8..401ad8c454737 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc @@ -3,92 +3,103 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals) - : mean_x12_(pedestals.size()), - rms_x12_(pedestals.size()), - mean_x6_(pedestals.size()), - rms_x6_(pedestals.size()), - mean_x1_(pedestals.size()), - rms_x1_(pedestals.size()) { - // fill in eb - auto const& barrelValues = pedestals.barrelItems(); - for (unsigned int i = 0; i < barrelValues.size(); i++) { - mean_x12_[i] = barrelValues[i].mean_x12; - rms_x12_[i] = barrelValues[i].rms_x12; - mean_x6_[i] = barrelValues[i].mean_x6; - rms_x6_[i] = barrelValues[i].rms_x6; - mean_x1_[i] = barrelValues[i].mean_x1; - rms_x1_[i] = barrelValues[i].rms_x1; - } 
+EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals) + : mean_x12_(pedestals.size()) + , rms_x12_(pedestals.size()) + , mean_x6_(pedestals.size()) + , rms_x6_(pedestals.size()) + , mean_x1_(pedestals.size()) + , rms_x1_(pedestals.size()) +{ - // fill in ee - auto const& endcapValues = pedestals.endcapItems(); - auto const offset = barrelValues.size(); - for (unsigned int i = 0; i < endcapValues.size(); i++) { - mean_x12_[offset + i] = endcapValues[i].mean_x12; - rms_x12_[offset + i] = endcapValues[i].rms_x12; - mean_x6_[offset + i] = endcapValues[i].mean_x6; - rms_x6_[offset + i] = endcapValues[i].rms_x6; - mean_x1_[offset + i] = endcapValues[i].mean_x1; - rms_x1_[offset + i] = endcapValues[i].rms_x1; - } + // fill in eb + auto const& barrelValues = pedestals.barrelItems(); + for (unsigned int i=0; imean_x12_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.rms_x12, this->mean_x12_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.mean_x6, this->mean_x12_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.rms_x6, this->mean_x12_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.mean_x1, this->mean_x12_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.rms_x1, this->mean_x12_.size() * sizeof(float))); +EcalPedestalsGPU::Product const& EcalPedestalsGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalPedestalsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.mean_x12, + this->mean_x12_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.rms_x12, + this->mean_x12_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.mean_x6, + this->mean_x12_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.rms_x6, + this->mean_x12_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.mean_x1, 
+ this->mean_x12_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.rms_x1, + this->mean_x12_.size() * sizeof(float)) ); - // transfer - cudaCheck(cudaMemcpyAsync(product.mean_x12, - this->mean_x12_.data(), - this->mean_x12_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.rms_x12, - this->rms_x12_.data(), - this->rms_x12_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.mean_x6, - this->mean_x6_.data(), - this->mean_x6_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.rms_x6, - this->rms_x6_.data(), - this->rms_x6_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.mean_x1, - this->mean_x1_.data(), - this->mean_x1_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.rms_x1, - this->rms_x1_.data(), - this->rms_x1_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - }); + // transfer + cudaCheck( cudaMemcpyAsync(product.mean_x12, + this->mean_x12_.data(), + this->mean_x12_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.rms_x12, + this->rms_x12_.data(), + this->rms_x12_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.mean_x6, + this->mean_x6_.data(), + this->mean_x6_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.rms_x6, + this->rms_x6_.data(), + this->rms_x6_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.mean_x1, + this->mean_x1_.data(), + this->mean_x1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.rms_x1, + this->rms_x1_.data(), + this->rms_x1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + 
cudaStream) ); + } + ); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalPedestalsGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc index bbeda99652e22..121a5b9e684f7 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc @@ -3,40 +3,48 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values) - : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} +EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values) + : valuesEB_{values.barrelItems()} + , valuesEE_{values.endcapItems()} +{} EcalPulseCovariancesGPU::Product::~Product() { - // deallocation - cudaCheck(cudaFree(values)); + // deallocation + cudaCheck( cudaFree(values) ); } -EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync( - cudaStream, [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck(cudaMalloc((void**)&product.values, - (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseCovariance))); - - // offset in terms of sizeof(EcalPulseCovariance) - uint32_t offset = this->valuesEB_.size(); - - // transfer eb - cudaCheck(cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * sizeof(EcalPulseCovariance), - cudaMemcpyHostToDevice, - cudaStream)); - - // transfer ee starting at values + offset - cudaCheck(cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * sizeof(EcalPulseCovariance), - cudaMemcpyHostToDevice, - cudaStream)); - }); - - return product; +EcalPulseCovariancesGPU::Product const& 
EcalPulseCovariancesGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.values, + (this->valuesEE_.size() + this->valuesEB_.size()) + * sizeof(EcalPulseCovariance)) ); + + // offset in terms of sizeof(EcalPulseCovariance) + uint32_t offset = this->valuesEB_.size(); + + // transfer eb + cudaCheck( cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * + sizeof(EcalPulseCovariance), + cudaMemcpyHostToDevice, + cudaStream) ); + + // transfer ee starting at values + offset + cudaCheck( cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * + sizeof(EcalPulseCovariance), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; } TYPELOOKUP_DATA_REG(EcalPulseCovariancesGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc index aee122a01627d..8e8f00795d225 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc @@ -3,40 +3,48 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values) - : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} +EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values) + : valuesEB_{values.barrelItems()} + , valuesEE_{values.endcapItems()} +{} EcalPulseShapesGPU::Product::~Product() { - // deallocation - cudaCheck(cudaFree(values)); + // deallocation + cudaCheck( cudaFree(values) ); } -EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync( - cudaStream, 
[this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck(cudaMalloc((void**)&product.values, - (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseShape))); - - // offset in terms of sizeof(EcalPulseShape) - plain c array - uint32_t offset = this->valuesEB_.size(); - - // transfer eb - cudaCheck(cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * sizeof(EcalPulseShape), - cudaMemcpyHostToDevice, - cudaStream)); - - // transfer ee starting at values + offset - cudaCheck(cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * sizeof(EcalPulseShape), - cudaMemcpyHostToDevice, - cudaStream)); - }); - - return product; +EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.values, + (this->valuesEE_.size() + this->valuesEB_.size()) + * sizeof(EcalPulseShape)) ); + + // offset in terms of sizeof(EcalPulseShape) - plain c array + uint32_t offset = this->valuesEB_.size(); + + // transfer eb + cudaCheck( cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * + sizeof(EcalPulseShape), + cudaMemcpyHostToDevice, + cudaStream) ); + + // transfer ee starting at values + offset + cudaCheck( cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * + sizeof(EcalPulseShape), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; } TYPELOOKUP_DATA_REG(EcalPulseShapesGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc index 2a98067f51d9e..7294c759aaa0d 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc +++ 
b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc @@ -3,74 +3,91 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU(EcalSamplesCorrelation const& values) - : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation}, - EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation}, - EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation}, - EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation}, - EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation}, - EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation} {} +EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU( + EcalSamplesCorrelation const& values) + : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation} + , EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation} + , EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation} + , EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation} + , EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation} + , EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation} +{} EcalSamplesCorrelationGPU::Product::~Product() { - // deallocation - cudaCheck(cudaFree(EBG12SamplesCorrelation)); - cudaCheck(cudaFree(EBG6SamplesCorrelation)); - cudaCheck(cudaFree(EBG1SamplesCorrelation)); - cudaCheck(cudaFree(EEG12SamplesCorrelation)); - cudaCheck(cudaFree(EEG6SamplesCorrelation)); - cudaCheck(cudaFree(EEG1SamplesCorrelation)); + // deallocation + cudaCheck( cudaFree(EBG12SamplesCorrelation) ); + cudaCheck( cudaFree(EBG6SamplesCorrelation) ); + cudaCheck( cudaFree(EBG1SamplesCorrelation) ); + cudaCheck( cudaFree(EEG12SamplesCorrelation) ); + cudaCheck( cudaFree(EEG6SamplesCorrelation) ); + cudaCheck( cudaFree(EEG1SamplesCorrelation) ); } -EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync( - cudaStream, [this](EcalSamplesCorrelationGPU::Product& 
product, cudaStream_t cudaStream) { - // malloc - cudaCheck(cudaMalloc((void**)&product.EBG12SamplesCorrelation, - this->EBG12SamplesCorrelation_.size() * sizeof(double))); - cudaCheck( - cudaMalloc((void**)&product.EBG6SamplesCorrelation, this->EBG6SamplesCorrelation_.size() * sizeof(double))); - cudaCheck( - cudaMalloc((void**)&product.EBG1SamplesCorrelation, this->EBG1SamplesCorrelation_.size() * sizeof(double))); - cudaCheck(cudaMalloc((void**)&product.EEG12SamplesCorrelation, - this->EEG12SamplesCorrelation_.size() * sizeof(double))); - cudaCheck( - cudaMalloc((void**)&product.EEG6SamplesCorrelation, this->EEG6SamplesCorrelation_.size() * sizeof(double))); - cudaCheck( - cudaMalloc((void**)&product.EEG1SamplesCorrelation, this->EEG1SamplesCorrelation_.size() * sizeof(double))); - // transfer - cudaCheck(cudaMemcpyAsync(product.EBG12SamplesCorrelation, - this->EBG12SamplesCorrelation_.data(), - this->EBG12SamplesCorrelation_.size() * sizeof(double), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.EBG6SamplesCorrelation, - this->EBG6SamplesCorrelation_.data(), - this->EBG6SamplesCorrelation_.size() * sizeof(double), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.EBG1SamplesCorrelation, - this->EBG1SamplesCorrelation_.data(), - this->EBG1SamplesCorrelation_.size() * sizeof(double), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.EEG12SamplesCorrelation, - this->EEG12SamplesCorrelation_.data(), - this->EEG12SamplesCorrelation_.size() * sizeof(double), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.EEG6SamplesCorrelation, - this->EEG6SamplesCorrelation_.data(), - this->EEG6SamplesCorrelation_.size() * sizeof(double), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.EEG1SamplesCorrelation, - this->EEG1SamplesCorrelation_.data(), - this->EEG1SamplesCorrelation_.size() * sizeof(double), - cudaMemcpyHostToDevice, - 
cudaStream)); - }); +EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.EBG12SamplesCorrelation, + this->EBG12SamplesCorrelation_.size() * + sizeof(double)) ); + cudaCheck( cudaMalloc((void**)&product.EBG6SamplesCorrelation, + this->EBG6SamplesCorrelation_.size() * + sizeof(double)) ); + cudaCheck( cudaMalloc((void**)&product.EBG1SamplesCorrelation, + this->EBG1SamplesCorrelation_.size() * + sizeof(double)) ); + cudaCheck( cudaMalloc((void**)&product.EEG12SamplesCorrelation, + this->EEG12SamplesCorrelation_.size() * + sizeof(double)) ); + cudaCheck( cudaMalloc((void**)&product.EEG6SamplesCorrelation, + this->EEG6SamplesCorrelation_.size() * + sizeof(double)) ); + cudaCheck( cudaMalloc((void**)&product.EEG1SamplesCorrelation, + this->EEG1SamplesCorrelation_.size() * + sizeof(double)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.EBG12SamplesCorrelation, + this->EBG12SamplesCorrelation_.data(), + this->EBG12SamplesCorrelation_.size() * + sizeof(double), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EBG6SamplesCorrelation, + this->EBG6SamplesCorrelation_.data(), + this->EBG6SamplesCorrelation_.size() * + sizeof(double), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EBG1SamplesCorrelation, + this->EBG1SamplesCorrelation_.data(), + this->EBG1SamplesCorrelation_.size() * + sizeof(double), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EEG12SamplesCorrelation, + this->EEG12SamplesCorrelation_.data(), + this->EEG12SamplesCorrelation_.size() * + sizeof(double), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EEG6SamplesCorrelation, + 
this->EEG6SamplesCorrelation_.data(), + this->EEG6SamplesCorrelation_.size() * + sizeof(double), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EEG1SamplesCorrelation, + this->EEG1SamplesCorrelation_.data(), + this->EEG1SamplesCorrelation_.size() * + sizeof(double), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalSamplesCorrelationGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc index 9ab0a6302a9c4..277661b030c68 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc @@ -3,59 +3,76 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const& values) - : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins}, - EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins}, - EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins}, - EETimeCorrShiftBins_{values.EETimeCorrShiftBins} {} +EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU( + EcalTimeBiasCorrections const& values) + : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins} + , EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins} + , EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins} + , EETimeCorrShiftBins_{values.EETimeCorrShiftBins} +{} EcalTimeBiasCorrectionsGPU::Product::~Product() { - // deallocation - cudaCheck(cudaFree(EBTimeCorrAmplitudeBins)); - cudaCheck(cudaFree(EBTimeCorrShiftBins)); - cudaCheck(cudaFree(EETimeCorrAmplitudeBins)); - cudaCheck(cudaFree(EETimeCorrShiftBins)); + // deallocation + cudaCheck( cudaFree(EBTimeCorrAmplitudeBins) ); + cudaCheck( cudaFree(EBTimeCorrShiftBins) ); + cudaCheck( cudaFree(EETimeCorrAmplitudeBins) ); + cudaCheck( cudaFree(EETimeCorrShiftBins) ); 
} -EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync( - cudaStream, [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) { - // to get the size of vectors later on - // should be removed and host conditions' objects used directly - product.EBTimeCorrAmplitudeBinsSize = this->EBTimeCorrAmplitudeBins_.size(); - product.EETimeCorrAmplitudeBinsSize = this->EETimeCorrAmplitudeBins_.size(); +EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) { + // to get the size of vectors later on + // should be removed and host conditions' objects used directly + product.EBTimeCorrAmplitudeBinsSize = + this->EBTimeCorrAmplitudeBins_.size(); + product.EETimeCorrAmplitudeBinsSize = + this->EETimeCorrAmplitudeBins_.size(); - // malloc - cudaCheck(cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins, - this->EBTimeCorrAmplitudeBins_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.EBTimeCorrShiftBins, this->EBTimeCorrShiftBins_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.EETimeCorrAmplitudeBins, - this->EETimeCorrAmplitudeBins_.size() * sizeof(float))); - cudaCheck(cudaMalloc((void**)&product.EETimeCorrShiftBins, this->EETimeCorrShiftBins_.size() * sizeof(float))); - // transfer - cudaCheck(cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins, - this->EBTimeCorrAmplitudeBins_.data(), - this->EBTimeCorrAmplitudeBins_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.EBTimeCorrShiftBins, - this->EBTimeCorrShiftBins_.data(), - this->EBTimeCorrShiftBins_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - 
cudaCheck(cudaMemcpyAsync(product.EETimeCorrAmplitudeBins, - this->EETimeCorrAmplitudeBins_.data(), - this->EETimeCorrAmplitudeBins_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.EETimeCorrShiftBins, - this->EETimeCorrShiftBins_.data(), - this->EETimeCorrShiftBins_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - }); + // malloc + cudaCheck( cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins, + this->EBTimeCorrAmplitudeBins_.size() * + sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.EBTimeCorrShiftBins, + this->EBTimeCorrShiftBins_.size() * + sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.EETimeCorrAmplitudeBins, + this->EETimeCorrAmplitudeBins_.size() * + sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.EETimeCorrShiftBins, + this->EETimeCorrShiftBins_.size() * + sizeof(float)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins, + this->EBTimeCorrAmplitudeBins_.data(), + this->EBTimeCorrAmplitudeBins_.size() * + sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EBTimeCorrShiftBins, + this->EBTimeCorrShiftBins_.data(), + this->EBTimeCorrShiftBins_.size() * + sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EETimeCorrAmplitudeBins, + this->EETimeCorrAmplitudeBins_.data(), + this->EETimeCorrAmplitudeBins_.size() * + sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.EETimeCorrShiftBins, + this->EETimeCorrShiftBins_.data(), + this->EETimeCorrShiftBins_.size() * + sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalTimeBiasCorrectionsGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc index d724a33f1d4e1..1da155b2539f2 100644 --- 
a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc @@ -3,38 +3,47 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const& values) - : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} +EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU( + EcalTimeCalibConstants const& values) + : valuesEB_{values.barrelItems()} + , valuesEE_{values.endcapItems()} +{} EcalTimeCalibConstantsGPU::Product::~Product() { - // deallocation - cudaCheck(cudaFree(values)); + // deallocation + cudaCheck( cudaFree(values) ); } -EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync( - cudaStream, [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( - cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float))); - - // offset in floats, not bytes - auto const offset = this->valuesEB_.size(); - - // transfer - cudaCheck(cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream)); - }); - - return product; +EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.values, + (this->valuesEB_.size() + this->valuesEE_.size()) * + sizeof(float)) ); + + // 
offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck( cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * + sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * + sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; } TYPELOOKUP_DATA_REG(EcalTimeCalibConstantsGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu index 79b70716a675b..b67bb74235e4a 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu @@ -18,143 +18,114 @@ #include "cuda.h" #include "AmplitudeComputationCommonKernels.h" -#include "AmplitudeComputationKernelsV1.h" +#include "AmplitudeComputationKernels.h" #include "TimeComputationKernels.h" //#define DEBUG //#define ECAL_RECO_CUDA_DEBUG -namespace ecal { - namespace multifit { +namespace ecal { namespace multifit { + +void entryPoint( + EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + cudaStream_t cudaStream) { + using digis_type = std::vector; + using dids_type = std::vector; + // accodring to the cpu setup //----> hardcoded + bool const gainSwitchUseMaxSampleEB = true; + // accodring to the cpu setup //----> hardcoded + bool const gainSwitchUseMaxSampleEE = false; + + uint32_t const offsetForHashes = conditions.offsetForHashes; + uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; + unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + + eventInputGPU.eeDigis.ndigis; + + // + // 1d preparation kernel + // + unsigned int nchannels_per_block = 
32; + unsigned int threads_1d = 10 * nchannels_per_block; + unsigned int blocks_1d = threads_1d > 10*totalChannels + ? 1 : (totalChannels*10 + threads_1d - 1) / threads_1d; + int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES * ( + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + + sizeof(bool) + ); + kernel_prep_1d_and_initialize<<>>( + conditions.pulseShapes.values, + eventInputGPU.ebDigis.data, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.data, + eventInputGPU.eeDigis.ids, + scratch.samples, + (SampleVector*)eventOutputGPU.amplitudesAll, + scratch.gainsNoise, + conditions.pedestals.mean_x1, + conditions.pedestals.mean_x12, + conditions.pedestals.rms_x12, + conditions.pedestals.mean_x6, + conditions.gainRatios.gain6Over1, + conditions.gainRatios.gain12Over6, + scratch.hasSwitchToGain6, + scratch.hasSwitchToGain1, + scratch.isSaturated, + eventOutputGPU.amplitude, + eventOutputGPU.chi2, + eventOutputGPU.pedestal, + eventOutputGPU.did, + eventOutputGPU.flags, + scratch.acState, + scratch.activeBXs, + offsetForHashes, + offsetForInputs, + gainSwitchUseMaxSampleEB, + gainSwitchUseMaxSampleEE, + totalChannels); + cudaCheck(cudaGetLastError()); - void entryPoint(EventInputDataCPU const& eventInputCPU, - EventInputDataGPU& eventInputGPU, - EventOutputDataGPU& eventOutputGPU, - EventDataForScratchGPU& scratch, - ConditionsProducts const& conditions, - ConfigurationParameters const& configParameters, - cudaStream_t cudaStream) { - using digis_type = std::vector; - using dids_type = std::vector; - // accodring to the cpu setup //----> hardcoded - bool const gainSwitchUseMaxSampleEB = true; - // accodring to the cpu setup //----> hardcoded - bool const gainSwitchUseMaxSampleEE = false; + // + // 2d preparation kernel + // + int blocks_2d = totalChannels; + dim3 threads_2d{10, 10}; + kernel_prep_2d<<>>( + scratch.gainsNoise, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + conditions.pedestals.rms_x12, + 
conditions.pedestals.rms_x6, + conditions.pedestals.rms_x1, + conditions.gainRatios.gain12Over6, + conditions.gainRatios.gain6Over1, + conditions.samplesCorrelation.EBG12SamplesCorrelation, + conditions.samplesCorrelation.EBG6SamplesCorrelation, + conditions.samplesCorrelation.EBG1SamplesCorrelation, + conditions.samplesCorrelation.EEG12SamplesCorrelation, + conditions.samplesCorrelation.EEG6SamplesCorrelation, + conditions.samplesCorrelation.EEG1SamplesCorrelation, + scratch.noisecov, + scratch.pulse_matrix, + conditions.pulseShapes.values, + scratch.hasSwitchToGain6, + scratch.hasSwitchToGain1, + scratch.isSaturated, + offsetForHashes, + offsetForInputs); + cudaCheck(cudaGetLastError()); + + // run minimization kernels + v1::minimization_procedure( + eventInputGPU, eventOutputGPU, + scratch, conditions, configParameters, cudaStream); - uint32_t const offsetForHashes = conditions.offsetForHashes; - unsigned int totalChannels = eventInputCPU.ebDigis.size() + eventInputCPU.eeDigis.size(); - - // temporary for recording - /*cudaEvent_t start_event; - cudaEvent_t end_event; - cudaCheck( cudaEventCreate(&start_event) ); - cudaCheck( cudaEventCreate(&end_event) ); - - cudaCheck (cudaEventRecord(start_event, 0) ); - */ - - // - // in what follows we copy eb then ee. 
- // offset by size - // - - // - // copy event data: digis + ids, not really async as vectors have default - // allocators - // - cudaCheck(cudaMemcpyAsync(eventInputGPU.digis, - eventInputCPU.ebDigis.data().data(), - eventInputCPU.ebDigis.data().size() * sizeof(digis_type::value_type), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eventInputGPU.digis + eventInputCPU.ebDigis.data().size(), - eventInputCPU.eeDigis.data().data(), - eventInputCPU.eeDigis.data().size() * sizeof(digis_type::value_type), - cudaMemcpyHostToDevice, - cudaStream)); - - cudaCheck(cudaMemcpyAsync(eventInputGPU.ids, - eventInputCPU.ebDigis.ids().data(), - eventInputCPU.ebDigis.ids().size() * sizeof(dids_type::value_type), - cudaMemcpyHostToDevice, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eventInputGPU.ids + eventInputCPU.ebDigis.ids().size(), - eventInputCPU.eeDigis.ids().data(), - eventInputCPU.eeDigis.ids().size() * sizeof(dids_type::value_type), - cudaMemcpyHostToDevice, - cudaStream)); - - // - // 1d preparation kernel - // - unsigned int nchannels_per_block = 32; - unsigned int threads_1d = 10 * nchannels_per_block; - unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 
1 : (totalChannels * 10 + threads_1d - 1) / threads_1d; - int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES * - (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool)); - kernel_prep_1d_and_initialize<<>>( - conditions.pulseShapes.values, - eventInputGPU.digis, - eventInputGPU.ids, - scratch.samples, - (SampleVector*)eventOutputGPU.amplitudesAll, - scratch.gainsNoise, - conditions.pedestals.mean_x1, - conditions.pedestals.mean_x12, - conditions.pedestals.rms_x12, - conditions.pedestals.mean_x6, - conditions.gainRatios.gain6Over1, - conditions.gainRatios.gain12Over6, - scratch.hasSwitchToGain6, - scratch.hasSwitchToGain1, - scratch.isSaturated, - eventOutputGPU.amplitude, - eventOutputGPU.chi2, - eventOutputGPU.pedestal, - eventOutputGPU.flags, - scratch.acState, - scratch.activeBXs, - offsetForHashes, - gainSwitchUseMaxSampleEB, - gainSwitchUseMaxSampleEE, - totalChannels); - cudaCheck(cudaGetLastError()); - - // - // 2d preparation kernel - // - int blocks_2d = totalChannels; - dim3 threads_2d{10, 10}; - kernel_prep_2d<<>>(conditions.pulseCovariances.values, - scratch.pulse_covariances, - scratch.gainsNoise, - eventInputGPU.ids, - conditions.pedestals.rms_x12, - conditions.pedestals.rms_x6, - conditions.pedestals.rms_x1, - conditions.gainRatios.gain12Over6, - conditions.gainRatios.gain6Over1, - conditions.samplesCorrelation.EBG12SamplesCorrelation, - conditions.samplesCorrelation.EBG6SamplesCorrelation, - conditions.samplesCorrelation.EBG1SamplesCorrelation, - conditions.samplesCorrelation.EEG12SamplesCorrelation, - conditions.samplesCorrelation.EEG6SamplesCorrelation, - conditions.samplesCorrelation.EEG1SamplesCorrelation, - scratch.noisecov, - scratch.pulse_matrix, - conditions.pulseShapes.values, - scratch.hasSwitchToGain6, - scratch.hasSwitchToGain1, - scratch.isSaturated, - offsetForHashes); - cudaCheck(cudaGetLastError()); - - // run minimization kernels - v1::minimization_procedure( - eventInputCPU, 
eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream); - - if (configParameters.shouldRunTimingComputation) { + if (configParameters.shouldRunTimingComputation) { + // // TODO: this guy can run concurrently with other kernels, // there is no dependence on the order of execution @@ -162,9 +133,12 @@ namespace ecal { unsigned int threads_time_init = threads_1d; unsigned int blocks_time_init = blocks_1d; int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar); - kernel_time_computation_init<<>>( - eventInputGPU.digis, - eventInputGPU.ids, + kernel_time_computation_init<<>>( + eventInputGPU.ebDigis.data, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.data, + eventInputGPU.eeDigis.ids, conditions.pedestals.rms_x12, conditions.pedestals.rms_x6, conditions.pedestals.rms_x1, @@ -179,57 +153,69 @@ namespace ecal { scratch.useless_sample_values, scratch.pedestal_nums, offsetForHashes, + offsetForInputs, conditions.sampleMask.getEcalSampleMaskRecordEB(), conditions.sampleMask.getEcalSampleMaskRecordEE(), - totalChannels); + totalChannels + ); cudaCheck(cudaGetLastError()); - // - // TODO: small kernel only for EB. It needs to be checked if + // + // TODO: small kernel only for EB. It needs to be checked if /// fusing such small kernels is beneficial in here // // we are running only over EB digis // therefore we need to create threads/blocks only for that unsigned int const threadsFixMGPA = threads_1d; - unsigned int const blocksFixMGPA = - threadsFixMGPA > 10 * eventInputCPU.ebDigis.size() + unsigned int const blocksFixMGPA = + threadsFixMGPA > 10 * eventInputGPU.ebDigis.ndigis ? 
1 - : (10 * eventInputCPU.ebDigis.size() + threadsFixMGPA - 1) / threadsFixMGPA; - kernel_time_compute_fixMGPAslew<<>>( - eventInputGPU.digis, + : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) + / threadsFixMGPA; + kernel_time_compute_fixMGPAslew<<>>( + eventInputGPU.ebDigis.data, + eventInputGPU.eeDigis.data, scratch.sample_values, scratch.sample_value_errors, scratch.useless_sample_values, conditions.sampleMask.getEcalSampleMaskRecordEB(), - totalChannels); + totalChannels, + offsetForInputs + ); cudaCheck(cudaGetLastError()); // + // // - // - int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar); + int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * + 4 * sizeof(SampleVector::Scalar); auto const threads_nullhypot = threads_1d; auto const blocks_nullhypot = blocks_1d; - kernel_time_compute_nullhypot<<>>( + kernel_time_compute_nullhypot<<>>( scratch.sample_values, scratch.sample_value_errors, scratch.useless_sample_values, scratch.chi2sNullHypot, scratch.sum0sNullHypot, scratch.sumAAsNullHypot, - totalChannels); + totalChannels + ); cudaCheck(cudaGetLastError()); unsigned int nchannels_per_block_makeratio = 10; unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio; unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels - ? 1 - : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio; + ? 
1 + : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio; int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar); - kernel_time_compute_makeratio<<>>( + kernel_time_compute_makeratio<<>>( scratch.sample_values, scratch.sample_value_errors, - eventInputGPU.ids, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, scratch.useless_sample_values, scratch.pedestal_nums, configParameters.amplitudeFitParametersEB, @@ -243,13 +229,15 @@ namespace ecal { scratch.accTimeMax, scratch.accTimeWgt, scratch.tcState, - configParameters.timeFitParametersSizeEB, + configParameters.timeFitParametersSizeEB, configParameters.timeFitParametersSizeEE, configParameters.timeFitLimitsFirstEB, configParameters.timeFitLimitsFirstEE, configParameters.timeFitLimitsSecondEB, configParameters.timeFitLimitsSecondEE, - totalChannels); + totalChannels, + offsetForInputs + ); cudaCheck(cudaGetLastError()); // @@ -257,41 +245,48 @@ namespace ecal { // auto const threads_findamplchi2 = threads_1d; auto const blocks_findamplchi2 = blocks_1d; - int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar); + int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * + sizeof(SampleVector::Scalar); kernel_time_compute_findamplchi2_and_finish<<>>(scratch.sample_values, - scratch.sample_value_errors, - eventInputGPU.ids, - scratch.useless_sample_values, - scratch.tMaxAlphaBetas, - scratch.tMaxErrorAlphaBetas, - scratch.accTimeMax, - scratch.accTimeWgt, - configParameters.amplitudeFitParametersEB, - configParameters.amplitudeFitParametersEE, - scratch.sumAAsNullHypot, - scratch.sum0sNullHypot, - scratch.chi2sNullHypot, - scratch.tcState, - scratch.ampMaxAlphaBeta, - scratch.ampMaxError, - scratch.timeMax, - scratch.timeError, - totalChannels); + threads_findamplchi2, + sharedBytesFindAmplChi2, cudaStream>>>( + scratch.sample_values, + scratch.sample_value_errors, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + 
scratch.useless_sample_values, + scratch.tMaxAlphaBetas, + scratch.tMaxErrorAlphaBetas, + scratch.accTimeMax, + scratch.accTimeWgt, + configParameters.amplitudeFitParametersEB, + configParameters.amplitudeFitParametersEE, + scratch.sumAAsNullHypot, + scratch.sum0sNullHypot, + scratch.chi2sNullHypot, + scratch.tcState, + scratch.ampMaxAlphaBeta, + scratch.ampMaxError, + scratch.timeMax, + scratch.timeError, + totalChannels, + offsetForInputs + ); cudaCheck(cudaGetLastError()); - + // // // auto const threads_timecorr = 32; - auto const blocks_timecorr = - threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr; - kernel_time_correction_and_finalize<<>>( + auto const blocks_timecorr = threads_timecorr > totalChannels + ? 1 : (totalChannels + threads_timecorr-1) / threads_timecorr; + kernel_time_correction_and_finalize<<>>( eventOutputGPU.amplitude, - eventInputGPU.digis, - eventInputGPU.ids, + eventInputGPU.ebDigis.data, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.data, + eventInputGPU.eeDigis.ids, conditions.timeBiasCorrections.EBTimeCorrAmplitudeBins, conditions.timeBiasCorrections.EETimeCorrAmplitudeBins, conditions.timeBiasCorrections.EBTimeCorrShiftBins, @@ -322,18 +317,19 @@ namespace ecal { configParameters.outOfTimeThreshG61mEB, configParameters.outOfTimeThreshG61mEE, offsetForHashes, - totalChannels); + offsetForInputs, + totalChannels + ); cudaCheck(cudaGetLastError()); - } + } - /* + /* cudaEventRecord(end_event, 0); cudaEventSynchronize(end_event); float ms; cudaEventElapsedTime(&ms, start_event, end_event); std::cout << "elapsed time = " << ms << std::endl; */ - } +} - } // namespace multifit -} // namespace ecal +}} diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu index b85f002464f65..6b60f4fc35560 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu @@ -3,74 +3,88 @@ #include 
"DataFormats/EcalDetId/interface/EBDetId.h" #include "DataFormats/EcalDetId/interface/EEDetId.h" -namespace ecal { - namespace multifit { - - namespace internal { - - namespace barrel { - - __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; } - - __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; } - - __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; } - - } // namespace barrel - - } // namespace internal - - __device__ uint32_t hashedIndexEB(uint32_t id) { - using namespace internal::barrel; - return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1; - } - - namespace internal { - - namespace endcap { - - __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; } - - __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; } - - __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; } - - // these constants come from EE Det Id - __constant__ const unsigned short kxf[] = { - 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, - 51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, - 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, - 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, - 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, 1, 59, 1, 58, 4, 56, 4, 51, 4, - 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, - 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21, - 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; - - __constant__ const unsigned short kdi[] = { - 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270, - 300, 330, 360, 390, 420, 
450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821, - 858, 895, 932, 969, 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, - 1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, - 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, - 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701, - 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378, - 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149, - 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908, - 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, - 6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, - 7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314}; - - } // namespace endcap - - } // namespace internal - - __device__ uint32_t hashedIndexEE(uint32_t id) { - using namespace internal::endcap; - - const uint32_t jx(ix(id)); - const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50); - return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]); - } - - } // namespace multifit -} // namespace ecal +namespace ecal { namespace multifit { + +namespace internal { + +namespace barrel { + +__device__ +__forceinline__ +bool positiveZ(uint32_t id) { return id & 0x10000; } + +__device__ +__forceinline__ +uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; } + +__device__ +__forceinline__ +uint32_t iphi(uint32_t id) { return id & 0x1FF; } + +} + +} + +__device__ +uint32_t hashedIndexEB(uint32_t id) { + using namespace internal::barrel; + return (EBDetId::MAX_IETA + + (positiveZ(id) ? 
ietaAbs(id)-1 : -ietaAbs(id)) ) * EBDetId::MAX_IPHI + + iphi(id)-1; +} + +namespace internal { + +namespace endcap { + +__device__ +__forceinline__ +uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; } + +__device__ +__forceinline__ +uint32_t iy(uint32_t id) { return id & 0x7F; } + +__device__ +__forceinline__ +bool positiveZ(uint32_t id) { return id & 0x4000; } + +// these constants come from EE Det Id +__constant__ +const unsigned short kxf[] = { + 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 16, 51, 16, + 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 6, 51, 6, 51, 6, 51, 6, 51, + 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, + 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, + 1, 59, 1, 58, 4, 56, 4, 51, 4, 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, + 51, 6, 51, 6, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, + 21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; + +__constant__ +const unsigned short kdi[] = { + 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270, 300, 330, + 360, 390, 420, 450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821, 858, 895, 932, 969, + 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, 1545, 1590, 1635, 1680, 1725, 1770, + 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, + 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, + 3467, 3506, 3545, 3584, 3623, 3662, 3701, 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, + 4212, 4253, 4294, 4336, 4378, 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 
4877, 4924, 4969, 5014, + 5059, 5104, 5149, 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, + 5908, 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, 6614, + 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, 7129, 7154, 7179, + 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314}; + +} + +} + +__device__ +uint32_t hashedIndexEE(uint32_t id) { + using namespace internal::endcap; + + const uint32_t jx ( ix(id) ) ; + const uint32_t jd ( 2*( iy(id) - 1 ) + ( jx - 1 )/50 ) ; + return ( ( positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd] ) ; +} + +}} diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h index b148ab91915d1..888bdc103b0d4 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h +++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h @@ -1,14 +1,464 @@ #ifndef RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h #define RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h -namespace ecal { - namespace multifit { +#include - __device__ uint32_t hashedIndexEB(uint32_t id); +namespace ecal { namespace multifit { - __device__ uint32_t hashedIndexEE(uint32_t id); +template +using ColMajorMatrix = Eigen::Matrix; - } // namespace multifit -} // namespace ecal +template +using RowMajorMatrix = Eigen::Matrix; -#endif // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h +template +using ColumnVector = Eigen::Matrix; + +template +using RowVector = Eigen::Matrix; + +__device__ +uint32_t hashedIndexEB(uint32_t id); + +__device__ +uint32_t hashedIndexEE(uint32_t id); + + +// FIXME: provide specialization for Row Major layout +template +< + typename T, + int Stride, + int Order = Eigen::ColMajor +> +struct MapSymM { + using type = T; + using base_type = typename std::remove_const::type; + + static constexpr int total = Stride * (Stride + 1) / 2; + static constexpr int 
stride = Stride; + T* data; + + __forceinline__ __device__ + MapSymM(T *data) : data{data} {} + + __forceinline__ __device__ + T const& operator()(int const row, int const col) const { + auto const tmp = (Stride - col) * (Stride - col + 1) / 2; + auto const index = total - tmp + row - col; + return data[index]; + } + + template + __forceinline__ __device__ + typename std::enable_if::value, base_type>::type& + operator()(int const row, int const col) { + auto const tmp = (Stride - col) * (Stride - col + 1) / 2; + auto const index = total - tmp + row - col; + return data[index]; + } +}; + +// FIXME: either use/modify/improve eigen or make this more generic +// this is a map for a pulse matrix to building a 2d matrix for each channel +// and hide indexing +template +< + typename T +> +struct MapMForPM { + using type = T; + using base_type = typename std::remove_cv::type; + + type* data; + __forceinline__ __device__ + MapMForPM(type* data) : data{data} {} + + __forceinline__ __device__ + base_type operator()(int const row, int const col) const { + auto const index = 2 - col + row; + return index>=0 ? 
data[index] : 0; + } +}; + +// simple/trivial cholesky decomposition impl +template +__forceinline__ __device__ +void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) { + auto const sqrtm_0_0 = std::sqrt(M(0, 0)); + L(0, 0) = sqrtm_0_0; + using T = typename MatrixType1::base_type; + + #pragma unroll + for (int i=1; i +__forceinline__ __device__ +void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) { + auto const sqrtm_0_0 = std::sqrt(M(0, 0)); + L(0, 0) = sqrtm_0_0; + using T = typename MatrixType1::base_type; + + for (int i=1; i +__forceinline__ __device__ +void compute_decomposition_forwardsubst_with_offsets( + MatrixType1& L, MatrixType2 const& M, + float b[MatrixType1::stride], + VectorType const& Atb, + int const N, + ColumnVector const& pulseOffsets) { + auto const real_0 = pulseOffsets(0); + auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0)); + L(0, 0) = sqrtm_0_0; + using T = typename MatrixType1::base_type; + b[0] = Atb(real_0) / sqrtm_0_0; + + for (int i=1; i +__forceinline__ __device__ +void update_decomposition_forwardsubst_with_offsets( + MatrixType1& L, MatrixType2 const& M, + float b[MatrixType1::stride], + VectorType const& Atb, + int const N, + ColumnVector const& pulseOffsets) { + using T = typename MatrixType1::base_type; + auto const i = N-1; + auto const i_real = pulseOffsets(i); + T sumsq {0}; + T total = 0; + for (int j=0; j +__device__ +void solve_forward_subst_matrix( + MatrixType1 &A, + MatrixType2 const& pulseMatrixView, + MatrixType3 const& matrixL) { + // FIXME: this assumes pulses are on columns and samples on rows + constexpr auto NPULSES = MatrixType2::ColsAtCompileTime; + constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime; + + #pragma unroll + for (int icol=0; icol +__device__ +void solve_forward_subst_vector( + float reg_b[MatrixType1::RowsAtCompileTime], + MatrixType1 inputAmplitudesView, + MatrixType2 matrixL) { + constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime; + + 
float reg_b_tmp[NSAMPLES]; + float reg_L[NSAMPLES]; + + // preload a column and load column 0 of cholesky + #pragma unroll + for (int i=0; i +__device__ +void fnnls( + MatrixType const& AtA, + VectorType const& Atb, + VectorType& solution, + int& npassive, + ColumnVector &pulseOffsets, + MapSymM &matrixL, + double const eps, + int const maxIterations) { + // constants + constexpr auto NPULSES = VectorType::RowsAtCompileTime; + + // to keep track of where to terminate if converged + Eigen::Index w_max_idx_prev = 0; + float w_max_prev = 0; + auto eps_to_use = eps; + bool recompute = false; + + // used throughout + VectorType s; + float reg_b[NPULSES]; + //float matrixLStorage[MapSymM::total]; + //MapSymM matrixL{matrixLStorage}; + + int iter = 0; + while (true) { + if (iter > 0 || npassive==0) { + auto const nactive = NPULSES - npassive; + // exit if there are no more pulses to constrain + if (nactive==0) break; + + // compute the gradient + //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive); + Eigen::Index w_max_idx; + float w_max = -std::numeric_limits::max(); + for (int icol=npassive; icol icol_real + ? 
AtA(counter, icol_real) * solution(counter) + : AtA(icol_real, counter) * solution(counter); + + auto const w = atb - sum; + if (w > w_max) { + w_max = w; + w_max_idx = icol - npassive; + } + } + + // check for convergence + if (w_max= maxIterations) break; + + w_max_prev = w_max; + w_max_idx_prev = w_max_idx; + + // move index to the right part of the vector + w_max_idx += npassive; + + Eigen::numext::swap(pulseOffsets.coeffRef(npassive), + pulseOffsets.coeffRef(w_max_idx)); + ++npassive; + } + + // inner loop + while (true) { + if (npassive == 0) break; + + //s.head(npassive) + //auto const& matrixL = + // AtA.topLeftCorner(npassive, npassive) + // .llt().matrixL(); + //.solve(Atb.head(npassive)); + if (recompute || iter==0) + compute_decomposition_forwardsubst_with_offsets( + matrixL, AtA, reg_b, Atb, + npassive, pulseOffsets); + else + update_decomposition_forwardsubst_with_offsets( + matrixL, AtA, reg_b, Atb, + npassive, pulseOffsets); + + // run backward substituion + s(npassive-1) = reg_b[npassive-1] / matrixL(npassive-1, npassive-1); + for (int i=npassive-2; i>=0; --i) { + float total=0; + for (int j=i+1; j have to recompute the whole decomp + recompute = true; + + auto alpha = std::numeric_limits::max(); + Eigen::Index alpha_idx = 0, alpha_idx_real = 0; + for (int i=0; i(s_sum0 + nchannels_per_block * nsamples); - SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples; - SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples; + SampleVector::Scalar* s_sum1 = reinterpret_cast( + s_sum0 + nchannels_per_block*nsamples); + SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block*nsamples; + SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block*nsamples; // TODO make sure no div by 0 - auto const inv_error = - useless_sample_values[tx] ? 0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]); + auto const inv_error = useless_sample_values[tx] + ? 
0.0 + : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]); auto const sample_value = sample_values[tx]; s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1; s_sum1[ltx] = inv_error; @@ -64,190 +68,215 @@ namespace ecal { __syncthreads(); // 5 threads for [0, 4] samples - if (sample < 5) { - s_sum0[ltx] += s_sum0[ltx + 5]; - s_sum1[ltx] += s_sum1[ltx + 5]; - s_sumA[ltx] += s_sumA[ltx + 5]; - s_sumAA[ltx] += s_sumAA[ltx + 5]; + if (sample<5) { + s_sum0[ltx] += s_sum0[ltx+5]; + s_sum1[ltx] += s_sum1[ltx+5]; + s_sumA[ltx] += s_sumA[ltx+5]; + s_sumAA[ltx] += s_sumAA[ltx+5]; } __syncthreads(); - if (sample < 2) { - // note double counting of sample 3 - s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3]; - s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3]; - s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3]; - s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3]; + if (sample<2) { + // note double counting of sample 3 + s_sum0[ltx] += s_sum0[ltx+2] + s_sum0[ltx+3]; + s_sum1[ltx] += s_sum1[ltx+2] + s_sum1[ltx+3]; + s_sumA[ltx] += s_sumA[ltx+2] + s_sumA[ltx+3]; + s_sumAA[ltx] += s_sumAA[ltx+2] + s_sumAA[ltx+3]; } __syncthreads(); if (sample == 0) { - // note, subtract to remove the double counting of sample == 3 - //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3]; - //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3]; - //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3]; - //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3]; - auto const sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3]; - auto const sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3]; - auto const sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3]; - auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3]; - auto const chi2 = sum0 > 0 ? 
(sumAA - sumA * sumA / sum1) / sum0 : static_cast(0); - chi2s[ch] = chi2; - sum0s[ch] = sum0; - sumAAs[ch] = sumAA; + // note, subtract to remove the double counting of sample == 3 + //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3]; + //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3]; + //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3]; + //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3]; + auto const sum0 = s_sum0[ltx] + s_sum0[ltx+1] - s_sum0[ltx+3]; + auto const sum1 = s_sum1[ltx] + s_sum1[ltx+1] - s_sum1[ltx+3]; + auto const sumA = s_sumA[ltx] + s_sumA[ltx+1] - s_sumA[ltx+3]; + auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx+1] - s_sumAA[ltx+3]; + auto const chi2 = sum0>0 + ? (sumAA - sumA * sumA / sum1) / sum0 + : static_cast(0); + chi2s[ch] = chi2; + sum0s[ch] = sum0; + sumAAs[ch] = sumAA; #ifdef DEBUG_TC_NULLHYPOT - if (ch == 0) { - printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast(sum0), sumAA); - } + if (ch == 0) { + printf("chi2 = %f sum0 = %d sumAA = %f\n", + chi2, static_cast(sum0), sumAA); + } #endif } - } } - - constexpr float fast_expf(float x) { return unsafe_expf<6>(x); } - constexpr float fast_logf(float x) { return unsafe_logf<7>(x); } - - //#define DEBUG_TC_MAKERATIO - // - // launch ctx parameters are - // 45 threads per channel, X channels per block, Y blocks - // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 - // TODO: it might be much beter to use 32 threads per channel instead of 45 - // to simplify the synchronization - // - __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids, - bool const* useless_sample_values, - char const* pedestal_nums, - ConfigurationParameters::type const* amplitudeFitParametersEB, - ConfigurationParameters::type const* amplitudeFitParametersEE, - ConfigurationParameters::type const* timeFitParametersEB, - ConfigurationParameters::type const* timeFitParametersEE, - SampleVector::Scalar 
const* sumAAsNullHypot, - SampleVector::Scalar const* sum0sNullHypot, - SampleVector::Scalar* tMaxAlphaBetas, - SampleVector::Scalar* tMaxErrorAlphaBetas, - SampleVector::Scalar* g_accTimeMax, - SampleVector::Scalar* g_accTimeWgt, - TimeComputationState* g_state, - unsigned int const timeFitParameters_sizeEB, - unsigned int const timeFitParameters_sizeEE, - ConfigurationParameters::type const timeFitLimits_firstEB, - ConfigurationParameters::type const timeFitLimits_firstEE, - ConfigurationParameters::type const timeFitLimits_secondEB, - ConfigurationParameters::type const timeFitLimits_secondEE, - int const nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2 - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockDim.x * blockIdx.x; - int const ch = gtx / nthreads_per_channel; - int const lch = threadIdx.x / nthreads_per_channel; - int const ltx = threadIdx.x % nthreads_per_channel; - int const ch_start = ch * nsamples; - int const lch_start = lch * nthreads_per_channel; - int const nchannels_per_block = blockDim.x / nthreads_per_channel; - - // rmeove inactive threads - // TODO: need to understand if this is 100% safe in presence of syncthreads - if (ch >= nchannels) - return; - - auto const did = DetId{dids[ch]}; - auto const isBarrel = did.subdetId() == EcalBarrel; - auto const* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE; - auto const* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE; - auto const timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE; - auto const timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE; - auto const timeFitLimits_second = isBarrel ? 
timeFitLimits_secondEB : timeFitLimits_secondEE; - - extern __shared__ char smem[]; - ScalarType* shr_chi2s = reinterpret_cast(smem); - ScalarType* shr_time_wgt = shr_chi2s + blockDim.x; - ScalarType* shr_time_max = shr_time_wgt + blockDim.x; - ScalarType* shrTimeMax = shr_time_max + blockDim.x; - ScalarType* shrTimeWgt = shrTimeMax + blockDim.x; - - // map tx -> (sample_i, sample_j) - int sample_i, sample_j = 0; - if (ltx >= 0 && ltx <= 8) { +} + +constexpr float fast_expf(float x) { return unsafe_expf<6>(x); } +constexpr float fast_logf(float x) { return unsafe_logf<7>(x); } + +//#define DEBUG_TC_MAKERATIO +// +// launch ctx parameters are +// 45 threads per channel, X channels per block, Y blocks +// 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 +// TODO: it might be much beter to use 32 threads per channel instead of 45 +// to simplify the synchronization +// +__global__ +void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + bool const* useless_sample_values, + char const* pedestal_nums, + ConfigurationParameters::type const* amplitudeFitParametersEB, + ConfigurationParameters::type const* amplitudeFitParametersEE, + ConfigurationParameters::type const* timeFitParametersEB, + ConfigurationParameters::type const* timeFitParametersEE, + SampleVector::Scalar const* sumAAsNullHypot, + SampleVector::Scalar const* sum0sNullHypot, + SampleVector::Scalar* tMaxAlphaBetas, + SampleVector::Scalar* tMaxErrorAlphaBetas, + SampleVector::Scalar* g_accTimeMax, + SampleVector::Scalar* g_accTimeWgt, + TimeComputationState* g_state, + unsigned int const timeFitParameters_sizeEB, + unsigned int const timeFitParameters_sizeEE, + ConfigurationParameters::type const timeFitLimits_firstEB, + ConfigurationParameters::type const timeFitLimits_firstEE, + ConfigurationParameters::type const timeFitLimits_secondEB, + 
ConfigurationParameters::type const timeFitLimits_secondEE, + int const nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2 + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const gtx = threadIdx.x + blockDim.x*blockIdx.x; + int const ch = gtx / nthreads_per_channel; + int const lch = threadIdx.x / nthreads_per_channel; + int const ltx = threadIdx.x % nthreads_per_channel; + int const ch_start = ch*nsamples; + int const lch_start = lch*nthreads_per_channel; + int const nchannels_per_block = blockDim.x / nthreads_per_channel; + auto const* dids = ch >= offsetForInputs + ? dids_ee + : dids_eb; + int const inputCh = ch >= offsetForInputs + ? ch - offsetForInputs + : ch; + + // rmeove inactive threads + // TODO: need to understand if this is 100% safe in presence of syncthreads + if (ch >= nchannels) return; + + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const* amplitudeFitParameters = isBarrel + ? amplitudeFitParametersEB + : amplitudeFitParametersEE; + auto const* timeFitParameters = isBarrel + ? timeFitParametersEB + : timeFitParametersEE; + auto const timeFitParameters_size = isBarrel + ? timeFitParameters_sizeEB + : timeFitParameters_sizeEE; + auto const timeFitLimits_first = isBarrel + ? timeFitLimits_firstEB + : timeFitLimits_firstEE; + auto const timeFitLimits_second = isBarrel + ? 
timeFitLimits_secondEB + : timeFitLimits_secondEE; + + extern __shared__ char smem[]; + ScalarType* shr_chi2s = reinterpret_cast(smem); + ScalarType* shr_time_wgt = shr_chi2s + blockDim.x; + ScalarType* shr_time_max = shr_time_wgt + blockDim.x; + ScalarType* shrTimeMax = shr_time_max + blockDim.x; + ScalarType* shrTimeWgt = shrTimeMax + blockDim.x; + + // map tx -> (sample_i, sample_j) + int sample_i, sample_j = 0; + if (ltx>=0 && ltx<=8) { sample_i = 0; - sample_j = 1 + ltx; - } else if (ltx <= 16) { + sample_j = 1+ltx; + } else if (ltx<=16) { sample_i = 1; - sample_j = 2 + ltx - 9; - } else if (ltx <= 23) { + sample_j = 2+ltx-9; + } else if (ltx<=23) { sample_i = 2; sample_j = 3 + ltx - 17; - } else if (ltx <= 29) { + } else if (ltx<=29) { sample_i = 3; sample_j = 4 + ltx - 24; - } else if (ltx <= 34) { + } else if (ltx<=34) { sample_i = 4; sample_j = 5 + ltx - 30; - } else if (ltx <= 38) { + } else if (ltx<=38) { sample_i = 5; sample_j = 6 + ltx - 35; - } else if (ltx <= 41) { + } else if (ltx<=41) { sample_i = 6; sample_j = 7 + ltx - 39; - } else if (ltx <= 43) { + } else if (ltx<=43) { sample_i = 7; sample_j = 8 + ltx - 42; - } else if (ltx <= 44) { + } else if (ltx <= 44) { sample_i = 8; sample_j = 9; - } else + } else assert(false); - auto const tx_i = ch_start + sample_i; - auto const tx_j = ch_start + sample_j; + auto const tx_i = ch_start + sample_i; + auto const tx_j = ch_start + sample_j; - // - // note, given the way we partition the block, with 45 threads per channel - // we will end up with inactive threads which need to be dragged along - // through the synching point - // - /* + // + // note, given the way we partition the block, with 45 threads per channel + // we will end up with inactive threads which need to be dragged along + // through the synching point + // + /* bool const condToExit = ch >= nchannels ? 
true : useless_sample_values[tx_i] || useless_sample_values[tx_j] || sample_values[tx_i]<=1 || sample_values[tx_j]<=1; */ - bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] || - sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1; - - // - // see cpu implementation for explanation - // - ScalarType chi2 = std::numeric_limits::max(); - ScalarType tmax = 0; - ScalarType tmaxerr = 0; - shrTimeMax[threadIdx.x] = 0; - shrTimeWgt[threadIdx.x] = 0; - bool internalCondForSkipping1 = true; - bool internalCondForSkipping2 = true; - if (!condForUselessSamples) { + bool const condForUselessSamples = useless_sample_values[tx_i] + || useless_sample_values[tx_j] + || sample_values[tx_i]<=1 || sample_values[tx_j]<=1; + + // + // see cpu implementation for explanation + // + ScalarType chi2 = std::numeric_limits::max(); + ScalarType tmax = 0; + ScalarType tmaxerr = 0; + shrTimeMax[threadIdx.x] = 0; + shrTimeWgt[threadIdx.x] = 0; + bool internalCondForSkipping1 = true; + bool internalCondForSkipping2 = true; + if (!condForUselessSamples) { auto const rtmp = sample_values[tx_i] / sample_values[tx_j]; auto const invampl_i = 1.0 / sample_values[tx_i]; - auto const relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i; + auto const relErr2_i = sample_value_errors[tx_i]*sample_value_errors[tx_i]* + invampl_i*invampl_i; auto const invampl_j = 1.0 / sample_values[tx_j]; - auto const relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j; + auto const relErr2_j = sample_value_errors[tx_j]*sample_value_errors[tx_j]* + invampl_j*invampl_j; auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j); - auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j); + auto err2 = sample_value_errors[tx_j]* + (sample_values[tx_i] - sample_values[tx_j])*(invampl_j*invampl_j); // TODO non-divergent branch for a block if each block has 1 
channel // otherwise non-divergent for groups of 45 threads // at this point, pedestal_nums[ch] can be either 0, 1 or 2 - if (pedestal_nums[ch] == 2) - err2 *= err2 * 0.5; - auto const err3 = (0.289 * 0.289) * (invampl_j * invampl_j); + if (pedestal_nums[ch]==2) + err2 *= err2 * 0.5; + auto const err3 = (0.289*0.289) * (invampl_j*invampl_j); auto const total_error = std::sqrt(err1 + err2 + err3); auto const alpha = amplitudeFitParameters[0]; @@ -261,153 +290,158 @@ namespace ecal { auto const ratio_value = rtmp; auto const ratio_error = total_error; - auto const rlim_i_j = fast_expf(static_cast(sample_j - sample_i) / beta) - 0.001; - internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j); + auto const rlim_i_j = fast_expf( + static_cast(sample_j - sample_i) / beta) - 0.001; + internalCondForSkipping1 = !(total_error<1.0 && rtmp>0.001 && rtmp= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) { - auto const time_max_i = static_cast(ratio_index); - auto u = timeFitParameters[timeFitParameters_size - 1]; + // + // precompute. + // in cpu version this was done conditionally + // however easier to do it here (precompute) and then just filter out + // if not needed + // + auto const l_timeFitLimits_first = timeFitLimits_first; + auto const l_timeFitLimits_second = timeFitLimits_second; + if (ratio_step == 1 + && ratio_value >= l_timeFitLimits_first + && ratio_value <= l_timeFitLimits_second) { + + auto const time_max_i = static_cast(ratio_index); + auto u = timeFitParameters[timeFitParameters_size - 1]; #pragma unroll - for (int k = timeFitParameters_size - 2; k >= 0; k--) - u = u * ratio_value + timeFitParameters[k]; - - auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]); - for (int k = timeFitParameters_size - 2; k >= 1; k--) - du = du * ratio_value + k * timeFitParameters[k]; - - auto const error2 = ratio_error * ratio_error * du * du; - auto const time_max = error2 > 0 ? 
(time_max_i - u) / error2 : static_cast(0); - auto const time_wgt = error2 > 0 ? 1.0 / error2 : static_cast(0); - - // store into shared mem - // note, this name is essentially identical to the one used - // below. - shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0; - shrTimeWgt[threadIdx.x] = error2 > 0 ? time_wgt : 0; - } else { - shrTimeMax[threadIdx.x] = 0; - shrTimeWgt[threadIdx.x] = 0; - } - - // continue with ratios - auto const stepOverBeta = static_cast(ratio_step) / beta; - auto const offset = static_cast(ratio_index) + alphabeta; - auto const rmin = std::max(ratio_value - ratio_error, 0.001); - auto const rmax = std::min(ratio_value + ratio_error, - fast_expf(static_cast(ratio_step) / beta) - 0.001); - auto const time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0); - auto const time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0); - - // set these guys - tmax = 0.5 * (time1 + time2); - tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2)); + for (int k=timeFitParameters_size-2; k>=0; k--) + u = u*ratio_value + timeFitParameters[k]; + + auto du = (timeFitParameters_size - 1) * + (timeFitParameters[timeFitParameters_size - 1]); + for (int k=timeFitParameters_size - 2; k>=1; k--) + du = du*ratio_value + k*timeFitParameters[k]; + + auto const error2 = ratio_error * ratio_error * du * du; + auto const time_max = error2 > 0 + ? (time_max_i - u) / error2 + : static_cast(0); + auto const time_wgt = error2 > 0 + ? 1.0 / error2 + : static_cast(0); + + // store into shared mem + // note, this name is essentially identical to the one used + // below. + shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0; + shrTimeWgt[threadIdx.x] = error2 > 0 ? 
time_wgt : 0; + } else { + shrTimeMax[threadIdx.x] = 0; + shrTimeWgt[threadIdx.x] = 0; + } + + // continue with ratios + auto const stepOverBeta = static_cast(ratio_step) / beta; + auto const offset = static_cast(ratio_index) + alphabeta; + auto const rmin = std::max(ratio_value - ratio_error, 0.001); + auto const rmax = std::min(ratio_value + ratio_error, + fast_expf(static_cast(ratio_step) / beta) + - 0.001); + auto const time1 = + offset - + ratio_step / + (fast_expf((stepOverBeta - fast_logf(rmin)) / + alpha) - 1.0); + auto const time2 = + offset - + ratio_step / + (fast_expf((stepOverBeta - fast_logf(rmax)) / + alpha) - 1.0); + + // set these guys + tmax = 0.5 * (time1 + time2); + tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2)); #ifdef DEBUG_TC_MAKERATIO - if (ch == 1 || ch == 0) - printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n", - ch, - ltx, - tmax, - tmaxerr, - time1, - time2, - offset, - rmin, - rmax); + if (ch == 1 || ch == 0) + printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n", + ch, ltx, tmax, tmaxerr, time1, time2, offset, rmin, rmax); #endif - SampleVector::Scalar sumAf = 0; - SampleVector::Scalar sumff = 0; - int const itmin = std::max(-1, static_cast(std::floor(tmax - alphabeta))); - auto loffset = (static_cast(itmin) - tmax) * invalphabeta; - // TODO: data dependence - for (int it = itmin + 1; it < nsamples; it++) { - loffset += invalphabeta; - if (useless_sample_values[ch_start + it]) - continue; - auto const inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]); - auto const term1 = 1.0 + loffset; - auto const f = (term1 > 1e-6) ? 
fast_expf(alpha * (fast_logf(term1) - loffset)) : 0; - sumAf += sample_values[ch_start + it] * (f * inverr2); - sumff += f * (f * inverr2); - } - - auto const sumAA = sumAAsNullHypot[ch]; - auto const sum0 = sum0sNullHypot[ch]; - chi2 = sumAA; - ScalarType amp = 0; - // TODO: sum0 can not be 0 below, need to introduce the check upfront - if (sumff > 0) { - chi2 = sumAA - sumAf * (sumAf / sumff); - amp = sumAf / sumff; - } - chi2 /= sum0; + SampleVector::Scalar sumAf = 0; + SampleVector::Scalar sumff = 0; + int const itmin = std::max(-1, static_cast(std::floor(tmax - alphabeta))); + auto loffset = (static_cast(itmin) - tmax) * invalphabeta; + // TODO: data dependence + for (int it = itmin+1; it 1e-6) + ? fast_expf(alpha * (fast_logf(term1) - loffset)) + : 0; + sumAf += sample_values[ch_start+it] * (f * inverr2); + sumff += f*(f*inverr2); + } + + auto const sumAA = sumAAsNullHypot[ch]; + auto const sum0 = sum0sNullHypot[ch]; + chi2 = sumAA; + ScalarType amp = 0; + // TODO: sum0 can not be 0 below, need to introduce the check upfront + if (sumff > 0) { + chi2 = sumAA - sumAf * (sumAf / sumff); + amp = sumAf / sumff; + } + chi2 /= sum0; #ifdef DEBUG_TC_MAKERATIO - if (ch == 1 || ch == 0) - printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n", - ch, - ltx, - sumAf, - sumff, - sumAA, - static_cast(sum0), - tmax, - tmaxerr, - chi2); + if (ch == 1 || ch == 0) + printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n", + ch, ltx, sumAf, sumff, sumAA, static_cast(sum0), tmax, tmaxerr, chi2); #endif - if (chi2 > 0 && tmax > 0 && tmaxerr > 0) - internalCondForSkipping2 = false; - else - chi2 = std::numeric_limits::max(); + if (chi2>0 && tmax>0 && tmaxerr>0) + internalCondForSkipping2 = false; + else + chi2 = std::numeric_limits::max(); } - } + } - // store into smem - shr_chi2s[threadIdx.x] = chi2; - __syncthreads(); + // store into smem + shr_chi2s[threadIdx.x] = chi2; + 
__syncthreads(); - // find min chi2 - quite crude for now - // TODO validate/check - char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; - bool oddElements = nthreads_per_channel % 2; + // find min chi2 - quite crude for now + // TODO validate/check + char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + bool oddElements = nthreads_per_channel % 2; #pragma unroll - while (iter >= 1) { + while (iter>=1) { if (ltx < iter) - // for odd ns, the last guy will just store itself - // exception is for ltx == 0 and iter==1 - shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) - ? shr_chi2s[threadIdx.x] - : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]); + // for odd ns, the last guy will just store itself + // exception is for ltx == 0 and iter==1 + shr_chi2s[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) + ? shr_chi2s[threadIdx.x] + : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x+iter]); __syncthreads(); oddElements = iter % 2; - iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; - } + iter = iter==1 ? iter/2 : iter/2 + iter%2; + } - // filter out inactive or useless samples threads - if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) { + // filter out inactive or useless samples threads + if (!condForUselessSamples && !internalCondForSkipping1 + && !internalCondForSkipping2) { // min chi2, now compute weighted average of tmax measurements // see cpu version for more explanation auto const chi2min = shr_chi2s[threadIdx.x - ltx]; auto const chi2Limit = chi2min + 1.0; - auto const inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0; + auto const inverseSigmaSquared = + chi2 < chi2Limit + ? 
1.0 / (tmaxerr * tmaxerr) + : 0.0; #ifdef DEBUG_TC_MAKERATIO if (ch == 1 || ch == 0) - printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n", - ch, - ltx, - chi2min, - chi2Limit, - inverseSigmaSquared); + printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n", + ch, ltx, chi2min, chi2Limit, inverseSigmaSquared); #endif // store into shared mem and run reduction @@ -415,48 +449,48 @@ namespace ecal { // TODO: check if shuffling intrinsics are better shr_time_wgt[threadIdx.x] = inverseSigmaSquared; shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared; - } else { + } else { shr_time_wgt[threadIdx.x] = 0; shr_time_max[threadIdx.x] = 0; - } - __syncthreads(); + } + __syncthreads(); - // reduce to compute time_max and time_wgt - iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; - oddElements = nthreads_per_channel % 2; + // reduce to compute time_max and time_wgt + iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + oddElements = nthreads_per_channel % 2; #pragma unroll - while (iter >= 1) { + while (iter>=1) { if (ltx < iter) { - shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) - ? shr_time_wgt[threadIdx.x] - : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter]; - shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) - ? shr_time_max[threadIdx.x] - : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter]; - shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) - ? shrTimeMax[threadIdx.x] - : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter]; - shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) - ? shrTimeWgt[threadIdx.x] - : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter]; + shr_time_wgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) + ? 
shr_time_wgt[threadIdx.x] + : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x+iter]; + shr_time_max[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) + ? shr_time_max[threadIdx.x] + : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x+iter]; + shrTimeMax[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) + ? shrTimeMax[threadIdx.x] + : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x+iter]; + shrTimeWgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) + ? shrTimeWgt[threadIdx.x] + : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x+iter]; } - + __syncthreads(); oddElements = iter % 2; - iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; - } + iter = iter==1 ? iter/2 : iter/2 + iter%2; + } - // load from shared memory the 0th guy (will contain accumulated values) - // compute - // store into global mem - if (ltx == 0) { + // load from shared memory the 0th guy (will contain accumulated values) + // compute + // store into global mem + if (ltx == 0) { auto const tmp_time_max = shr_time_max[threadIdx.x]; auto const tmp_time_wgt = shr_time_wgt[threadIdx.x]; // we are done if there number of time ratios is 0 - if (tmp_time_wgt == 0 && tmp_time_max == 0) { - g_state[ch] = TimeComputationState::Finished; - return; + if (tmp_time_wgt==0 && tmp_time_max==0) { + g_state[ch] = TimeComputationState::Finished; + return ; } // no div by 0 @@ -470,25 +504,26 @@ namespace ecal { g_state[ch] = TimeComputationState::NotFinished; #ifdef DEBUG_TC_MAKERATIO - printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt); - printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n", - ch, - tMaxAlphaBeta, - tMaxErrorAlphaBeta, - shrTimeMax[threadIdx.x], - shrTimeWgt[threadIdx.x]); + printf("ch = %d time_max = %f time_wgt = %f\n", + ch, tmp_time_max, tmp_time_wgt); + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n", + ch, tMaxAlphaBeta, tMaxErrorAlphaBeta, + shrTimeMax[threadIdx.x], + 
shrTimeWgt[threadIdx.x]); #endif - } } - - /// launch ctx parameters are - /// 10 threads per channel, N channels per block, Y blocks - /// TODO: do we need to keep the state around or can be removed?! - //#define DEBUG_FINDAMPLCHI2_AND_FINISH - __global__ void kernel_time_compute_findamplchi2_and_finish( +} + +/// launch ctx parameters are +/// 10 threads per channel, N channels per block, Y blocks +/// TODO: do we need to keep the state around or can be removed?! +//#define DEBUG_FINDAMPLCHI2_AND_FINISH +__global__ +void kernel_time_compute_findamplchi2_and_finish( SampleVector::Scalar const* sample_values, SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids, + uint32_t const* dids_eb, + uint32_t const* dids_ee, bool const* useless_samples, SampleVector::Scalar const* g_tMaxAlphaBeta, SampleVector::Scalar const* g_tMaxErrorAlphaBeta, @@ -504,36 +539,44 @@ namespace ecal { SampleVector::Scalar* g_ampMaxError, SampleVector::Scalar* g_timeMax, SampleVector::Scalar* g_timeError, - int const nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x * blockDim.x; - int const ch = gtx / nsamples; - int const sample = threadIdx.x % nsamples; - int const ch_start = ch * nsamples; - - // configure shared mem - // per block, we need #threads per block * 2 * sizeof(ScalarType) - // we run with N channels per block - extern __shared__ char smem[]; - ScalarType* shr_sumAf = reinterpret_cast(smem); - ScalarType* shr_sumff = shr_sumAf + blockDim.x; - - if (ch >= nchannels) - return; - - auto state = g_state[ch]; - auto const did = DetId{dids[ch]}; - auto const* amplitudeFitParameters = - did.subdetId() == EcalBarrel ? 
amplitudeFitParametersEB : amplitudeFitParametersEE; - - // TODO is that better than storing into global and launching another kernel - // for the first 10 threads - if (state == TimeComputationState::NotFinished) { + int const nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const gtx = threadIdx.x + blockIdx.x*blockDim.x; + int const ch = gtx / nsamples; + int const sample = threadIdx.x % nsamples; + int const ch_start = ch * nsamples; + auto const* dids = ch >= offsetForInputs + ? dids_ee + : dids_eb; + int const inputCh = ch >= offsetForInputs + ? ch - offsetForInputs + : ch; + + // configure shared mem + // per block, we need #threads per block * 2 * sizeof(ScalarType) + // we run with N channels per block + extern __shared__ char smem[]; + ScalarType* shr_sumAf = reinterpret_cast(smem); + ScalarType* shr_sumff = shr_sumAf + blockDim.x; + + if (ch >= nchannels) return; + + auto state = g_state[ch]; + auto const did = DetId{dids[inputCh]}; + auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel + ? amplitudeFitParametersEB + : amplitudeFitParametersEE; + + + // TODO is that better than storing into global and launching another kernel + // for the first 10 threads + if (state == TimeComputationState::NotFinished) { auto const alpha = amplitudeFitParameters[0]; auto const beta = amplitudeFitParameters[1]; auto const alphabeta = alpha * beta; @@ -541,91 +584,96 @@ namespace ecal { auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; auto const sample_value = sample_values[gtx]; auto const sample_value_error = sample_value_errors[gtx]; - auto const inverr2 = - useless_samples[gtx] ? static_cast(0) : 1.0 / (sample_value_error * sample_value_error); - auto const offset = (static_cast(sample) - tMaxAlphaBeta) * invalphabeta; + auto const inverr2 = useless_samples[gtx] + ? 
static_cast(0) + : 1.0 / (sample_value_error * sample_value_error); + auto const offset = (static_cast(sample) - tMaxAlphaBeta) + * invalphabeta; auto const term1 = 1.0 + offset; - auto const f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast(0.0); + auto const f = term1 > 1e-6 + ? fast_expf(alpha * (fast_logf(term1) - offset)) + : static_cast(0.0); auto const sumAf = sample_value * (f * inverr2); auto const sumff = f * (f * inverr2); // store into shared mem shr_sumAf[threadIdx.x] = sumAf; shr_sumff[threadIdx.x] = sumff; - } else { + } else { shr_sumAf[threadIdx.x] = 0; shr_sumff[threadIdx.x] = 0; - } - __syncthreads(); - - // reduce - // unroll completely here (but hardcoded) - if (sample < 5) { - shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5]; - shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5]; - } - __syncthreads(); - - if (sample < 2) { + } + __syncthreads(); + + // reduce + // unroll completely here (but hardcoded) + if (sample<5) { + shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+5]; + shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+5]; + } + __syncthreads(); + + if (sample<2) { // will need to subtract for ltx = 3, we double count here - shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3]; - shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3]; - } - __syncthreads(); + shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+2] + + shr_sumAf[threadIdx.x+3]; + shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+2] + + shr_sumff[threadIdx.x+3]; + } + __syncthreads(); - if (sample == 0) { + if (sample==0) { // exit if the state is done // note, we do not exit before all __synchtreads are finished if (state == TimeComputationState::Finished) { - g_timeMax[ch] = 5; - g_timeError[ch] = -999; - return; + g_timeMax[ch] = 5; + g_timeError[ch] = -999; + return; } // subtract to avoid double counting - auto const sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - 
shr_sumff[threadIdx.x + 3]; - auto const sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3]; - - auto const ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0; + auto const sumff = shr_sumff[threadIdx.x] + + shr_sumff[threadIdx.x+1] + - shr_sumff[threadIdx.x+3]; + auto const sumAf = shr_sumAf[threadIdx.x] + + shr_sumAf[threadIdx.x+1] + - shr_sumAf[threadIdx.x+3]; + + auto const ampMaxAlphaBeta = sumff>0 ? sumAf / sumff : 0; auto const sumAA = sumAAsNullHypot[ch]; auto const sum0 = sum0sNullHypot[ch]; auto const nullChi2 = chi2sNullHypot[ch]; if (sumff > 0) { - auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0; - if (chi2AlphaBeta > nullChi2) { - // null hypothesis is better - state = TimeComputationState::Finished; + auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0; + if (chi2AlphaBeta > nullChi2) { + // null hypothesis is better + state = TimeComputationState::Finished; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n", - ch, - chi2AlphaBeta, - nullChi2, - sumAA, - sumAf, - sumff, - sum0); + printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n", + ch, chi2AlphaBeta, nullChi2, sumAA, sumAf, sumff, sum0); #endif - } + } - // store to global - g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta; + // store to global + g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta; } else { #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf); + printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", + ch, sum0, sumAA, sumff, sumAf); #endif - state = TimeComputationState::Finished; + state = TimeComputationState::Finished; } // store the state to global and finish calcs g_state[ch] = state; if (state == TimeComputationState::Finished) { - // store default values into global - g_timeMax[ch] = 5; - g_timeError[ch] = -999; + // 
store default values into global + g_timeMax[ch] = 5; + g_timeError[ch] = -999; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d finished state\n", ch); + printf("ch = %d finished state\n", ch); #endif - return; + return; } auto const ampMaxError = g_ampMaxError[ch]; @@ -636,242 +684,306 @@ namespace ecal { auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch]; // branch to separate large vs small pulses // see cpu version for more info - if (test_ratio > 5.0 && accTimeWgt > 0) { - auto const tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast(0); - auto const tMaxErrorRatio = accTimeWgt > 0 ? 1.0 / std::sqrt(accTimeWgt) : static_cast(0); - - if (test_ratio > 10.0) { - g_timeMax[ch] = tMaxRatio; - g_timeError[ch] = tMaxErrorRatio; - + if (test_ratio > 5.0 && accTimeWgt>0) { + auto const tMaxRatio = accTimeWgt>0 + ? accTimeMax / accTimeWgt + : static_cast(0); + auto const tMaxErrorRatio = accTimeWgt>0 + ? 1.0 / std::sqrt(accTimeWgt) + : static_cast(0); + + if (test_ratio > 10.0) { + g_timeMax[ch] = tMaxRatio; + g_timeError[ch] = tMaxErrorRatio; + #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio); + printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", + ch, tMaxRatio, tMaxErrorRatio); #endif - } else { - auto const timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + - tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / - 5.0; - auto const timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + - tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / - 5.0; - state = TimeComputationState::Finished; - g_state[ch] = state; - g_timeMax[ch] = timeMax; - g_timeError[ch] = timeError; + } else { + auto const timeMax = + (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + + tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0; + auto const timeError = + (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + + 
tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0; + state = TimeComputationState::Finished; + g_state[ch] = state; + g_timeMax[ch] = timeMax; + g_timeError[ch] = timeError; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError); + printf("ch = %d timeMax = %f timeError = %f\n", + ch, timeMax, timeError); #endif - } - } else { - state = TimeComputationState::Finished; - g_state[ch] = state; - g_timeMax[ch] = tMaxAlphaBeta; - g_timeError[ch] = tMaxErrorAlphaBeta; + } + } + else { + state = TimeComputationState::Finished; + g_state[ch] = state; + g_timeMax[ch] = tMaxAlphaBeta; + g_timeError[ch] = tMaxErrorAlphaBeta; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta); + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", + ch, tMaxAlphaBeta, tMaxErrorAlphaBeta); #endif } - } } - - __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis, - SampleVector::Scalar* sample_values, - SampleVector::Scalar* sample_value_errors, - bool* useless_sample_values, - unsigned int const sample_mask, - int const nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x * blockDim.x; - int const ch = gtx / nsamples; - int const sample = threadIdx.x % nsamples; - - // remove thread for sample 0, oversubscribing is easier than .... 
- if (ch >= nchannels || sample == 0) - return; - - if (!use_sample(sample_mask, sample)) - return; - - auto const gainIdPrev = ecal::mgpa::gainId(digis[gtx - 1]); - auto const gainIdNext = ecal::mgpa::gainId(digis[gtx]); - if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) { - sample_values[gtx - 1] = 0; - sample_value_errors[gtx - 1] = 1e+9; - useless_sample_values[gtx - 1] = true; - } +} + +__global__ +void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb, + uint16_t const* digis_ee, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + bool* useless_sample_values, + unsigned int const sample_mask, + int const nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const gtx = threadIdx.x + blockIdx.x * blockDim.x; + int const ch = gtx / nsamples; + int const sample = threadIdx.x % nsamples; + int const inputCh = ch >= offsetForInputs + ? ch - offsetForInputs + : ch; + int const inputGtx = ch >= offsetForInputs + ? gtx - offsetForInputs*nsamples + : gtx; + auto const* digis = ch >= offsetForInputs + ? digis_ee + : digis_eb; + + // remove thread for sample 0, oversubscribing is easier than .... 
+ if (ch >= nchannels || sample==0) return; + + if (!use_sample(sample_mask, sample)) return; + + auto const gainIdPrev = ecal::mgpa::gainId(digis[inputGtx-1]); + auto const gainIdNext = ecal::mgpa::gainId(digis[inputGtx]); + if (gainIdPrev>=1 && gainIdPrev<=3 && + gainIdNext>=1 && gainIdNext<=3 && gainIdPrev < gainIdNext) { + sample_values[gtx-1] = 0; + sample_value_errors[gtx-1] = 1e+9; + useless_sample_values[gtx-1] = true; + } +} + +__global__ +void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids, + bool const* useless_samples, + SampleVector::Scalar const* g_timeMax, + SampleVector::Scalar const* amplitudeFitParametersEB, + SampleVector::Scalar const* amplitudeFitParametersEE, + SampleVector::Scalar *g_amplitudeMax, + int const nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr ScalarType corr4 = 1.; + constexpr ScalarType corr6 = 1.; + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const gtx = threadIdx.x + blockIdx.x * blockDim.x; + int const ch = gtx / nsamples; + int const sample = threadIdx.x % nsamples; + + if (ch >= nchannels) return; + + auto const did = DetId{dids[ch]}; + auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel + ? 
amplitudeFitParametersEB + : amplitudeFitParametersEE; + + // configure shared mem + extern __shared__ char smem[]; + ScalarType* shr_sum1 = reinterpret_cast(smem); + auto *shr_sumA = shr_sum1 + blockDim.x; + auto *shr_sumF = shr_sumA + blockDim.x; + auto *shr_sumAF = shr_sumF + blockDim.x; + auto *shr_sumFF = shr_sumAF + blockDim.x; + + auto const alpha = amplitudeFitParameters[0]; + auto const beta = amplitudeFitParameters[1]; + auto const timeMax = g_timeMax[ch]; + auto const pedestalLimit = timeMax - (alpha * beta) - 1.0; + auto const sample_value = sample_values[gtx]; + auto const sample_value_error = sample_value_errors[gtx]; + auto const inverr2 = sample_value_error > 0 + ? 1. / (sample_value_error * sample_value_error) + : static_cast(0); + auto const termOne = 1 + (sample - timeMax) / (alpha * beta); + auto const f = termOne > 1.e-5 + ? fast_expf(alpha * fast_logf(termOne) - + (sample - timeMax) / beta) + : static_cast(0.); + + bool const cond = ((sample < pedestalLimit) || + (f>0.6*corr6 && sample<=timeMax) || + (f>0.4*corr4 && sample>=timeMax)) && !useless_samples[gtx]; + + // store into shared mem + shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast(0); + shr_sumA[threadIdx.x] = cond + ? sample_value * inverr2 + : static_cast(0); + shr_sumF[threadIdx.x] = cond + ? f * inverr2 + : static_cast(0); + shr_sumAF[threadIdx.x] = cond + ? (f*inverr2)*sample_value + : static_cast(0); + shr_sumFF[threadIdx.x] = cond + ? 
f*(f*inverr2) + : static_cast(0); + + // reduction + if (sample <= 4) { + shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+5]; + shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+5]; + shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+5]; + shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+5]; + shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+5]; } + __syncthreads(); - __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids, - bool const* useless_samples, - SampleVector::Scalar const* g_timeMax, - SampleVector::Scalar const* amplitudeFitParametersEB, - SampleVector::Scalar const* amplitudeFitParametersEE, - SampleVector::Scalar* g_amplitudeMax, - int const nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr ScalarType corr4 = 1.; - constexpr ScalarType corr6 = 1.; - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x * blockDim.x; - int const ch = gtx / nsamples; - int const sample = threadIdx.x % nsamples; - - if (ch >= nchannels) - return; - - auto const did = DetId{dids[ch]}; - auto const* amplitudeFitParameters = - did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE; - - // configure shared mem - extern __shared__ char smem[]; - ScalarType* shr_sum1 = reinterpret_cast(smem); - auto* shr_sumA = shr_sum1 + blockDim.x; - auto* shr_sumF = shr_sumA + blockDim.x; - auto* shr_sumAF = shr_sumF + blockDim.x; - auto* shr_sumFF = shr_sumAF + blockDim.x; - - auto const alpha = amplitudeFitParameters[0]; - auto const beta = amplitudeFitParameters[1]; - auto const timeMax = g_timeMax[ch]; - auto const pedestalLimit = timeMax - (alpha * beta) - 1.0; - auto const sample_value = sample_values[gtx]; - auto const sample_value_error = sample_value_errors[gtx]; - auto const inverr2 = - sample_value_error > 0 ? 1. 
/ (sample_value_error * sample_value_error) : static_cast(0); - auto const termOne = 1 + (sample - timeMax) / (alpha * beta); - auto const f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta) - : static_cast(0.); - - bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) || - (f > 0.4 * corr4 && sample >= timeMax)) && - !useless_samples[gtx]; - - // store into shared mem - shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast(0); - shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast(0); - shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast(0); - shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast(0); - shr_sumFF[threadIdx.x] = cond ? f * (f * inverr2) : static_cast(0); - - // reduction - if (sample <= 4) { - shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5]; - shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5]; - shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5]; - shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5]; - shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5]; - } - __syncthreads(); - - if (sample < 2) { + if (sample < 2) { // note: we double count sample 3 - shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3]; - shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3]; - shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3]; - shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3]; - shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3]; - } - __syncthreads(); - - if (sample == 0) { - auto const sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3]; - auto const sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3]; - auto const sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3]; - auto const sumAF = 
shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3]; - auto const sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3]; - - auto const denom = sumFF * sum1 - sumF * sumF; - auto const condForDenom = sum1 > 0 && ecal::abs(denom) > 1.e-20; - auto const amplitudeMax = condForDenom ? (sumAF * sum1 - sumA * sumF) / denom : static_cast(0.); + shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+2] + shr_sum1[threadIdx.x+3]; + shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+2] + shr_sumA[threadIdx.x+3]; + shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+2] + shr_sumF[threadIdx.x+3]; + shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+2] + + shr_sumAF[threadIdx.x+3]; + shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+2] + + shr_sumFF[threadIdx.x+3]; + } + __syncthreads(); + + if (sample == 0) { + auto const sum1 = shr_sum1[threadIdx.x] + + shr_sum1[threadIdx.x+1] - shr_sum1[threadIdx.x+3]; + auto const sumA = shr_sumA[threadIdx.x] + + shr_sumA[threadIdx.x+1] - shr_sumA[threadIdx.x+3]; + auto const sumF = shr_sumF[threadIdx.x] + + shr_sumF[threadIdx.x+1] - shr_sumF[threadIdx.x+3]; + auto const sumAF = shr_sumAF[threadIdx.x] + + shr_sumAF[threadIdx.x+1] - shr_sumAF[threadIdx.x+3]; + auto const sumFF = shr_sumFF[threadIdx.x] + + shr_sumFF[threadIdx.x+1] - shr_sumFF[threadIdx.x+3]; + + auto const denom = sumFF * sum1 - sumF*sumF; + auto const condForDenom = sum1 > 0 && ecal::abs(denom)>1.e-20; + auto const amplitudeMax = condForDenom + ? 
(sumAF * sum1 - sumA * sumF) / denom + : static_cast(0.); // store into global mem g_amplitudeMax[ch] = amplitudeMax; - } } - - //#define ECAL_RECO_CUDA_TC_INIT_DEBUG - __global__ void kernel_time_computation_init(uint16_t const* digis, - uint32_t const* dids, - float const* rms_x12, - float const* rms_x6, - float const* rms_x1, - float const* mean_x12, - float const* mean_x6, - float const* mean_x1, - float const* gain12Over6, - float const* gain6Over1, - SampleVector::Scalar* sample_values, - SampleVector::Scalar* sample_value_errors, - SampleVector::Scalar* ampMaxError, - bool* useless_sample_values, - char* pedestal_nums, - uint32_t const offsetForHashes, - unsigned int const sample_maskEB, - unsigned int const sample_maskEE, - int nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int tx = threadIdx.x + blockDim.x * blockIdx.x; - int ch = tx / nsamples; - - if (ch < nchannels) { +} + +//#define ECAL_RECO_CUDA_TC_INIT_DEBUG +__global__ +void kernel_time_computation_init(uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* mean_x12, + float const* mean_x6, + float const* mean_x1, + float const* gain12Over6, + float const* gain6Over1, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + SampleVector::Scalar* ampMaxError, + bool* useless_sample_values, + char* pedestal_nums, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + unsigned int const sample_maskEB, + unsigned int const sample_maskEE, + int nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const tx = threadIdx.x + blockDim.x*blockIdx.x; + int const ch = tx/nsamples; + int const inputTx = ch >= offsetForInputs + ? 
tx - offsetForInputs*nsamples + : tx; + int const inputCh = ch >= offsetForInputs + ? ch - offsetForInputs + : ch; + auto const* digis = ch >= offsetForInputs + ? digis_ee + : digis_eb; + auto const* dids = ch >= offsetForInputs + ? dids_ee + : dids_eb; + + if (ch < nchannels) { // indices/inits - int sample = tx % nsamples; - int ch_start = ch * nsamples; + int const sample = tx % nsamples; + int const ch_start = ch*nsamples; + int const input_ch_start = inputCh*nsamples; SampleVector::Scalar pedestal = 0.; int num = 0; // configure shared mem extern __shared__ char smem[]; - ScalarType* shrSampleValues = reinterpret_cast(smem); + ScalarType* shrSampleValues = + reinterpret_cast(smem); ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x; // 0 and 1 sample values - auto const adc0 = ecal::mgpa::adc(digis[ch_start]); - auto const gainId0 = ecal::mgpa::gainId(digis[ch_start]); - auto const adc1 = ecal::mgpa::adc(digis[ch_start + 1]); - auto const gainId1 = ecal::mgpa::gainId(digis[ch_start + 1]); - auto const did = DetId{dids[ch]}; + auto const adc0 = ecal::mgpa::adc(digis[input_ch_start]); + auto const gainId0 = ecal::mgpa::gainId(digis[input_ch_start]); + auto const adc1 = ecal::mgpa::adc(digis[input_ch_start+1]); + auto const gainId1 = ecal::mgpa::gainId(digis[input_ch_start+1]); + auto const did = DetId{dids[inputCh]}; auto const isBarrel = did.subdetId() == EcalBarrel; - auto const sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE; - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + auto const sample_mask = did.subdetId() == EcalBarrel + ? sample_maskEB + : sample_maskEE; + auto const hashedId = isBarrel + ? 
hashedIndexEB(did.rawId()) + : offsetForHashes + hashedIndexEE(did.rawId()); // set pedestal // TODO this branch is non-divergent for a group of 10 threads if (gainId0 == 1 && use_sample(sample_mask, 0)) { - pedestal = static_cast(adc0); - num = 1; - - auto const diff = adc1 - adc0; - if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) { - pedestal = (pedestal + static_cast(adc1)) / 2.0; - num = 2; - } + pedestal = static_cast(adc0); + num=1; + + auto const diff = adc1 - adc0; + if (gainId1 == 1 && use_sample(sample_mask, 1) + && std::abs(diff) < 3*rms_x12[hashedId]) { + pedestal = + (pedestal + static_cast(adc1)) / 2.0; + num=2; + } } else { - pedestal = mean_x12[ch]; + pedestal = mean_x12[ch]; } // ped subtracted and gain-renormalized samples. - auto const gainId = ecal::mgpa::gainId(digis[tx]); - auto const adc = ecal::mgpa::adc(digis[tx]); + auto const gainId = ecal::mgpa::gainId(digis[inputTx]); + auto const adc = ecal::mgpa::adc(digis[inputTx]); bool bad = false; SampleVector::Scalar sample_value, sample_value_error; @@ -879,23 +991,25 @@ namespace ecal { // TODO: piece below is general both for amplitudes and timing // potentially there is a way to reduce the amount of code... 
if (!use_sample(sample_mask, sample)) { - bad = true; - sample_value = 0; - sample_value_error = 0; + bad = true; + sample_value = 0; + sample_value_error = 0; } else if (gainId == 1) { - sample_value = static_cast(adc) - pedestal; - sample_value_error = rms_x12[hashedId]; + sample_value = static_cast(adc) - pedestal; + sample_value_error = rms_x12[hashedId]; } else if (gainId == 2) { - sample_value = (static_cast(adc) - mean_x6[hashedId]) * gain12Over6[hashedId]; - sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId]; + sample_value = (static_cast(adc) + - mean_x6[hashedId]) * gain12Over6[hashedId]; + sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId]; } else if (gainId == 3) { - sample_value = (static_cast(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] * - gain12Over6[hashedId]; - sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId]; + sample_value = (static_cast(adc) + - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId]; + sample_value_error = rms_x1[hashedId] + * gain6Over1[hashedId] * gain12Over6[hashedId]; } else { - sample_value = 0; - sample_value_error = 0; - bad = true; + sample_value = 0; + sample_value_error = 0; + bad = true; } // TODO: make sure we save things correctly when sample is useless @@ -907,76 +1021,90 @@ namespace ecal { // DEBUG #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG if (ch == 0) { - printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n", - sample, - sample_value, - sample_value_error, - useless_sample ? '1' : '0'); + printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n", + sample, sample_value, sample_value_error, + useless_sample ? '1' : '0'); } #endif // store into the shared mem - shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits::min(); + shrSampleValues[threadIdx.x] = sample_value_error > 0 + ? 
sample_value + : std::numeric_limits::min(); shrSampleValueErrors[threadIdx.x] = sample_value_error; __syncthreads(); // perform the reduction with min if (sample < 5) { - // note, if equal -> we keep the value with lower sample as for cpu - shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5] - ? shrSampleValueErrors[threadIdx.x + 5] - : shrSampleValueErrors[threadIdx.x]; - shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]); + // note, if equal -> we keep the value with lower sample as for cpu + shrSampleValueErrors[threadIdx.x] = + shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+5] + ? shrSampleValueErrors[threadIdx.x+5] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = + std::max(shrSampleValues[threadIdx.x], + shrSampleValues[threadIdx.x+5]); } __syncthreads(); // a bit of an overkill, but easier than to compare across 3 values - if (sample < 3) { - shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3] - ? shrSampleValueErrors[threadIdx.x + 3] - : shrSampleValueErrors[threadIdx.x]; - shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]); + if (sample<3) { + shrSampleValueErrors[threadIdx.x] = + shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+3] + ? shrSampleValueErrors[threadIdx.x+3] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = + std::max(shrSampleValues[threadIdx.x], + shrSampleValues[threadIdx.x+3]); } __syncthreads(); if (sample < 2) { - shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2] - ? 
shrSampleValueErrors[threadIdx.x + 2] - : shrSampleValueErrors[threadIdx.x]; - shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]); + shrSampleValueErrors[threadIdx.x] = + shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+2] + ? shrSampleValueErrors[threadIdx.x+2] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = + std::max(shrSampleValues[threadIdx.x], + shrSampleValues[threadIdx.x+2]); } __syncthreads(); - + if (sample == 0) { - // we only needd the max error - auto const maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1] - ? shrSampleValueErrors[threadIdx.x + 1] - : shrSampleValueErrors[threadIdx.x]; - - // # pedestal samples used - pedestal_nums[ch] = num; - // this is used downstream - ampMaxError[ch] = maxSampleValueError; - - // DEBUG + // we only needd the max error + auto const maxSampleValueError = + shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+1] + ? 
shrSampleValueErrors[threadIdx.x+1] + : shrSampleValueErrors[threadIdx.x]; + + // # pedestal samples used + pedestal_nums[ch] = num; + // this is used downstream + ampMaxError[ch] = maxSampleValueError; + + // DEBUG #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG - if (ch == 0) { - printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError); - } + if (ch == 0) { + printf("pedestal_nums = %d ampMaxError = %f\n", + num, maxSampleValueError); + } #endif } - } } - - /// - /// launch context parameters: 1 thread per channel - /// - //#define DEBUG_TIME_CORRECTION - __global__ void kernel_time_correction_and_finalize( - // SampleVector::Scalar const* g_amplitude, +} + +/// +/// launch context parameters: 1 thread per channel +/// +//#define DEBUG_TIME_CORRECTION +__global__ +void kernel_time_correction_and_finalize( +// SampleVector::Scalar const* g_amplitude, ::ecal::reco::StorageScalarType const* g_amplitude, - uint16_t const* digis, - uint32_t const* dids, + uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, float const* amplitudeBinsEB, float const* amplitudeBinsEE, float const* shiftBinsEB, @@ -985,9 +1113,9 @@ namespace ecal { SampleVector::Scalar const* g_timeError, float const* g_rms_x12, float const* timeCalibConstant, - float* g_jitter, - float* g_jitterError, - uint32_t* flags, + float *g_jitter, + float *g_jitterError, + uint32_t *flags, int const amplitudeBinsSizeEB, int const amplitudeBinsSizeEE, ConfigurationParameters::type const timeConstantTermEB, @@ -1007,105 +1135,137 @@ namespace ecal { ConfigurationParameters::type const outOfTimeThreshG61mEB, ConfigurationParameters::type const outOfTimeThreshG61mEE, uint32_t const offsetForHashes, + uint32_t const offsetForInputs, int const nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x * blockDim.x; - - // filter out 
outside of range threads - if (gtx >= nchannels) - return; - - auto const did = DetId{dids[gtx]}; - auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); - auto const* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE; - auto const* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE; - auto const amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE; - auto const timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE; - auto const timeNconst = isBarrel ? timeNconstEB : timeNconstEE; - auto const offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE; - auto const amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE; - auto const outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE; - auto const outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE; - auto const outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE; - auto const outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE; - - // load some - auto const amplitude = g_amplitude[gtx]; - auto const rms_x12 = g_rms_x12[hashedId]; - auto const timeCalibConst = timeCalibConstant[hashedId]; - - int myBin = -1; - for (int bin = 0; bin < amplitudeBinsSize; bin++) { - if (amplitude > amplitudeBins[bin]) - myBin = bin; - else - break; - } - - ScalarType correction = 0; - if (myBin == -1) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const gtx = threadIdx.x + blockIdx.x * blockDim.x; + int const inputGtx = gtx >= offsetForInputs + ? gtx - offsetForInputs + : gtx; + auto const* dids = gtx >= offsetForInputs + ? dids_ee + : dids_eb; + auto const& digis = gtx >= offsetForInputs + ? 
digis_ee + : digis_eb; + + // filter out outside of range threads + if (gtx >= nchannels) return; + + auto const did = DetId{dids[inputGtx]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel + ? hashedIndexEB(did.rawId()) + : offsetForHashes + hashedIndexEE(did.rawId()); + auto const* amplitudeBins = isBarrel + ? amplitudeBinsEB + : amplitudeBinsEE; + auto const* shiftBins = isBarrel + ? shiftBinsEB + : shiftBinsEE; + auto const amplitudeBinsSize = isBarrel + ? amplitudeBinsSizeEB + : amplitudeBinsSizeEE; + auto const timeConstantTerm = isBarrel + ? timeConstantTermEB + : timeConstantTermEE; + auto const timeNconst = isBarrel + ? timeNconstEB + : timeNconstEE; + auto const offsetTimeValue = isBarrel + ? offsetTimeValueEB + : offsetTimeValueEE; + auto const amplitudeThreshold = isBarrel + ? amplitudeThresholdEB + : amplitudeThresholdEE; + auto const outOfTimeThreshG12p = isBarrel + ? outOfTimeThreshG12pEB + : outOfTimeThreshG12pEE; + auto const outOfTimeThreshG12m = isBarrel + ? outOfTimeThreshG12mEB + : outOfTimeThreshG12mEE; + auto const outOfTimeThreshG61p = isBarrel + ? outOfTimeThreshG61pEB + : outOfTimeThreshG61pEE; + auto const outOfTimeThreshG61m = isBarrel + ? 
outOfTimeThreshG61mEB + : outOfTimeThreshG61mEE; + + // load some + auto const amplitude = g_amplitude[gtx]; + auto const rms_x12 = g_rms_x12[hashedId]; + auto const timeCalibConst = timeCalibConstant[hashedId]; + + int myBin = -1; + for (int bin=0; bin amplitudeBins[bin]) + myBin = bin; + else + break; + } + + ScalarType correction = 0; + if (myBin == -1) { correction = shiftBins[0]; - } else if (myBin == amplitudeBinsSize - 1) { + } else if (myBin == amplitudeBinsSize-1) { correction = shiftBins[myBin]; - } else { - correction = shiftBins[myBin + 1] - shiftBins[myBin]; - correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]); + } else { + correction = shiftBins[myBin+1] - shiftBins[myBin]; + correction *= (amplitude - amplitudeBins[myBin]) / + (amplitudeBins[myBin+1] - amplitudeBins[myBin]); correction += shiftBins[myBin]; - } + } - // correction * 1./25. - correction = correction * 0.04; - auto const timeMax = g_timeMax[gtx]; - auto const timeError = g_timeError[gtx]; - auto const jitter = timeMax - 5 + correction; - auto const jitterError = - std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25. + // correction * 1./25. + correction = correction * 0.04; + auto const timeMax = g_timeMax[gtx]; + auto const timeError = g_timeError[gtx]; + auto const jitter = timeMax - 5 + correction; + auto const jitterError = std::sqrt(timeError*timeError + + timeConstantTerm*timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25. 
#ifdef DEBUG_TIME_CORRECTION - // if (gtx == 0) { - printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n", - gtx, - timeMax, - timeError, - jitter, - correction); +// if (gtx == 0) { + printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n", + gtx, timeMax, timeError, jitter, correction); // } #endif - // store back to global - g_jitter[gtx] = jitter; - g_jitterError[gtx] = jitterError; + // store back to global + g_jitter[gtx] = jitter; + g_jitterError[gtx] = jitterError; - // set the flag - // TODO: replace with something more efficient (if required), - // for now just to make it work - if (amplitude > amplitudeThreshold * rms_x12) { + // set the flag + // TODO: replace with something more efficient (if required), + // for now just to make it work + if (amplitude > amplitudeThreshold * rms_x12) { auto threshP = outOfTimeThreshG12p; auto threshM = outOfTimeThreshG12m; if (amplitude > 3000.) { - for (int isample = 0; isample < nsamples; isample++) { - int gainid = ecal::mgpa::gainId(digis[nsamples * gtx + isample]); - if (gainid != 1) { - threshP = outOfTimeThreshG61p; - threshM = outOfTimeThreshG61m; - break; + for (int isample=0; isample sigmat * threshP || correctedTime < -sigmat * threshM) - flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime; - } + auto const sigmat = std::sqrt(nterm * nterm + + timeConstantTerm*timeConstantTerm); + if (correctedTime > sigmat*threshP || + correctedTime < -sigmat*threshM) + flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime; } +} - } // namespace multifit -} // namespace ecal +}} diff --git a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h index 30f2a6f6b774d..1a5d1a96e65cd 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.h @@ -17,55 +17,60 @@ //#define ECAL_RECO_CUDA_DEBUG -namespace ecal { - namespace multifit { +namespace ecal { 
namespace multifit { - __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - bool const* useless_sample_values, - SampleVector::Scalar* chi2s, - SampleVector::Scalar* sum0s, - SampleVector::Scalar* sumAAs, - int const nchannels); - // - // launch ctx parameters are - // 45 threads per channel, X channels per block, Y blocks - // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 - // TODO: it might be much beter to use 32 threads per channel instead of 45 - // to simplify the synchronization - // - __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids, - bool const* useless_sample_values, - char const* pedestal_nums, - ConfigurationParameters::type const* amplitudeFitParametersEB, - ConfigurationParameters::type const* amplitudeFitParametersEE, - ConfigurationParameters::type const* timeFitParametersEB, - ConfigurationParameters::type const* timeFitParametersEE, - SampleVector::Scalar const* sumAAsNullHypot, - SampleVector::Scalar const* sum0sNullHypot, - SampleVector::Scalar* tMaxAlphaBetas, - SampleVector::Scalar* tMaxErrorAlphaBetas, - SampleVector::Scalar* g_accTimeMax, - SampleVector::Scalar* g_accTimeWgt, - TimeComputationState* g_state, - unsigned int const timeFitParameters_sizeEB, - unsigned int const timeFitParameters_sizeEE, - ConfigurationParameters::type const timeFitLimits_firstEB, - ConfigurationParameters::type const timeFitLimits_firstEE, - ConfigurationParameters::type const timeFitLimits_secondEB, - ConfigurationParameters::type const timeFitLimits_secondEE, - int const nchannels); +__global__ +void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + bool const* useless_sample_values, + SampleVector::Scalar* chi2s, + SampleVector::Scalar* sum0s, + 
SampleVector::Scalar* sumAAs, + int const nchannels); +// +// launch ctx parameters are +// 45 threads per channel, X channels per block, Y blocks +// 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 +// TODO: it might be much beter to use 32 threads per channel instead of 45 +// to simplify the synchronization +// +__global__ +void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + bool const* useless_sample_values, + char const* pedestal_nums, + ConfigurationParameters::type const* amplitudeFitParametersEB, + ConfigurationParameters::type const* amplitudeFitParametersEE, + ConfigurationParameters::type const* timeFitParametersEB, + ConfigurationParameters::type const* timeFitParametersEE, + SampleVector::Scalar const* sumAAsNullHypot, + SampleVector::Scalar const* sum0sNullHypot, + SampleVector::Scalar* tMaxAlphaBetas, + SampleVector::Scalar* tMaxErrorAlphaBetas, + SampleVector::Scalar* g_accTimeMax, + SampleVector::Scalar* g_accTimeWgt, + TimeComputationState* g_state, + unsigned int const timeFitParameters_sizeEB, + unsigned int const timeFitParameters_sizeEE, + ConfigurationParameters::type const timeFitLimits_firstEB, + ConfigurationParameters::type const timeFitLimits_firstEE, + ConfigurationParameters::type const timeFitLimits_secondEB, + ConfigurationParameters::type const timeFitLimits_secondEE, + int const nchannels, + uint32_t const offsetForInputs); - /// launch ctx parameters are - /// 10 threads per channel, N channels per block, Y blocks - /// TODO: do we need to keep the state around or can be removed?! - //#define DEBUG_FINDAMPLCHI2_AND_FINISH - __global__ void kernel_time_compute_findamplchi2_and_finish( +/// launch ctx parameters are +/// 10 threads per channel, N channels per block, Y blocks +/// TODO: do we need to keep the state around or can be removed?! 
+//#define DEBUG_FINDAMPLCHI2_AND_FINISH +__global__ +void kernel_time_compute_findamplchi2_and_finish( SampleVector::Scalar const* sample_values, SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids, + uint32_t const* dids_eb, + uint32_t const* dids_ee, bool const* useless_samples, SampleVector::Scalar const* g_tMaxAlphaBeta, SampleVector::Scalar const* g_tMaxErrorAlphaBeta, @@ -81,55 +86,69 @@ namespace ecal { SampleVector::Scalar* g_ampMaxError, SampleVector::Scalar* g_timeMax, SampleVector::Scalar* g_timeError, - int const nchannels); + int const nchannels, + uint32_t const offsetForInputs); - __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis, - SampleVector::Scalar* sample_values, - SampleVector::Scalar* sample_value_errors, - bool* useless_sample_values, - unsigned int const sample_mask, - int const nchannels); +__global__ +void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb, + uint16_t const* digis_ee, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + bool* useless_sample_values, + unsigned int const sample_mask, + int const nchannels, + uint32_t const offsetForInputs); - __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids, - bool const* useless_samples, - SampleVector::Scalar const* g_timeMax, - SampleVector::Scalar const* amplitudeFitParametersEB, - SampleVector::Scalar const* amplitudeFitParametersEE, - SampleVector::Scalar* g_amplitudeMax, - int const nchannels); +__global__ +void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ed, + bool const* useless_samples, + SampleVector::Scalar const* g_timeMax, + SampleVector::Scalar const* amplitudeFitParametersEB, + SampleVector::Scalar const* amplitudeFitParametersEE, + SampleVector::Scalar 
*g_amplitudeMax, + int const nchannels, + uint32_t const offsetForInputs); - //#define ECAL_RECO_CUDA_TC_INIT_DEBUG - __global__ void kernel_time_computation_init(uint16_t const* digis, - uint32_t const* dids, - float const* rms_x12, - float const* rms_x6, - float const* rms_x1, - float const* mean_x12, - float const* mean_x6, - float const* mean_x1, - float const* gain12Over6, - float const* gain6Over1, - SampleVector::Scalar* sample_values, - SampleVector::Scalar* sample_value_errors, - SampleVector::Scalar* ampMaxError, - bool* useless_sample_values, - char* pedestal_nums, - uint32_t const offsetForHashes, - unsigned int const sample_maskEB, - unsigned int const sample_maskEE, - int nchannels); +//#define ECAL_RECO_CUDA_TC_INIT_DEBUG +__global__ +void kernel_time_computation_init(uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* mean_x12, + float const* mean_x6, + float const* mean_x1, + float const* gain12Over6, + float const* gain6Over1, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + SampleVector::Scalar* ampMaxError, + bool* useless_sample_values, + char* pedestal_nums, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + unsigned int const sample_maskEB, + unsigned int const sample_maskEE, + int nchannels); - /// - /// launch context parameters: 1 thread per channel - /// - //#define DEBUG_TIME_CORRECTION - __global__ void kernel_time_correction_and_finalize( - // SampleVector::Scalar const* g_amplitude, +/// +/// launch context parameters: 1 thread per channel +/// +//#define DEBUG_TIME_CORRECTION +__global__ +void kernel_time_correction_and_finalize( +// SampleVector::Scalar const* g_amplitude, ::ecal::reco::StorageScalarType const* g_amplitude, - uint16_t const* digis, - uint32_t const* dids, + uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t 
const* digis_ee, + uint32_t const* dids_ee, float const* amplitudeBinsEB, float const* amplitudeBinsEE, float const* shiftBinsEB, @@ -138,9 +157,9 @@ namespace ecal { SampleVector::Scalar const* g_timeError, float const* g_rms_x12, float const* timeCalibConstant, - ::ecal::reco::StorageScalarType* g_jitter, - ::ecal::reco::StorageScalarType* g_jitterError, - uint32_t* flags, + ::ecal::reco::StorageScalarType *g_jitter, + ::ecal::reco::StorageScalarType *g_jitterError, + uint32_t *flags, int const amplitudeBinsSizeEB, int const amplitudeBinsSizeEE, ConfigurationParameters::type const timeConstantTermEB, @@ -160,9 +179,9 @@ namespace ecal { ConfigurationParameters::type const outOfTimeThreshG61mEB, ConfigurationParameters::type const outOfTimeThreshG61mEE, uint32_t const offsetForHashes, + uint32_t const offsetForInputs, int const nchannels); - } // namespace multifit -} // namespace ecal +}} -#endif // RecoLocalCalo_EcalRecAlgos_src_TimeComputationKernels +#endif // RecoLocalCalo_EcalRecAlgos_src_TimeComputationKernels diff --git a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu index 98f2899876d43..f657981b95fa0 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu @@ -1,122 +1,120 @@ #include "inplace_fnnls.h" -namespace ecal { - namespace multifit { - - using matrix_t = SampleMatrix; - using vector_t = SampleVector; - - __device__ bool inplace_fnnls(matrix_t const& A, - vector_t const& b, - vector_t& x, - int& npassive, - BXVectorType& activeBXs, - PulseMatrixType& pulse_matrix, - const double eps, - const unsigned int max_iterations) { - matrix_t AtA = A.transpose() * A; - vector_t Atb = A.transpose() * b; - vector_t s; - vector_t w; - - // main loop - Eigen::Index w_max_idx_prev = 0; - matrix_t::Scalar w_max_prev = 0; - double eps_to_use = eps; - - int iter = 0; - while (true) { - if (iter > 0 || npassive == 0) { - const auto nActive = 
vector_t::RowsAtCompileTime - npassive; - if (!nActive) +namespace ecal { namespace multifit { + +using matrix_t = SampleMatrix; +using vector_t = SampleVector; + +__device__ +bool inplace_fnnls(matrix_t& AtA, + vector_t& Atb, + vector_t& x, + int& npassive, + BXVectorType& activeBXs, + PulseMatrixType& pulse_matrix, + const double eps, + const unsigned int max_iterations) { + vector_t s; + vector_t w; + +// main loop + Eigen::Index w_max_idx_prev = 0; + matrix_t::Scalar w_max_prev = 0; + double eps_to_use = eps; + + int iter = 0; + while (true) { + if (iter>0 || npassive==0) { + const auto nActive = vector_t::RowsAtCompileTime - npassive; + if(!nActive) + break; + + w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive); + + // get the index of w that gives the maximum gain + Eigen::Index w_max_idx; + const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx); + + // check for convergence + if (max_w < eps_to_use || (w_max_idx==w_max_idx_prev && max_w==w_max_prev)) + break; + + // worst case + if (iter >= 500) break; - w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive); + w_max_prev = max_w; + w_max_idx_prev = w_max_idx; - // get the index of w that gives the maximum gain - Eigen::Index w_max_idx; - const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx); + // need to translate the index into the right part of the vector + w_max_idx += npassive; - // check for convergence - if (max_w < eps_to_use || (w_max_idx == w_max_idx_prev && max_w == w_max_prev)) - break; - - // worst case - if (iter >= 500) - break; - - w_max_prev = max_w; - w_max_idx_prev = w_max_idx; - - // need to translate the index into the right part of the vector - w_max_idx += npassive; - - // swap AtA to avoid copy - AtA.col(npassive).swap(AtA.col(w_max_idx)); - AtA.row(npassive).swap(AtA.row(w_max_idx)); - // swap Atb to match with AtA - Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx)); - Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx)); - 
Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx)); - pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx)); + // swap AtA to avoid copy + AtA.col(npassive).swap(AtA.col(w_max_idx)); + AtA.row(npassive).swap(AtA.row(w_max_idx)); + // swap Atb to match with AtA + Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx)); + Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx)); + Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx)); + pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx)); - ++npassive; - } + ++npassive; + } - // inner loop - while (true) { - if (npassive == 0) - break; +// inner loop + while (true) { + if (npassive == 0) break; - s.head(npassive) = AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive)); + s.head(npassive) = + AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive)); - // if all coefficients are positive, done for this iteration - if (s.head(npassive).minCoeff() > 0.) { - x.head(npassive) = s.head(npassive); - break; - } + // if all coefficients are positive, done for this iteration + if (s.head(npassive).minCoeff() > 0.) { + x.head(npassive) = s.head(npassive); + break; + } - auto alpha = std::numeric_limits::max(); - Eigen::Index alpha_idx = 0; + auto alpha = std::numeric_limits::max(); + Eigen::Index alpha_idx = 0; #pragma unroll - for (auto i = 0; i < npassive; ++i) { - if (s[i] <= 0.) { - auto const ratio = x[i] / (x[i] - s[i]); - if (ratio < alpha) { - alpha = ratio; - alpha_idx = i; - } - } + for (auto i = 0; i < npassive; ++i) { + if (s[i] <= 0.) 
{ + auto const ratio = x[i] / (x[i] - s[i]); + if (ratio < alpha) { + alpha = ratio; + alpha_idx = i; } + } + } - /* + /* if (std::numeric_limits::max() == alpha) { x.head(npassive) = s.head(npassive); break; }*/ - x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive)); - x[alpha_idx] = 0; - --npassive; - - AtA.col(npassive).swap(AtA.col(alpha_idx)); - AtA.row(npassive).swap(AtA.row(alpha_idx)); - // swap Atb to match with AtA - Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx)); - Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx)); - Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(alpha_idx)); - pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx)); - } - - // TODO as in cpu NNLS version - iter++; - if (iter % 16 == 0) - eps_to_use *= 2; - } - - return true; + x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive)); + x[alpha_idx] = 0; + --npassive; + + AtA.col(npassive).swap(AtA.col(alpha_idx)); + AtA.row(npassive).swap(AtA.row(alpha_idx)); + // swap Atb to match with AtA + Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx)); + Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx)); + Eigen::numext::swap(activeBXs.coeffRef(npassive), + activeBXs.coeffRef(alpha_idx)); + pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx)); } - } // namespace multifit -} // namespace ecal + // TODO as in cpu NNLS version + iter++; + if (iter % 16 == 0) + eps_to_use *= 2; + } + + return true; +} + +}} diff --git a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h index 54805a3ab941c..9cda75008cc32 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h +++ b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.h @@ -3,22 +3,22 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h" -namespace ecal { - namespace multifit { +namespace ecal { namespace multifit { - using matrix_t = SampleMatrix; - using vector_t 
= SampleVector; +using matrix_t = SampleMatrix; +using vector_t = SampleVector; - __device__ bool inplace_fnnls(matrix_t const& A, - vector_t const& b, - vector_t& x, - int& npassive, - BXVectorType& activeBXs, - PulseMatrixType& pulse_matrix, - const double eps = 1e-11, - const unsigned int max_iterations = 500); +__device__ +bool +inplace_fnnls(matrix_t& AtA, + vector_t& Atb, + vector_t& x, + int& npassive, + BXVectorType& activeBXs, + PulseMatrixType& pulse_matrix, + const double eps = 1e-11, + const unsigned int max_iterations = 500); - } // namespace multifit -} // namespace ecal +}} #endif From 1fea2b70f473163f84c8d9b11243d0bd8856980e Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Mon, 30 Mar 2020 16:31:58 +0200 Subject: [PATCH 05/30] ecal reco producers adapted for 111x --- .../plugins/EcalCPUUncalibRecHitProducer.cc | 200 +++++ .../plugins/EcalESProducerGPU.h | 47 +- .../plugins/EcalESProducersGPUDefs.cc | 35 +- .../EcalUncalibRecHitConvertGPU2CPUFormat.cc | 144 ++-- .../plugins/EcalUncalibRecHitProducerGPU.cc | 731 ++++++++---------- 5 files changed, 658 insertions(+), 499 deletions(-) create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc new file mode 100644 index 0000000000000..9c531d7060525 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc @@ -0,0 +1,200 @@ +#include + +// framework +#include "FWCore/Framework/interface/stream/EDProducer.h" +//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" +//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include 
"FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" + +// algorithm specific + +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" + +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" + +class EcalCPUUncalibRecHitProducer + : public edm::stream::EDProducer +{ +public: + explicit EcalCPUUncalibRecHitProducer(edm::ParameterSet const& ps); + ~EcalCPUUncalibRecHitProducer() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, + edm::EventSetup const&, + edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + +private: + edm::EDGetTokenT>> + recHitsInEBToken_, recHitsInEEToken_; + edm::EDPutTokenT> + recHitsOutEBToken_, recHitsOutEEToken_; + + ecal::UncalibratedRecHit + recHitsEB_, recHitsEE_; + bool containsTimingInformation_; +}; + +void EcalCPUUncalibRecHitProducer::fillDescriptions( + edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("recHitsInLabelEB", + edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"}); + desc.add("recHitsInLabelEE", + edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"}); + desc.add("recHitsOutLabelEB", "EcalUncalibRecHitsEB"); + desc.add("recHitsOutLabelEE", "EcalUncalibRecHitsEE"); + desc.add("containsTimingInformation", false); + + std::string label = "ecalCPUUncalibRecHitProducer"; + confDesc.add(label, desc); +} + +EcalCPUUncalibRecHitProducer::EcalCPUUncalibRecHitProducer( + const edm::ParameterSet& ps) + : recHitsInEBToken_{consumes>>( + ps.getParameter("recHitsInLabelEB"))} + , recHitsInEEToken_{consumes>>( + ps.getParameter("recHitsInLabelEE"))} + , recHitsOutEBToken_{produces>( + ps.getParameter("recHitsOutLabelEB"))} + , recHitsOutEEToken_{produces>( + ps.getParameter("recHitsOutLabelEE"))} + , 
containsTimingInformation_{ps.getParameter("containsTimingInformation")} +{} + +EcalCPUUncalibRecHitProducer::~EcalCPUUncalibRecHitProducer() {} + +void EcalCPUUncalibRecHitProducer::acquire( + edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder taskHolder) +{ + // retrieve data/ctx + auto const& ebRecHitsProduct = event.get(recHitsInEBToken_); + auto const& eeRecHitsProduct = event.get(recHitsInEEToken_); + cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)}; + auto const& ebRecHits = ctx.get(ebRecHitsProduct); + auto const& eeRecHits = ctx.get(eeRecHitsProduct); + + // resize the output buffers + recHitsEB_.resize(ebRecHits.size); + recHitsEE_.resize(eeRecHits.size); + + // enqeue transfers + cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(), + ebRecHits.did, + recHitsEB_.did.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(), + eeRecHits.did, + recHitsEE_.did.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + + cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitudesAll.data(), + ebRecHits.amplitudesAll, + recHitsEB_.amplitudesAll.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitudesAll.data(), + eeRecHits.amplitudesAll, + recHitsEE_.amplitudesAll.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + + cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitude.data(), + ebRecHits.amplitude, + recHitsEB_.amplitude.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitude.data(), + eeRecHits.amplitude, + recHitsEE_.amplitude.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + + cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(), + ebRecHits.chi2, + recHitsEB_.chi2.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( 
cudaMemcpyAsync(recHitsEE_.chi2.data(), + eeRecHits.chi2, + recHitsEE_.chi2.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + + cudaCheck( cudaMemcpyAsync(recHitsEB_.pedestal.data(), + ebRecHits.pedestal, + recHitsEB_.pedestal.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(recHitsEE_.pedestal.data(), + eeRecHits.pedestal, + recHitsEE_.pedestal.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + + cudaCheck( cudaMemcpyAsync(recHitsEB_.flags.data(), + ebRecHits.flags, + recHitsEB_.flags.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(recHitsEE_.flags.data(), + eeRecHits.flags, + recHitsEE_.flags.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + + if (containsTimingInformation_) { + cudaCheck( cudaMemcpyAsync(recHitsEB_.jitter.data(), + ebRecHits.jitter, + recHitsEB_.jitter.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(recHitsEE_.jitter.data(), + eeRecHits.jitter, + recHitsEE_.jitter.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + + cudaCheck( cudaMemcpyAsync(recHitsEB_.jitterError.data(), + ebRecHits.jitterError, + recHitsEB_.jitterError.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + cudaCheck( cudaMemcpyAsync(recHitsEE_.jitterError.data(), + eeRecHits.jitterError, + recHitsEE_.jitterError.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream()) ); + } +} + +void EcalCPUUncalibRecHitProducer::produce( + edm::Event& event, + edm::EventSetup const& setup) +{ + // tmp vectors + auto recHitsOutEB = std::make_unique>( + std::move(recHitsEB_)); + auto recHitsOutEE = std::make_unique>( + std::move(recHitsEE_)); + + // put into event + event.put(recHitsOutEBToken_, std::move(recHitsOutEB)); + event.put(recHitsOutEEToken_, std::move(recHitsOutEE)); +} + 
+DEFINE_FWK_MODULE(EcalCPUUncalibRecHitProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h index 7c8bfb86dba24..b1509d593f67f 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducerGPU.h @@ -11,33 +11,34 @@ #include -template +template class EcalESProducerGPU : public edm::ESProducer { public: - explicit EcalESProducerGPU(edm::ParameterSet const& ps) : label_{ps.getParameter("label")} { - std::string name = ps.getParameter("ComponentName"); - setWhatProduced(this, name); - } - - std::unique_ptr produce(Record const& record) { - // retrieve conditions in old format - edm::ESTransientHandle product; - record.get(label_, product); - - return std::make_unique(*product); - } - - static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; - - std::string label = Target::name() + "ESProducer"; - desc.add("ComponentName", ""); - desc.add("label", "")->setComment("Product Label"); - confDesc.add(label, desc); - } + explicit EcalESProducerGPU(edm::ParameterSet const& ps) { + auto const label = ps.getParameter("label"); + std::string name = ps.getParameter("ComponentName"); + auto cc = setWhatProduced(this, name); + cc.setConsumes(token_, edm::ESInputTag{"", label}); + } + + std::unique_ptr produce(Record const& record) { + // retrieve conditions in old format + auto sourceProduct = record.getTransientHandle(token_); + + return std::make_unique(*sourceProduct); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + std::string label = Target::name() + "ESProducer"; + desc.add("ComponentName", ""); + desc.add("label", "")->setComment("Product Label"); + confDesc.add(label, desc); + } private: - std::string label_; + edm::ESGetToken token_; }; #endif diff --git 
a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc index c851bf24c0e40..24b782b7b434d 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc @@ -18,19 +18,32 @@ #include -using EcalPedestalsGPUESProducer = EcalESProducerGPU; -using EcalGainRatiosGPUESProducer = EcalESProducerGPU; -using EcalPulseShapesGPUESProducer = EcalESProducerGPU; -using EcalPulseCovariancesGPUESProducer = - EcalESProducerGPU; -using EcalSamplesCorrelationGPUESProducer = - EcalESProducerGPU; +using EcalPedestalsGPUESProducer = EcalESProducerGPU; +using EcalGainRatiosGPUESProducer = EcalESProducerGPU; +using EcalPulseShapesGPUESProducer = EcalESProducerGPU; +using EcalPulseCovariancesGPUESProducer = EcalESProducerGPU; +using EcalSamplesCorrelationGPUESProducer = EcalESProducerGPU< + EcalSamplesCorrelationGPU, + EcalSamplesCorrelation, + EcalSamplesCorrelationRcd>; -using EcalTimeBiasCorrectionsGPUESProducer = - EcalESProducerGPU; +using EcalTimeBiasCorrectionsGPUESProducer = EcalESProducerGPU< + EcalTimeBiasCorrectionsGPU, + EcalTimeBiasCorrections, + EcalTimeBiasCorrectionsRcd>; -using EcalTimeCalibConstantsGPUESProducer = - EcalESProducerGPU; +using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU< + EcalTimeCalibConstantsGPU, + EcalTimeCalibConstants, + EcalTimeCalibConstantsRcd>; DEFINE_FWK_EVENTSETUP_MODULE(EcalPedestalsGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalGainRatiosGPUESProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc index 20f51ea5245df..916230516f070 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc @@ -3,7 +3,7 @@ #include 
"FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" // algorithm specific #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" @@ -14,87 +14,103 @@ #include -class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> { +class EcalUncalibRecHitConvertGPU2CPUFormat + : public edm::stream::EDProducer<> +{ public: - explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps); - ~EcalUncalibRecHitConvertGPU2CPUFormat() override; - static void fillDescriptions(edm::ConfigurationDescriptions&); + explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps); + ~EcalUncalibRecHitConvertGPU2CPUFormat() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); private: - using GPURecHitType = ecal::UncalibratedRecHit; - void produce(edm::Event&, edm::EventSetup const&) override; + using GPURecHitType = ecal::UncalibratedRecHit; + void produce(edm::Event&, edm::EventSetup const&) override; private: - const edm::EDGetTokenT recHitsGPUEB_; - const edm::EDGetTokenT recHitsGPUEE_; + const edm::EDGetTokenT recHitsGPUEB_; + const edm::EDGetTokenT recHitsGPUEE_; - const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_; + const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_; }; -void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; +void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions( + edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; - desc.add("recHitsLabelGPUEB", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB")); - desc.add("recHitsLabelGPUEE", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE")); + desc.add("recHitsLabelGPUEB", + 
edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB")); + desc.add("recHitsLabelGPUEE", + edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE")); - desc.add("recHitsLabelCPUEB", "EcalUncalibRecHitsEB"); - desc.add("recHitsLabelCPUEE", "EcalUncalibRecHitsEE"); + desc.add("recHitsLabelCPUEB", "EcalUncalibRecHitsEB"); + desc.add("recHitsLabelCPUEE", "EcalUncalibRecHitsEE"); - std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat"; - confDesc.add(label, desc); + std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat"; + confDesc.add(label, desc); } -EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps) +EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat( + const edm::ParameterSet& ps) : recHitsGPUEB_{consumes( - ps.getParameter("recHitsLabelGPUEB"))}, - recHitsGPUEE_{ - consumes(ps.getParameter("recHitsLabelGPUEE"))}, - recHitsLabelCPUEB_{ps.getParameter("recHitsLabelCPUEB")}, - recHitsLabelCPUEE_{ps.getParameter("recHitsLabelCPUEE")} { - produces(recHitsLabelCPUEB_); - produces(recHitsLabelCPUEE_); + ps.getParameter("recHitsLabelGPUEB"))} + , recHitsGPUEE_{consumes( + ps.getParameter("recHitsLabelGPUEE"))} + , recHitsLabelCPUEB_{ps.getParameter("recHitsLabelCPUEB")} + , recHitsLabelCPUEE_{ps.getParameter("recHitsLabelCPUEE")} +{ + produces(recHitsLabelCPUEB_); + produces(recHitsLabelCPUEE_); } EcalUncalibRecHitConvertGPU2CPUFormat::~EcalUncalibRecHitConvertGPU2CPUFormat() {} -void EcalUncalibRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) { - edm::Handle hRecHitsGPUEB, hRecHitsGPUEE; - event.getByToken(recHitsGPUEB_, hRecHitsGPUEB); - event.getByToken(recHitsGPUEE_, hRecHitsGPUEE); - - auto recHitsCPUEB = std::make_unique(); - auto recHitsCPUEE = std::make_unique(); - recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size()); - recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size()); - - for (uint32_t i = 0; i < 
hRecHitsGPUEB->amplitude.size(); ++i) { - recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]}, - hRecHitsGPUEB->amplitude[i], - hRecHitsGPUEB->pedestal[i], - hRecHitsGPUEB->jitter[i], - hRecHitsGPUEB->chi2[i], - hRecHitsGPUEB->flags[i]); - (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]); - auto const offset = i * EcalDataFrame::MAXSAMPLES; - for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample) - (*recHitsCPUEB)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEB->amplitudesAll[offset + sample]); - } - - for (uint32_t i = 0; i < hRecHitsGPUEE->amplitude.size(); ++i) { - recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]}, - hRecHitsGPUEE->amplitude[i], - hRecHitsGPUEE->pedestal[i], - hRecHitsGPUEE->jitter[i], - hRecHitsGPUEE->chi2[i], - hRecHitsGPUEE->flags[i]); - (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]); - auto const offset = i * EcalDataFrame::MAXSAMPLES; - for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample) - (*recHitsCPUEE)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEE->amplitudesAll[offset + sample]); - } - - event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_); - event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_); +void EcalUncalibRecHitConvertGPU2CPUFormat::produce( + edm::Event& event, + edm::EventSetup const& setup) +{ + edm::Handle hRecHitsGPUEB, hRecHitsGPUEE; + event.getByToken(recHitsGPUEB_, hRecHitsGPUEB); + event.getByToken(recHitsGPUEE_, hRecHitsGPUEE); + + auto recHitsCPUEB = std::make_unique(); + auto recHitsCPUEE = std::make_unique(); + recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size()); + recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size()); + + for (uint32_t i=0; iamplitude.size(); ++i) { + recHitsCPUEB->emplace_back( + DetId{hRecHitsGPUEB->did[i]}, + hRecHitsGPUEB->amplitude[i], + hRecHitsGPUEB->pedestal[i], + hRecHitsGPUEB->jitter[i], + hRecHitsGPUEB->chi2[i], + hRecHitsGPUEB->flags[i] + ); + 
(*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]); + auto const offset = i * EcalDataFrame::MAXSAMPLES; + for (uint32_t sample=0; sampleamplitudesAll[offset + sample]); + } + + for (uint32_t i=0; iamplitude.size(); ++i) { + recHitsCPUEE->emplace_back( + DetId{hRecHitsGPUEE->did[i]}, + hRecHitsGPUEE->amplitude[i], + hRecHitsGPUEE->pedestal[i], + hRecHitsGPUEE->jitter[i], + hRecHitsGPUEE->chi2[i], + hRecHitsGPUEE->flags[i] + ); + (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]); + auto const offset = i * EcalDataFrame::MAXSAMPLES; + for (uint32_t sample=0; sampleamplitudesAll[offset + sample]); + } + + event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_); + event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_); } DEFINE_FWK_MODULE(EcalUncalibRecHitConvertGPU2CPUFormat); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc index a90cc1536c482..a96b729223d01 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc @@ -8,7 +8,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" // algorithm specific #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" @@ -40,426 +40,355 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h" -class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer { +class EcalUncalibRecHitProducerGPU + : public edm::stream::EDProducer +{ public: - explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps); - ~EcalUncalibRecHitProducerGPU() override; - static void 
fillDescriptions(edm::ConfigurationDescriptions&); + explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps); + ~EcalUncalibRecHitProducerGPU() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); private: - using RecHitType = ecal::UncalibratedRecHit; - void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; - void produce(edm::Event&, edm::EventSetup const&) override; - - void transferToHost(RecHitType& ebRecHits, RecHitType& eeRecHits, cudaStream_t cudaStream); + using RecHitType = ecal::UncalibratedRecHit; + void acquire(edm::Event const&, + edm::EventSetup const&, + edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; private: - edm::EDGetTokenT digisTokenEB_; - edm::EDGetTokenT digisTokenEE_; - - std::string recHitsLabelEB_, recHitsLabelEE_; - - // conditions handles - edm::ESHandle pedestalsHandle_; - edm::ESHandle gainRatiosHandle_; - edm::ESHandle pulseShapesHandle_; - edm::ESHandle pulseCovariancesHandle_; - edm::ESHandle samplesCorrelationHandle_; - edm::ESHandle timeBiasCorrectionsHandle_; - edm::ESHandle timeCalibConstantsHandle_; - edm::ESHandle sampleMaskHandle_; - edm::ESHandle timeOffsetConstantHandle_; - - // configuration parameters - ecal::multifit::ConfigurationParameters configParameters_; + edm::EDGetTokenT> digisTokenEB_, digisTokenEE_; + edm::EDPutTokenT>> + recHitsTokenEB_, recHitsTokenEE_; + + // conditions handles + edm::ESHandle pedestalsHandle_; + edm::ESHandle gainRatiosHandle_; + edm::ESHandle pulseShapesHandle_; + edm::ESHandle pulseCovariancesHandle_; + edm::ESHandle samplesCorrelationHandle_; + edm::ESHandle timeBiasCorrectionsHandle_; + edm::ESHandle timeCalibConstantsHandle_; + edm::ESHandle sampleMaskHandle_; + edm::ESHandle timeOffsetConstantHandle_; - // event data - ecal::multifit::EventInputDataGPU eventInputDataGPU_; - ecal::multifit::EventOutputDataGPU eventOutputDataGPU_; - 
ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_; - bool shouldTransferToHost_{true}; + // configuration parameters + ecal::multifit::ConfigurationParameters configParameters_; - cms::cuda::ContextState cudaState_; + // event data + ecal::multifit::EventOutputDataGPU eventOutputDataGPU_; + ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_; + bool shouldTransferToHost_{true}; - std::unique_ptr> ebRecHits_{nullptr}, eeRecHits_{nullptr}; + cms::cuda::ContextState cudaState_; - uint32_t maxNumberHits_; + uint32_t maxNumberHits_; + uint32_t neb_, nee_; }; -void EcalUncalibRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; - - desc.add("digisLabelEB", edm::InputTag("ecalDigis", "ebDigis")); - desc.add("digisLabelEE", edm::InputTag("ecalDigis", "eeDigis")); - - desc.add("recHitsLabelEB", "EcalUncalibRecHitsEB"); - desc.add("recHitsLabelEE", "EcalUncalibRecHitsEE"); - - desc.add>("EBtimeFitParameters", - {-2.015452e+00, - 3.130702e+00, - -1.234730e+01, - 4.188921e+01, - -8.283944e+01, - 9.101147e+01, - -5.035761e+01, - 1.105621e+01}); - desc.add>("EEtimeFitParameters", - {-2.390548e+00, - 3.553628e+00, - -1.762341e+01, - 6.767538e+01, - -1.332130e+02, - 1.407432e+02, - -7.541106e+01, - 1.620277e+01}); - desc.add>("EBamplitudeFitParameters", {1.138, 1.652}); - desc.add>("EEamplitudeFitParameters", {1.890, 1.400}); - desc.add("EBtimeFitLimits_Lower", 0.2); - desc.add("EBtimeFitLimits_Upper", 1.4); - desc.add("EEtimeFitLimits_Lower", 0.2); - desc.add("EEtimeFitLimits_Upper", 1.4); - desc.add("EBtimeConstantTerm", .6); - desc.add("EEtimeConstantTerm", 1.0); - desc.add("EBtimeNconst", 28.5); - desc.add("EEtimeNconst", 31.8); - desc.add("outOfTimeThresholdGain12pEB", 5); - desc.add("outOfTimeThresholdGain12mEB", 5); - desc.add("outOfTimeThresholdGain61pEB", 5); - desc.add("outOfTimeThresholdGain61mEB", 5); - desc.add("outOfTimeThresholdGain12pEE", 1000); - 
desc.add("outOfTimeThresholdGain12mEE", 1000); - desc.add("outOfTimeThresholdGain61pEE", 1000); - desc.add("outOfTimeThresholdGain61mEE", 1000); - desc.add("amplitudeThresholdEB", 10); - desc.add("amplitudeThresholdEE", 10); - desc.add("maxNumberHits", 20000); //---- AM TEST - desc.add("shouldTransferToHost", true); - desc.add>("kernelMinimizeThreads", {32, 1, 1}); - // ---- default false or true? It was set to true, but at HLT it is false - desc.add("shouldRunTimingComputation", false); - std::string label = "ecalUncalibRecHitProducerGPU"; - confDesc.add(label, desc); +void EcalUncalibRecHitProducerGPU::fillDescriptions( + edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("digisLabelEB", + edm::InputTag("ecalRawToDigiGPU", "ebDigisGPU")); + desc.add("digisLabelEE", + edm::InputTag("ecalRawToDigiGPU", "eeDigisGPU")); + + desc.add("recHitsLabelEB", "EcalUncalibRecHitsEB"); + desc.add("recHitsLabelEE", "EcalUncalibRecHitsEE"); + + desc.add>("EBtimeFitParameters", + {-2.015452e+00, 3.130702e+00, -1.234730e+01, 4.188921e+01, -8.283944e+01, + 9.101147e+01, -5.035761e+01, 1.105621e+01}); + desc.add>("EEtimeFitParameters", + {-2.390548e+00, 3.553628e+00, -1.762341e+01, 6.767538e+01, -1.332130e+02, + 1.407432e+02, -7.541106e+01, 1.620277e+01}); + desc.add>("EBamplitudeFitParameters", {1.138,1.652}); + desc.add>("EEamplitudeFitParameters", {1.890,1.400}); + desc.add("EBtimeFitLimits_Lower", 0.2); + desc.add("EBtimeFitLimits_Upper", 1.4); + desc.add("EEtimeFitLimits_Lower", 0.2); + desc.add("EEtimeFitLimits_Upper", 1.4); + desc.add("EBtimeConstantTerm", .6); + desc.add("EEtimeConstantTerm", 1.0); + desc.add("EBtimeNconst", 28.5); + desc.add("EEtimeNconst", 31.8); + desc.add("outOfTimeThresholdGain12pEB", 5); + desc.add("outOfTimeThresholdGain12mEB", 5); + desc.add("outOfTimeThresholdGain61pEB", 5); + desc.add("outOfTimeThresholdGain61mEB", 5); + desc.add("outOfTimeThresholdGain12pEE", 1000); + 
desc.add("outOfTimeThresholdGain12mEE", 1000); + desc.add("outOfTimeThresholdGain61pEE", 1000); + desc.add("outOfTimeThresholdGain61mEE", 1000); + desc.add("amplitudeThresholdEB", 10); + desc.add("amplitudeThresholdEE", 10); + desc.add("maxNumberHits", 20000); //---- AM TEST + desc.add("shouldTransferToHost", true); + desc.add>("kernelMinimizeThreads", {32, 1, 1}); + // ---- default false or true? It was set to true, but at HLT it is false + desc.add("shouldRunTimingComputation", false); + std::string label = "ecalUncalibRecHitProducerGPU"; + confDesc.add(label, desc); } -EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(const edm::ParameterSet& ps) { - digisTokenEB_ = consumes(ps.getParameter("digisLabelEB")); - digisTokenEE_ = consumes(ps.getParameter("digisLabelEE")); - - recHitsLabelEB_ = ps.getParameter("recHitsLabelEB"); - recHitsLabelEE_ = ps.getParameter("recHitsLabelEE"); - - auto EBamplitudeFitParameters = ps.getParameter>("EBamplitudeFitParameters"); - auto EEamplitudeFitParameters = ps.getParameter>("EEamplitudeFitParameters"); - auto EBtimeFitParameters = ps.getParameter>("EBtimeFitParameters"); - auto EEtimeFitParameters = ps.getParameter>("EEtimeFitParameters"); - std::pair EBtimeFitLimits, EEtimeFitLimits; - EBtimeFitLimits.first = ps.getParameter("EBtimeFitLimits_Lower"); - EBtimeFitLimits.second = ps.getParameter("EBtimeFitLimits_Upper"); - EEtimeFitLimits.first = ps.getParameter("EEtimeFitLimits_Lower"); - EEtimeFitLimits.second = ps.getParameter("EEtimeFitLimits_Upper"); - - auto EBtimeConstantTerm = ps.getParameter("EBtimeConstantTerm"); - auto EEtimeConstantTerm = ps.getParameter("EEtimeConstantTerm"); - auto EBtimeNconst = ps.getParameter("EBtimeNconst"); - auto EEtimeNconst = ps.getParameter("EEtimeNconst"); - - auto outOfTimeThreshG12pEB = ps.getParameter("outOfTimeThresholdGain12pEB"); - auto outOfTimeThreshG12mEB = ps.getParameter("outOfTimeThresholdGain12mEB"); - auto outOfTimeThreshG61pEB = 
ps.getParameter("outOfTimeThresholdGain61pEB"); - auto outOfTimeThreshG61mEB = ps.getParameter("outOfTimeThresholdGain61mEB"); - auto outOfTimeThreshG12pEE = ps.getParameter("outOfTimeThresholdGain12pEE"); - auto outOfTimeThreshG12mEE = ps.getParameter("outOfTimeThresholdGain12mEE"); - auto outOfTimeThreshG61pEE = ps.getParameter("outOfTimeThresholdGain61pEE"); - auto outOfTimeThreshG61mEE = ps.getParameter("outOfTimeThresholdGain61mEE"); - auto amplitudeThreshEB = ps.getParameter("amplitudeThresholdEB"); - auto amplitudeThreshEE = ps.getParameter("amplitudeThresholdEE"); - - // max number of digis to allocate for - maxNumberHits_ = ps.getParameter("maxNumberHits"); - - // transfer to host switch - shouldTransferToHost_ = ps.getParameter("shouldTransferToHost"); - - // switch to run timing computation kernels - configParameters_.shouldRunTimingComputation = ps.getParameter("shouldRunTimingComputation"); - - // minimize kernel launch conf - auto threadsMinimize = ps.getParameter>("kernelMinimizeThreads"); - configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0]; - configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1]; - configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2]; - - produces(recHitsLabelEB_); - produces(recHitsLabelEE_); - - // - // configuration and physics parameters: done once - // assume there is a single device - // use sync copying - // - - // amplitude fit parameters copying - cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB, - sizeof(ecal::multifit::ConfigurationParameters::type) * EBamplitudeFitParameters.size())); - cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEB, - EBamplitudeFitParameters.data(), - EBamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice)); - cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE, - sizeof(ecal::multifit::ConfigurationParameters::type) * 
EEamplitudeFitParameters.size())); - cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEE, - EEamplitudeFitParameters.data(), - EEamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice)); - - // time fit parameters and limits - configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size(); - configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size(); - configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first; - configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second; - configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first; - configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second; - cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEB, - sizeof(ecal::multifit::ConfigurationParameters::type) * EBtimeFitParameters.size())); - cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEB, - EBtimeFitParameters.data(), - EBtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice)); - cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEE, - sizeof(ecal::multifit::ConfigurationParameters::type) * EEtimeFitParameters.size())); - cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEE, - EEtimeFitParameters.data(), - EEtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice)); - - // time constant terms - configParameters_.timeConstantTermEB = EBtimeConstantTerm; - configParameters_.timeConstantTermEE = EEtimeConstantTerm; - - // time N const - configParameters_.timeNconstEB = EBtimeNconst; - configParameters_.timeNconstEE = EEtimeNconst; - - // amplitude threshold for time flags - configParameters_.amplitudeThreshEB = amplitudeThreshEB; - configParameters_.amplitudeThreshEE = amplitudeThreshEE; - - // out of time thresholds gain-dependent - configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB; - 
configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE; - configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB; - configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE; - configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB; - configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE; - configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB; - configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE; - - // allocate event input data - eventInputDataGPU_.allocate(maxNumberHits_); - - // allocate event output data - eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_); - - // allocate scratch data for gpu - eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_); +EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU( + const edm::ParameterSet& ps) + : digisTokenEB_{consumes>( + ps.getParameter("digisLabelEB"))} + , digisTokenEE_{consumes>( + ps.getParameter("digisLabelEE"))} + , recHitsTokenEB_{produces>>( + ps.getParameter("recHitsLabelEB"))} + , recHitsTokenEE_{produces>>( + ps.getParameter("recHitsLabelEE"))} +{ + auto EBamplitudeFitParameters = ps.getParameter>( + "EBamplitudeFitParameters"); + auto EEamplitudeFitParameters = ps.getParameter>( + "EEamplitudeFitParameters"); + auto EBtimeFitParameters = ps.getParameter>( + "EBtimeFitParameters"); + auto EEtimeFitParameters = ps.getParameter>( + "EEtimeFitParameters"); + std::pair EBtimeFitLimits, EEtimeFitLimits; + EBtimeFitLimits.first = ps.getParameter("EBtimeFitLimits_Lower"); + EBtimeFitLimits.second = ps.getParameter("EBtimeFitLimits_Upper"); + EEtimeFitLimits.first = ps.getParameter("EEtimeFitLimits_Lower"); + EEtimeFitLimits.second = ps.getParameter("EEtimeFitLimits_Upper"); + + auto EBtimeConstantTerm = ps.getParameter("EBtimeConstantTerm"); + auto EEtimeConstantTerm = ps.getParameter("EEtimeConstantTerm"); + auto EBtimeNconst = ps.getParameter("EBtimeNconst"); + auto EEtimeNconst = ps.getParameter("EEtimeNconst"); 
+ + auto outOfTimeThreshG12pEB = ps.getParameter( + "outOfTimeThresholdGain12pEB"); + auto outOfTimeThreshG12mEB = ps.getParameter( + "outOfTimeThresholdGain12mEB"); + auto outOfTimeThreshG61pEB = ps.getParameter( + "outOfTimeThresholdGain61pEB"); + auto outOfTimeThreshG61mEB = ps.getParameter( + "outOfTimeThresholdGain61mEB"); + auto outOfTimeThreshG12pEE = ps.getParameter( + "outOfTimeThresholdGain12pEE"); + auto outOfTimeThreshG12mEE = ps.getParameter( + "outOfTimeThresholdGain12mEE"); + auto outOfTimeThreshG61pEE = ps.getParameter( + "outOfTimeThresholdGain61pEE"); + auto outOfTimeThreshG61mEE = ps.getParameter( + "outOfTimeThresholdGain61mEE"); + auto amplitudeThreshEB = ps.getParameter("amplitudeThresholdEB"); + auto amplitudeThreshEE = ps.getParameter("amplitudeThresholdEE"); + + // max number of digis to allocate for + maxNumberHits_ = ps.getParameter("maxNumberHits"); + + // transfer to host switch + shouldTransferToHost_ = ps.getParameter("shouldTransferToHost"); + + // switch to run timing computation kernels + configParameters_.shouldRunTimingComputation = + ps.getParameter("shouldRunTimingComputation"); + + // minimize kernel launch conf + auto threadsMinimize = ps.getParameter>("kernelMinimizeThreads"); + configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0]; + configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1]; + configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2]; + + // + // configuration and physics parameters: done once + // assume there is a single device + // use sync copying + // + + // amplitude fit parameters copying + cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB, + sizeof(ecal::multifit::ConfigurationParameters::type) + * EBamplitudeFitParameters.size()) ); + cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEB, + EBamplitudeFitParameters.data(), + EBamplitudeFitParameters.size() * + sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice) ); 
+ cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE, + sizeof(ecal::multifit::ConfigurationParameters::type) * + EEamplitudeFitParameters.size()) ); + cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEE, + EEamplitudeFitParameters.data(), + EEamplitudeFitParameters.size() * + sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice) ); + + // time fit parameters and limits + configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size(); + configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size(); + configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first; + configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second; + configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first; + configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second; + cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEB, + sizeof(ecal::multifit::ConfigurationParameters::type) + * EBtimeFitParameters.size()) ); + cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEB, + EBtimeFitParameters.data(), + EBtimeFitParameters.size() * + sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice) ); + cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEE, + sizeof(ecal::multifit::ConfigurationParameters::type) + * EEtimeFitParameters.size()) ); + cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEE, + EEtimeFitParameters.data(), + EEtimeFitParameters.size() + * sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice) ); + + // time constant terms + configParameters_.timeConstantTermEB = EBtimeConstantTerm; + configParameters_.timeConstantTermEE = EEtimeConstantTerm; + + // time N const + configParameters_.timeNconstEB = EBtimeNconst; + configParameters_.timeNconstEE = EEtimeNconst; + + // amplitude threshold for time flags + configParameters_.amplitudeThreshEB = amplitudeThreshEB; + 
configParameters_.amplitudeThreshEE = amplitudeThreshEE; + + // out of time thresholds gain-dependent + configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB; + configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE; + configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB; + configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE; + configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB; + configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE; + configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB; + configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE; + + // allocate event output data + eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_); + + // allocate scratch data for gpu + eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_); } EcalUncalibRecHitProducerGPU::~EcalUncalibRecHitProducerGPU() { - // - // assume single device for now - // - - if (configParameters_.amplitudeFitParametersEB) { - // configuration parameters - cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEB)); - cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEE)); - cudaCheck(cudaFree(configParameters_.timeFitParametersEB)); - cudaCheck(cudaFree(configParameters_.timeFitParametersEE)); - - // free event input data - eventInputDataGPU_.deallocate(); - - // free event ouput data - eventOutputDataGPU_.deallocate(configParameters_); - - // free event scratch data - eventDataForScratchGPU_.deallocate(configParameters_); - } -} - -void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event, - edm::EventSetup const& setup, - edm::WaitingTaskWithArenaHolder holder) { - //DurationMeasurer timer{std::string{"acquire duration"}}; - - // raii - cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_}; - - // conditions - setup.get().get(pedestalsHandle_); - setup.get().get(gainRatiosHandle_); - setup.get().get(pulseShapesHandle_); - 
setup.get().get(pulseCovariancesHandle_); - setup.get().get(samplesCorrelationHandle_); - setup.get().get(timeBiasCorrectionsHandle_); - setup.get().get(timeCalibConstantsHandle_); - setup.get().get(sampleMaskHandle_); - setup.get().get(timeOffsetConstantHandle_); - - auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream()); - auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream()); - auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream()); - auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream()); - auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream()); - auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream()); - auto const& timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream()); - - // bundle up conditions - ecal::multifit::ConditionsProducts conditions{pedProduct, - gainsProduct, - pulseShapesProduct, - pulseCovariancesProduct, - samplesCorrelationProduct, - timeBiasCorrectionsProduct, - timeCalibConstantsProduct, - *sampleMaskHandle_, - *timeOffsetConstantHandle_, - timeCalibConstantsHandle_->getOffset()}; - - // - // retrieve collections - // - edm::Handle ebDigis; - edm::Handle eeDigis; - event.getByToken(digisTokenEB_, ebDigis); - event.getByToken(digisTokenEE_, eeDigis); - - ecal::multifit::EventInputDataCPU eventInputDataCPU{*ebDigis, *eeDigis}; - - // - // schedule algorithms - // - ecal::multifit::entryPoint(eventInputDataCPU, - eventInputDataGPU_, - eventOutputDataGPU_, - eventDataForScratchGPU_, - conditions, - configParameters_, - ctx.stream()); - - ebRecHits_ = std::make_unique>(); - eeRecHits_ = std::make_unique>(); - - if (shouldTransferToHost_) { - // allocate for the result while kernels are running - ebRecHits_->resize(ebDigis->size()); - eeRecHits_->resize(eeDigis->size()); - - // det ids are host copy only - no need to run device -> host - 
std::memcpy(ebRecHits_->did.data(), ebDigis->ids().data(), ebDigis->ids().size() * sizeof(uint32_t)); - std::memcpy(eeRecHits_->did.data(), eeDigis->ids().data(), eeDigis->ids().size() * sizeof(uint32_t)); - } + // + // assume single device for now + // + + if (configParameters_.amplitudeFitParametersEB) { + // configuration parameters + cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEB) ); + cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEE) ); + cudaCheck( cudaFree(configParameters_.timeFitParametersEB) ); + cudaCheck( cudaFree(configParameters_.timeFitParametersEE) ); + + // free event ouput data + eventOutputDataGPU_.deallocate(configParameters_); + + // free event scratch data + eventDataForScratchGPU_.deallocate(configParameters_); + } } -void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { - //DurationMeasurer timer{std::string{"produce duration"}}; - cms::cuda::ScopedContextProduce ctx{cudaState_}; - - if (shouldTransferToHost_) { - // rec hits objects were not originally member variables - transferToHost(*ebRecHits_, *eeRecHits_, ctx.stream()); - - // TODO - // for now just sync on the host when transferring back products - cudaStreamSynchronize(ctx.stream()); - } - - event.put(std::move(ebRecHits_), recHitsLabelEB_); - event.put(std::move(eeRecHits_), recHitsLabelEE_); +void EcalUncalibRecHitProducerGPU::acquire( + edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) +{ + // cuda products + auto const& ebDigisProduct = event.get(digisTokenEB_); + auto const& eeDigisProduct = event.get(digisTokenEE_); + + // raii + cms::cuda::ScopedContextAcquire ctx{ebDigisProduct, std::move(holder), cudaState_}; + + // get actual obj + auto const& ebDigis = ctx.get(ebDigisProduct); + auto const& eeDigis = ctx.get(eeDigisProduct); + ecal::multifit::EventInputDataGPU inputDataGPU{ebDigis, eeDigis}; + neb_ = ebDigis.ndigis; + nee_ = eeDigis.ndigis; + + // 
conditions + setup.get().get(pedestalsHandle_); + setup.get().get(gainRatiosHandle_); + setup.get().get(pulseShapesHandle_); + setup.get().get(pulseCovariancesHandle_); + setup.get().get(samplesCorrelationHandle_); + setup.get().get(timeBiasCorrectionsHandle_); + setup.get().get(timeCalibConstantsHandle_); + setup.get().get(sampleMaskHandle_); + setup.get().get(timeOffsetConstantHandle_); + + auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream()); + auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream()); + auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream()); + auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream()); + auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream()); + auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream()); + auto const& timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream()); + + // bundle up conditions + ecal::multifit::ConditionsProducts conditions { + pedProduct, gainsProduct, pulseShapesProduct, + pulseCovariancesProduct, + samplesCorrelationProduct, + timeBiasCorrectionsProduct, + timeCalibConstantsProduct, + *sampleMaskHandle_, + *timeOffsetConstantHandle_, + timeCalibConstantsHandle_->getOffset() + }; + + // + // schedule algorithms + // + ecal::multifit::entryPoint( + inputDataGPU, + eventOutputDataGPU_, + eventDataForScratchGPU_, + conditions, + configParameters_, + ctx.stream() + ); } -void EcalUncalibRecHitProducerGPU::transferToHost(RecHitType& ebRecHits, - RecHitType& eeRecHits, - cudaStream_t cudaStream) { - cudaCheck(cudaMemcpyAsync(ebRecHits.amplitude.data(), - eventOutputDataGPU_.amplitude, - ebRecHits.amplitude.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eeRecHits.amplitude.data(), - eventOutputDataGPU_.amplitude + ebRecHits.amplitude.size(), - 
eeRecHits.amplitude.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - - cudaCheck(cudaMemcpyAsync(ebRecHits.pedestal.data(), - eventOutputDataGPU_.pedestal, - ebRecHits.pedestal.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eeRecHits.pedestal.data(), - eventOutputDataGPU_.pedestal + ebRecHits.pedestal.size(), - eeRecHits.pedestal.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - - cudaCheck(cudaMemcpyAsync(ebRecHits.chi2.data(), - eventOutputDataGPU_.chi2, - ebRecHits.chi2.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eeRecHits.chi2.data(), - eventOutputDataGPU_.chi2 + ebRecHits.chi2.size(), - eeRecHits.chi2.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - - if (configParameters_.shouldRunTimingComputation) { - cudaCheck(cudaMemcpyAsync(ebRecHits.jitter.data(), - eventOutputDataGPU_.jitter, - ebRecHits.jitter.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eeRecHits.jitter.data(), - eventOutputDataGPU_.jitter + ebRecHits.jitter.size(), - eeRecHits.jitter.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - - cudaCheck(cudaMemcpyAsync(ebRecHits.jitterError.data(), - eventOutputDataGPU_.jitterError, - ebRecHits.jitterError.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eeRecHits.jitterError.data(), - eventOutputDataGPU_.jitterError + ebRecHits.jitterError.size(), - eeRecHits.jitterError.size() * sizeof(ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - } - - cudaCheck(cudaMemcpyAsync(ebRecHits.flags.data(), - eventOutputDataGPU_.flags, - ebRecHits.flags.size() * sizeof(uint32_t), - 
cudaMemcpyDeviceToHost, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eeRecHits.flags.data(), - eventOutputDataGPU_.flags + ebRecHits.flags.size(), - eeRecHits.flags.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - cudaStream)); - - cudaCheck(cudaMemcpyAsync(ebRecHits.amplitudesAll.data(), - eventOutputDataGPU_.amplitudesAll, - ebRecHits.amplitudesAll.size() * sizeof(ecal::reco::ComputationScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); - cudaCheck(cudaMemcpyAsync(eeRecHits.amplitudesAll.data(), - eventOutputDataGPU_.amplitudesAll + ebRecHits.amplitudesAll.size(), - eeRecHits.amplitudesAll.size() * sizeof(ecal::reco::ComputationScalarType), - cudaMemcpyDeviceToHost, - cudaStream)); +void EcalUncalibRecHitProducerGPU::produce( + edm::Event& event, + edm::EventSetup const& setup) +{ + //DurationMeasurer timer{std::string{"produce duration"}}; + cms::cuda::ScopedContextProduce ctx{cudaState_}; + + // copy construct output collections + // note, output collections do not own device memory! 
+ ecal::UncalibratedRecHit + ebRecHits{eventOutputDataGPU_}, + eeRecHits{eventOutputDataGPU_}; + + // set the size of eb and ee + ebRecHits.size = neb_; + eeRecHits.size = nee_; + + // shift ptrs for ee + eeRecHits.amplitudesAll += neb_ * EcalDataFrame::MAXSAMPLES; + eeRecHits.amplitude += neb_; + eeRecHits.chi2 += neb_; + eeRecHits.pedestal += neb_; + eeRecHits.did += neb_; + eeRecHits.flags += neb_; + if (configParameters_.shouldRunTimingComputation) { + eeRecHits.jitter += neb_; + eeRecHits.jitterError += neb_; + } + + // put into the event + ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits)); + ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits)); } DEFINE_FWK_MODULE(EcalUncalibRecHitProducerGPU); From 6e0f6304b3d85943b3b4f218e45b090f5bce7f5b Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Wed, 1 Apr 2020 12:01:28 +0200 Subject: [PATCH 06/30] make sure proper types are deduced for cuda copies --- .../plugins/EcalCPUUncalibRecHitProducer.cc | 114 +++++------------- 1 file changed, 30 insertions(+), 84 deletions(-) diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc index 9c531d7060525..9661f98139f7b 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc @@ -90,95 +90,41 @@ void EcalCPUUncalibRecHitProducer::acquire( recHitsEB_.resize(ebRecHits.size); recHitsEE_.resize(eeRecHits.size); + auto lambdaToTransfer = [&ctx](auto& dest, auto* src) { + using vector_type = typename std::remove_reference::type; + using type = typename vector_type::value_type; + cudaCheck(cudaMemcpyAsync(dest.data(), + src, + dest.size() * sizeof(type), + cudaMemcpyDeviceToHost, + ctx.stream())); + }; + // enqeue transfers - cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(), - ebRecHits.did, - recHitsEB_.did.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - 
ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(), - eeRecHits.did, - recHitsEE_.did.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitudesAll.data(), - ebRecHits.amplitudesAll, - recHitsEB_.amplitudesAll.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitudesAll.data(), - eeRecHits.amplitudesAll, - recHitsEE_.amplitudesAll.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - cudaCheck( cudaMemcpyAsync(recHitsEB_.amplitude.data(), - ebRecHits.amplitude, - recHitsEB_.amplitude.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.amplitude.data(), - eeRecHits.amplitude, - recHitsEE_.amplitude.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); + lambdaToTransfer(recHitsEB_.did, ebRecHits.did); + lambdaToTransfer(recHitsEE_.did, eeRecHits.did); - cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(), - ebRecHits.chi2, - recHitsEB_.chi2.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.chi2.data(), - eeRecHits.chi2, - recHitsEE_.chi2.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); + lambdaToTransfer(recHitsEB_.amplitudesAll, ebRecHits.amplitudesAll); + lambdaToTransfer(recHitsEE_.amplitudesAll, eeRecHits.amplitudesAll); - cudaCheck( cudaMemcpyAsync(recHitsEB_.pedestal.data(), - ebRecHits.pedestal, - recHitsEB_.pedestal.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.pedestal.data(), - eeRecHits.pedestal, - recHitsEE_.pedestal.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - cudaCheck( cudaMemcpyAsync(recHitsEB_.flags.data(), - ebRecHits.flags, - recHitsEB_.flags.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( 
cudaMemcpyAsync(recHitsEE_.flags.data(), - eeRecHits.flags, - recHitsEE_.flags.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); + lambdaToTransfer(recHitsEB_.amplitude, ebRecHits.amplitude); + lambdaToTransfer(recHitsEE_.amplitude, eeRecHits.amplitude); + + lambdaToTransfer(recHitsEB_.chi2, ebRecHits.chi2); + lambdaToTransfer(recHitsEE_.chi2, eeRecHits.chi2); + lambdaToTransfer(recHitsEB_.pedestal, ebRecHits.pedestal); + lambdaToTransfer(recHitsEE_.pedestal, eeRecHits.pedestal); + + lambdaToTransfer(recHitsEB_.flags, ebRecHits.flags); + lambdaToTransfer(recHitsEE_.flags, eeRecHits.flags); + if (containsTimingInformation_) { - cudaCheck( cudaMemcpyAsync(recHitsEB_.jitter.data(), - ebRecHits.jitter, - recHitsEB_.jitter.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.jitter.data(), - eeRecHits.jitter, - recHitsEE_.jitter.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - cudaCheck( cudaMemcpyAsync(recHitsEB_.jitterError.data(), - ebRecHits.jitterError, - recHitsEB_.jitterError.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.jitterError.data(), - eeRecHits.jitterError, - recHitsEE_.jitterError.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); + lambdaToTransfer(recHitsEB_.jitter, ebRecHits.jitter); + lambdaToTransfer(recHitsEE_.jitter, eeRecHits.jitter); + + lambdaToTransfer(recHitsEB_.jitterError, ebRecHits.jitterError); + lambdaToTransfer(recHitsEE_.jitterError, eeRecHits.jitterError); } } From 0e946cc670dcfa2f513a1b415c6fe175145f73c4 Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Wed, 1 Apr 2020 12:17:10 +0200 Subject: [PATCH 07/30] add ratio plot --- ...eEcalMultifitResultsGpuValidationPlots.cpp | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp 
b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp index a336de13b9e7d..e0cca70f93795 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp @@ -45,6 +45,8 @@ int main(int argc, char *argv[]) { auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last); auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last); auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last); + auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2); auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2); @@ -113,6 +115,7 @@ int main(int argc, char *argv[]) { hSOIAmplitudesEBCPU->Fill(soi_amp_cpu); hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); + hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); hChi2EBGPU->Fill(chi2_gpu); hChi2EBCPU->Fill(chi2_cpu); hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); @@ -146,6 +149,7 @@ int main(int argc, char *argv[]) { hSOIAmplitudesEECPU->Fill(soi_amp_cpu); hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); + hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); hChi2EEGPU->Fill(chi2_gpu); hChi2EECPU->Fill(chi2_cpu); hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); @@ -164,7 +168,7 @@ int main(int argc, char *argv[]) { { TCanvas c("plots", "plots", 4200, 6200); - c.Divide(2, 3); + c.Divide(2, 4); c.cd(1); 
{ @@ -206,8 +210,26 @@ int main(int argc, char *argv[]) { hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); c.cd(6); hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); + c.cd(7); + { + gPad->SetLogy(); + hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack); + hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.); + hSOIAmplitudesEBGPUCPUratio->Draw(""); + } + c.cd(8); + { + gPad->SetLogy(); + hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack); + hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.); + hSOIAmplitudesEEGPUCPUratio->Draw(""); + } c.SaveAs("ecal-amplitudes.pdf"); + } + { + TCanvas c("plots", "plots", 4200, 6200); + c.Divide(2, 3); c.cd(1); { From b21427c6d505b2c6780fde61cffbec7c173a5585 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 3 Apr 2020 18:43:16 +0200 Subject: [PATCH 08/30] Clean up ECAL unapcker code Fix compilation warnings, remove commented out code, and apply code formatting rules. --- CUDADataFormats/EcalDigi/BuildFile.xml | 3 +- CUDADataFormats/EcalRecHitSoA/BuildFile.xml | 4 +- .../EcalRecHitSoA/src/classes_def.xml | 8 - .../EcalObjects/interface/EcalXtalGroupId.h | 2 +- .../EcalDigi/interface/EcalMatacqDigi.h | 4 +- EventFilter/EcalRawToDigi/BuildFile.xml | 49 +- .../makeEcalRaw2DigiGpuValidationPlots.cpp | 392 ++-- .../EcalRawToDigi/plugins/BuildFile.xml | 45 +- .../plugins/EcalCPUDigisProducer.cc | 183 +- .../plugins/EcalRawESProducersGPUDefs.cc | 6 +- .../EcalRawToDigi/plugins/EcalRawToDigiGPU.cc | 234 +-- .../src/ElectronicsMappingGPU.cc | 85 +- EventFilter/EcalRawToDigi/src/UnpackGPU.cu | 703 +++---- ...eEcalMultifitResultsGpuValidationPlots.cpp | 486 ++--- .../src/AmplitudeComputationCommonKernels.cu | 819 ++++---- .../src/AmplitudeComputationKernels.cu | 685 +++---- .../EcalRecAlgos/src/EcalGainRatiosGPU.cc | 83 +- .../EcalRecAlgos/src/EcalPedestalsGPU.cc | 167 +- .../src/EcalPulseCovariancesGPU.cc | 68 +- .../EcalRecAlgos/src/EcalPulseShapesGPU.cc | 68 +- .../src/EcalSamplesCorrelationGPU.cc | 143 +- .../src/EcalTimeBiasCorrectionsGPU.cc | 111 +- 
.../src/EcalTimeCalibConstantsGPU.cc | 65 +- .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu | 307 ++- .../EcalRecAlgos/src/KernelHelpers.cu | 156 +- .../src/TimeComputationKernels.cu | 1783 ++++++++--------- .../EcalRecAlgos/src/inplace_fnnls.cu | 196 +- .../plugins/EcalCPUUncalibRecHitProducer.cc | 191 +- .../plugins/EcalESProducersGPUDefs.cc | 35 +- .../EcalUncalibRecHitConvertGPU2CPUFormat.cc | 144 +- .../plugins/EcalUncalibRecHitProducerGPU.cc | 634 +++--- 31 files changed, 3610 insertions(+), 4249 deletions(-) diff --git a/CUDADataFormats/EcalDigi/BuildFile.xml b/CUDADataFormats/EcalDigi/BuildFile.xml index a1838ba91dc91..4a5c646e3a1b3 100644 --- a/CUDADataFormats/EcalDigi/BuildFile.xml +++ b/CUDADataFormats/EcalDigi/BuildFile.xml @@ -1,7 +1,6 @@ - + - diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml index 927a7a57a86a7..de31c3f42a961 100644 --- a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml +++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml @@ -1,8 +1,8 @@ - + + - diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml index 68056d21ad4c1..b75a258a5151e 100644 --- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml +++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml @@ -8,14 +8,6 @@ - - - diff --git a/CondFormats/EcalObjects/interface/EcalXtalGroupId.h b/CondFormats/EcalObjects/interface/EcalXtalGroupId.h index 3331db375dfd1..b21c2b9889d9f 100644 --- a/CondFormats/EcalObjects/interface/EcalXtalGroupId.h +++ b/CondFormats/EcalObjects/interface/EcalXtalGroupId.h @@ -19,7 +19,7 @@ class EcalXtalGroupId { bool operator<(const EcalXtalGroupId& rhs) const { return (id_ < rhs.id()); } bool operator<=(const EcalXtalGroupId& rhs) const { return (id_ <= rhs.id()); } - const unsigned int id() const { return id_; } + unsigned int id() const { return id_; } private: unsigned int id_; diff --git a/DataFormats/EcalDigi/interface/EcalMatacqDigi.h 
b/DataFormats/EcalDigi/interface/EcalMatacqDigi.h index 952bd894ec891..b7cbc3949cf01 100644 --- a/DataFormats/EcalDigi/interface/EcalMatacqDigi.h +++ b/DataFormats/EcalDigi/interface/EcalMatacqDigi.h @@ -50,12 +50,12 @@ class EcalMatacqDigi { /** Gets amplitude in ADC count of time sample i. i between 0 and size()-1. * Note: Amplitude is pedestal subtracted at acquisition time. */ - const float adcCount(const int& i) const { return data_[i]; } + float adcCount(const int& i) const { return data_[i]; } /** Gets amplitude in Volt of time sample i. i between 0 and size()-1. * Note: Amplitude is pedestal subtracted at acquisition time. */ - const float amplitudeV(const int& i) const { return data_[i] * lsb_; } + float amplitudeV(const int& i) const { return data_[i] * lsb_; } /** Gets Matacq electronics channel id */ diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml index 61a07973df153..e31aea9a0b58a 100644 --- a/EventFilter/EcalRawToDigi/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/BuildFile.xml @@ -1,28 +1,27 @@ - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - + diff --git a/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp index 9fc9ec26e3714..609c277e19288 100644 --- a/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp +++ b/EventFilter/EcalRawToDigi/bin/makeEcalRaw2DigiGpuValidationPlots.cpp @@ -13,212 +13,198 @@ #include "DataFormats/Common/interface/Wrapper.h" #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" -int main(int argc, char *argv[]) { - if (argc<3) { - std::cout << "run with: ./ \n"; - exit(0); +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "run with: ./ \n"; + exit(0); + } + + // branches to use + edm::Wrapper*wgpuEB = nullptr, *wcpuEB = nullptr; + edm::Wrapper*wgpuEE = nullptr, *wcpuEE = nullptr; + + std::string 
inFileName{argv[1]}; + std::string outFileName{argv[2]}; + + // prep output + TFile rfout{outFileName.c_str(), "recreate"}; + + int const nbins = 400; + float const last = 4096.; + auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last); + auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last); + auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last); + auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last); + + auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4); + auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4); + auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4); + auto hGainEECPU = new TH1D("hGainEECPU", "hGainEECPU", 4, 0, 4); + + auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU", nbins, 0, last, nbins, 0, last); + auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU", nbins, 0, last, nbins, 0, last); + auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU", 4, 0, 4, 4, 0, 4); + auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU", 4, 0, 4, 4, 0, 4); + + // prep input + TFile rfin{inFileName.c_str()}; + TTree* rt = (TTree*)rfin.Get("Events"); + rt->SetBranchAddress("EBDigiCollection_ecalCPUDigisProducer_ebDigis_RECO.", &wgpuEB); + rt->SetBranchAddress("EEDigiCollection_ecalCPUDigisProducer_eeDigis_RECO.", &wgpuEE); + rt->SetBranchAddress("EBDigiCollection_ecalDigis_ebDigis_RECO.", &wcpuEB); + rt->SetBranchAddress("EEDigiCollection_ecalDigis_eeDigis_RECO.", &wcpuEE); + + // accumulate + auto const nentries = rt->GetEntries(); + std::cout << ">>> nentries = " << nentries << std::endl; + for (int ie = 0; ie < nentries; ++ie) { + rt->GetEntry(ie); + + auto const ngpuebs = wgpuEB->bareProduct().size(); + auto const ncpuebs = wcpuEB->bareProduct().size(); + auto const ngpuees = wgpuEE->bareProduct().size(); + auto const ncpuees = wcpuEE->bareProduct().size(); + + if (ngpuebs != ncpuebs or ngpuees != ncpuees) { + 
std::cerr << "*** mismatch in ndigis: " + << "ie = " << ie << " ngpuebs = " << ngpuebs << " ncpuebs = " << ncpuebs << " ngpuees = " << ngpuees + << " ncpuees = " << ncpuees << std::endl; + + // this is a must for now + //assert(ngpuebs==ncpuebs); + //assert(ngpuees==ncpuees); } - - // branches to use - edm::Wrapper *wgpuEB=nullptr, *wcpuEB=nullptr; - edm::Wrapper *wgpuEE=nullptr, *wcpuEE=nullptr; - - std::string inFileName{argv[1]}; - std::string outFileName{argv[2]}; - - // prep output - TFile rfout{outFileName.c_str(), "recreate"}; - - int const nbins = 400; - float const last = 4096.; - auto hADCEBGPU = new TH1D("hADCEBGPU", "hADCEBGPU", nbins, 0, last); - auto hADCEBCPU = new TH1D("hADCEBCPU", "hADCEBCPU", nbins, 0, last); - auto hADCEEGPU = new TH1D("hADCEEGPU", "hADCEEGPU", nbins, 0, last); - auto hADCEECPU = new TH1D("hADCEECPU", "hADCEECPU", nbins, 0, last); - - auto hGainEBGPU = new TH1D("hGainEBGPU", "hGainEBGPU", 4, 0, 4); - auto hGainEBCPU = new TH1D("hGainEBCPU", "hGainEBCPU", 4, 0, 4); - auto hGainEEGPU = new TH1D("hGainEEGPU", "hGainEEGPU", 4, 0, 4); - auto hGainEECPU = new TH1D("hGainEECPU", "hGainEECPU", 4, 0, 4); - - auto hADCEBGPUvsCPU = new TH2D("hADCEBGPUvsCPU", "hADCEBGPUvsCPU", - nbins, 0, last, nbins, 0, last); - auto hADCEEGPUvsCPU = new TH2D("hADCEEGPUvsCPU", "hADCEEGPUvsCPU", - nbins, 0, last, nbins, 0, last); - auto hGainEBGPUvsCPU = new TH2D("hGainEBGPUvsCPU", "hGainEBGPUvsCPU", - 4, 0, 4, 4, 0, 4); - auto hGainEEGPUvsCPU = new TH2D("hGainEEGPUvsCPU", "hGainEEGPUvsCPU", - 4, 0, 4, 4, 0, 4); - - // prep input - TFile rfin{inFileName.c_str()}; - TTree *rt = (TTree*)rfin.Get("Events"); - rt->SetBranchAddress("EBDigiCollection_ecalCPUDigisProducer_ebDigis_RECO.", - &wgpuEB); - rt->SetBranchAddress("EEDigiCollection_ecalCPUDigisProducer_eeDigis_RECO.", - &wgpuEE); - rt->SetBranchAddress("EBDigiCollection_ecalDigis_ebDigis_RECO.", - &wcpuEB); - rt->SetBranchAddress("EEDigiCollection_ecalDigis_eeDigis_RECO.", - &wcpuEE); - - // accumulate - 
auto const nentries = rt->GetEntries(); - std::cout << ">>> nentries = " << nentries << std::endl; - for (int ie=0; ieGetEntry(ie); - - auto const ngpuebs = wgpuEB->bareProduct().size(); - auto const ncpuebs = wcpuEB->bareProduct().size(); - auto const ngpuees = wgpuEE->bareProduct().size(); - auto const ncpuees = wcpuEE->bareProduct().size(); - - if (ngpuebs!=ncpuebs or ngpuees!=ncpuees) { - std::cerr << "*** mismatch in ndigis: " - << "ie = " << ie - << " ngpuebs = " << ngpuebs - << " ncpuebs = " << ncpuebs - << " ngpuees = " << ngpuees - << " ncpuees = " << ncpuees - << std::endl; - - // this is a must for now - //assert(ngpuebs==ncpuebs); - //assert(ngpuees==ncpuees); - } - - // assume identical sizes - auto const& idsgpuEB = wgpuEB->bareProduct().ids(); - auto const& datagpuEB = wgpuEB->bareProduct().data(); - auto const& idscpuEB = wcpuEB->bareProduct().ids(); - auto const& datacpuEB = wcpuEB->bareProduct().data(); - for (uint32_t ieb=0; iebFill(sampleGPU.adc()); - hGainEBGPU->Fill(sampleGPU.gainId()); - hADCEBCPU->Fill(sampleCPU.adc()); - hGainEBCPU->Fill(sampleCPU.gainId()); - hADCEBGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc()); - hGainEBGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId()); - } - } - - auto const& idsgpuEE = wgpuEE->bareProduct().ids(); - auto const& datagpuEE = wgpuEE->bareProduct().data(); - auto const& idscpuEE = wcpuEE->bareProduct().ids(); - auto const& datacpuEE = wcpuEE->bareProduct().data(); - for (uint32_t iee=0; ieeFill(sampleGPU.adc()); - hGainEEGPU->Fill(sampleGPU.gainId()); - hADCEECPU->Fill(sampleCPU.adc()); - hGainEECPU->Fill(sampleCPU.gainId()); - hADCEEGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc()); - hGainEEGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId()); - } - } + + // assume identical sizes + auto const& idsgpuEB = wgpuEB->bareProduct().ids(); + auto const& datagpuEB = wgpuEB->bareProduct().data(); + auto const& idscpuEB = wcpuEB->bareProduct().ids(); + auto const& datacpuEB = wcpuEB->bareProduct().data(); 
+ for (uint32_t ieb = 0; ieb < ngpuebs; ++ieb) { + auto const& idgpu = idsgpuEB[ieb]; + auto iter2idcpu = std::find(idscpuEB.begin(), idscpuEB.end(), idgpu); + // FIXME + assert(idgpu == *iter2idcpu); + + auto const ptrdiff = iter2idcpu - idscpuEB.begin(); + for (uint32_t s = 0u; s < 10u; s++) { + EcalMGPASample sampleGPU{datagpuEB[ieb * 10 + s]}; + EcalMGPASample sampleCPU{datacpuEB[ptrdiff * 10 + s]}; + + hADCEBGPU->Fill(sampleGPU.adc()); + hGainEBGPU->Fill(sampleGPU.gainId()); + hADCEBCPU->Fill(sampleCPU.adc()); + hGainEBCPU->Fill(sampleCPU.gainId()); + hADCEBGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc()); + hGainEBGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId()); + } } - { - TCanvas c{"plots", "plots", 4200, 6200}; - c.Divide(2, 4); - c.cd(1); - { - gPad->SetLogy(); - hADCEBCPU->SetLineColor(kBlack); - hADCEBCPU->SetLineWidth(1.); - hADCEBCPU->Draw(""); - hADCEBGPU->SetLineColor(kBlue); - hADCEBGPU->SetLineWidth(1.); - hADCEBGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hADCEBGPU->FindObject("stats"); - auto y2 = stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - c.cd(2); - { - gPad->SetLogy(); - hADCEECPU->SetLineColor(kBlack); - hADCEECPU->SetLineWidth(1.); - hADCEECPU->Draw(""); - hADCEEGPU->SetLineColor(kBlue); - hADCEEGPU->SetLineWidth(1.); - hADCEEGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hADCEEGPU->FindObject("stats"); - auto y2 = stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - c.cd(3); - { - gPad->SetLogy(); - hGainEBCPU->SetLineColor(kBlack); - hGainEBCPU->SetLineWidth(1.); - hGainEBCPU->Draw(""); - hGainEBGPU->SetLineColor(kBlue); - hGainEBGPU->SetLineWidth(1.); - hGainEBGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hGainEBGPU->FindObject("stats"); - auto y2 = stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - 
(y2-y1)); - } - c.cd(4); - { - gPad->SetLogy(); - hGainEECPU->SetLineColor(kBlack); - hGainEECPU->SetLineWidth(1.); - hGainEECPU->Draw(""); - hGainEEGPU->SetLineColor(kBlue); - hGainEEGPU->SetLineWidth(1.); - hGainEEGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hGainEEGPU->FindObject("stats"); - auto y2 = stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - c.cd(5); - hADCEBGPUvsCPU->Draw("colz"); - c.cd(6); - hADCEEGPUvsCPU->Draw("colz"); - c.cd(7); - hGainEBGPUvsCPU->Draw("colz"); - c.cd(8); - hGainEEGPUvsCPU->Draw("colz"); - c.SaveAs("plots.pdf"); + auto const& idsgpuEE = wgpuEE->bareProduct().ids(); + auto const& datagpuEE = wgpuEE->bareProduct().data(); + auto const& idscpuEE = wcpuEE->bareProduct().ids(); + auto const& datacpuEE = wcpuEE->bareProduct().data(); + for (uint32_t iee = 0; iee < ngpuees; ++iee) { + auto const& idgpu = idsgpuEE[iee]; + auto iter2idcpu = std::find(idscpuEE.begin(), idscpuEE.end(), idgpu); + // FIXME + assert(idgpu == *iter2idcpu); + + // get the digis + auto const ptrdiff = iter2idcpu - idscpuEE.begin(); + for (uint32_t s = 0u; s < 10u; s++) { + EcalMGPASample sampleGPU{datagpuEE[iee * 10 + s]}; + EcalMGPASample sampleCPU{datacpuEE[ptrdiff * 10 + s]}; + + hADCEEGPU->Fill(sampleGPU.adc()); + hGainEEGPU->Fill(sampleGPU.gainId()); + hADCEECPU->Fill(sampleCPU.adc()); + hGainEECPU->Fill(sampleCPU.gainId()); + hADCEEGPUvsCPU->Fill(sampleCPU.adc(), sampleGPU.adc()); + hGainEEGPUvsCPU->Fill(sampleCPU.gainId(), sampleGPU.gainId()); + } } + } - rfin.Close(); - rfout.Write(); - rfout.Close(); + { + TCanvas c{"plots", "plots", 4200, 6200}; + c.Divide(2, 4); + c.cd(1); + { + gPad->SetLogy(); + hADCEBCPU->SetLineColor(kBlack); + hADCEBCPU->SetLineWidth(1.); + hADCEBCPU->Draw(""); + hADCEBGPU->SetLineColor(kBlue); + hADCEBGPU->SetLineWidth(1.); + hADCEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hADCEBGPU->FindObject("stats"); + auto y2 = 
stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(2); + { + gPad->SetLogy(); + hADCEECPU->SetLineColor(kBlack); + hADCEECPU->SetLineWidth(1.); + hADCEECPU->Draw(""); + hADCEEGPU->SetLineColor(kBlue); + hADCEEGPU->SetLineWidth(1.); + hADCEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hADCEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(3); + { + gPad->SetLogy(); + hGainEBCPU->SetLineColor(kBlack); + hGainEBCPU->SetLineWidth(1.); + hGainEBCPU->Draw(""); + hGainEBGPU->SetLineColor(kBlue); + hGainEBGPU->SetLineWidth(1.); + hGainEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hGainEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(4); + { + gPad->SetLogy(); + hGainEECPU->SetLineColor(kBlack); + hGainEECPU->SetLineWidth(1.); + hGainEECPU->Draw(""); + hGainEEGPU->SetLineColor(kBlue); + hGainEEGPU->SetLineWidth(1.); + hGainEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hGainEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(5); + hADCEBGPUvsCPU->Draw("colz"); + c.cd(6); + hADCEEGPUvsCPU->Draw("colz"); + c.cd(7); + hGainEBGPUvsCPU->Draw("colz"); + c.cd(8); + hGainEEGPUvsCPU->Draw("colz"); + c.SaveAs("plots.pdf"); + } + + rfin.Close(); + rfout.Write(); + rfout.Close(); } diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml index 296a6b2461f8c..6c2f2bb94db7c 100644 --- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml @@ -1,25 +1,24 @@ - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - - - - - - 
- + + diff --git a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc index 6f488053b204b..00491efe634cd 100644 --- a/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc +++ b/EventFilter/EcalRawToDigi/plugins/EcalCPUDigisProducer.cc @@ -10,7 +10,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" // algorithm specific @@ -25,128 +25,99 @@ #include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h" #include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" -class EcalCPUDigisProducer - : public edm::stream::EDProducer -{ +class EcalCPUDigisProducer : public edm::stream::EDProducer { public: - explicit EcalCPUDigisProducer(edm::ParameterSet const& ps); - ~EcalCPUDigisProducer() override; - static void fillDescriptions(edm::ConfigurationDescriptions&); + explicit EcalCPUDigisProducer(edm::ParameterSet const& ps); + ~EcalCPUDigisProducer() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); private: - void acquire(edm::Event const&, - edm::EventSetup const&, - edm::WaitingTaskWithArenaHolder) override; - void produce(edm::Event&, edm::EventSetup const&) override; + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; private: - edm::EDGetTokenT> digisInEBToken_, - digisInEEToken_; - edm::EDPutTokenT digisOutEBToken_; - edm::EDPutTokenT digisOutEEToken_; - - // FIXME better way to pass pointers from acquire to produce? 
- std::vector> idsebtmp, idseetmp; - std::vector> dataebtmp, dataeetmp; + edm::EDGetTokenT> digisInEBToken_, digisInEEToken_; + edm::EDPutTokenT digisOutEBToken_; + edm::EDPutTokenT digisOutEEToken_; + + // FIXME better way to pass pointers from acquire to produce? + std::vector> idsebtmp, idseetmp; + std::vector> dataebtmp, dataeetmp; }; -void EcalCPUDigisProducer::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; +void EcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; - desc.add("digisInLabelEB", - edm::InputTag{"ecalRawToDigiGPU", "ebDigisGPU"}); - desc.add("digisInLabelEE", - edm::InputTag{"ecalRawToDigiGPU", "eeDigisGPU"}); - desc.add("digisOutLabelEB", "ebDigis"); - desc.add("digisOutLabelEE", "eeDigis"); + desc.add("digisInLabelEB", edm::InputTag{"ecalRawToDigiGPU", "ebDigisGPU"}); + desc.add("digisInLabelEE", edm::InputTag{"ecalRawToDigiGPU", "eeDigisGPU"}); + desc.add("digisOutLabelEB", "ebDigis"); + desc.add("digisOutLabelEE", "eeDigis"); - std::string label = "ecalCPUDigisProducer"; - confDesc.add(label, desc); + std::string label = "ecalCPUDigisProducer"; + confDesc.add(label, desc); } -EcalCPUDigisProducer::EcalCPUDigisProducer( - const edm::ParameterSet& ps) +EcalCPUDigisProducer::EcalCPUDigisProducer(const edm::ParameterSet& ps) : digisInEBToken_{consumes>( - ps.getParameter("digisInLabelEB"))} - , digisInEEToken_{consumes>( - ps.getParameter("digisInLabelEE"))} - , digisOutEBToken_{produces( - ps.getParameter("digisOutLabelEB"))} - , digisOutEEToken_{produces( - ps.getParameter("digisOutLabelEE"))} -{} + ps.getParameter("digisInLabelEB"))}, + digisInEEToken_{ + consumes>(ps.getParameter("digisInLabelEE"))}, + digisOutEBToken_{produces(ps.getParameter("digisOutLabelEB"))}, + digisOutEEToken_{produces(ps.getParameter("digisOutLabelEE"))} {} EcalCPUDigisProducer::~EcalCPUDigisProducer() {} -void EcalCPUDigisProducer::acquire( - 
edm::Event const& event, - edm::EventSetup const& setup, - edm::WaitingTaskWithArenaHolder taskHolder) -{ - // retrieve data/ctx - auto const& ebdigisProduct = event.get(digisInEBToken_); - auto const& eedigisProduct = event.get(digisInEEToken_); - cms::cuda::ScopedContextAcquire ctx{ebdigisProduct, std::move(taskHolder)}; - auto const& ebdigis = ctx.get(ebdigisProduct); - auto const& eedigis = ctx.get(eedigisProduct); - - // resize out tmp buffers - // FIXME remove hardcoded values - idsebtmp.resize(ebdigis.ndigis); - dataebtmp.resize(ebdigis.ndigis * 10); - idseetmp.resize(eedigis.ndigis); - dataeetmp.resize(eedigis.ndigis * 10); - - // enqeue transfers - cudaCheck( cudaMemcpyAsync(dataebtmp.data(), - ebdigis.data, - dataebtmp.size() * sizeof(uint16_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(dataeetmp.data(), - eedigis.data, - dataeetmp.size() * sizeof(uint16_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(idsebtmp.data(), - ebdigis.ids, - idsebtmp.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(idseetmp.data(), - eedigis.ids, - idseetmp.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); +void EcalCPUDigisProducer::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder taskHolder) { + // retrieve data/ctx + auto const& ebdigisProduct = event.get(digisInEBToken_); + auto const& eedigisProduct = event.get(digisInEEToken_); + cms::cuda::ScopedContextAcquire ctx{ebdigisProduct, std::move(taskHolder)}; + auto const& ebdigis = ctx.get(ebdigisProduct); + auto const& eedigis = ctx.get(eedigisProduct); + + // resize out tmp buffers + // FIXME remove hardcoded values + idsebtmp.resize(ebdigis.ndigis); + dataebtmp.resize(ebdigis.ndigis * 10); + idseetmp.resize(eedigis.ndigis); + dataeetmp.resize(eedigis.ndigis * 10); + + // enqeue transfers + cudaCheck(cudaMemcpyAsync( + dataebtmp.data(), 
ebdigis.data, dataebtmp.size() * sizeof(uint16_t), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync( + dataeetmp.data(), eedigis.data, dataeetmp.size() * sizeof(uint16_t), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync( + idsebtmp.data(), ebdigis.ids, idsebtmp.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync( + idseetmp.data(), eedigis.ids, idseetmp.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, ctx.stream())); } -void EcalCPUDigisProducer::produce( - edm::Event& event, - edm::EventSetup const& setup) -{ - // output collections - auto digisEB = std::make_unique(); - auto digisEE = std::make_unique(); - digisEB->resize(idsebtmp.size()); - digisEE->resize(idseetmp.size()); - - // cast constness away - // use pointers to buffers instead of move operator= semantics - // cause we have different allocators in there... - auto *dataEB = const_cast(digisEB->data().data()); - auto *dataEE = const_cast(digisEE->data().data()); - auto *idsEB = const_cast(digisEB->ids().data()); - auto *idsEE = const_cast(digisEE->ids().data()); - - // copy data - std::memcpy(dataEB, dataebtmp.data(), dataebtmp.size() * sizeof(uint16_t)); - std::memcpy(dataEE, dataeetmp.data(), dataeetmp.size() * sizeof(uint16_t)); - std::memcpy(idsEB, idsebtmp.data(), idsebtmp.size() * sizeof(uint32_t)); - std::memcpy(idsEE, idseetmp.data(), idseetmp.size() * sizeof(uint32_t)); - - event.put(digisOutEBToken_, std::move(digisEB)); - event.put(digisOutEEToken_, std::move(digisEE)); +void EcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) { + // output collections + auto digisEB = std::make_unique(); + auto digisEE = std::make_unique(); + digisEB->resize(idsebtmp.size()); + digisEE->resize(idseetmp.size()); + + // cast constness away + // use pointers to buffers instead of move operator= semantics + // cause we have different allocators in there... 
+ auto* dataEB = const_cast(digisEB->data().data()); + auto* dataEE = const_cast(digisEE->data().data()); + auto* idsEB = const_cast(digisEB->ids().data()); + auto* idsEE = const_cast(digisEE->ids().data()); + + // copy data + std::memcpy(dataEB, dataebtmp.data(), dataebtmp.size() * sizeof(uint16_t)); + std::memcpy(dataEE, dataeetmp.data(), dataeetmp.size() * sizeof(uint16_t)); + std::memcpy(idsEB, idsebtmp.data(), idsebtmp.size() * sizeof(uint32_t)); + std::memcpy(idsEE, idseetmp.data(), idseetmp.size() * sizeof(uint32_t)); + + event.put(digisOutEBToken_, std::move(digisEB)); + event.put(digisOutEEToken_, std::move(digisEE)); } DEFINE_FWK_MODULE(EcalCPUDigisProducer); diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc index 6538cb0f32816..0133eb27d5c71 100644 --- a/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc +++ b/EventFilter/EcalRawToDigi/plugins/EcalRawESProducersGPUDefs.cc @@ -6,9 +6,7 @@ #include -using EcalElectronicsMappingGPUESProducer = EcalRawESProducerGPU< - ecal::raw::ElectronicsMappingGPU, - EcalMappingElectronics, - EcalMappingElectronicsRcd>; +using EcalElectronicsMappingGPUESProducer = + EcalRawESProducerGPU; DEFINE_FWK_EVENTSETUP_MODULE(EcalElectronicsMappingGPUESProducer); diff --git a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc index 3198017117cb6..18dc2307e9bfc 100644 --- a/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc +++ b/EventFilter/EcalRawToDigi/plugins/EcalRawToDigiGPU.cc @@ -10,7 +10,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" // algorithm specific @@ -25,151 +25,131 @@ #include "EventFilter/EcalRawToDigi/interface/DeclsForKernels.h" #include 
"EventFilter/EcalRawToDigi/interface/UnpackGPU.h" -class EcalRawToDigiGPU - : public edm::stream::EDProducer -{ +class EcalRawToDigiGPU : public edm::stream::EDProducer { public: - explicit EcalRawToDigiGPU(edm::ParameterSet const& ps); - ~EcalRawToDigiGPU() override; - static void fillDescriptions(edm::ConfigurationDescriptions&); + explicit EcalRawToDigiGPU(edm::ParameterSet const& ps); + ~EcalRawToDigiGPU() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); private: - void acquire(edm::Event const&, - edm::EventSetup const&, - edm::WaitingTaskWithArenaHolder) override; - void produce(edm::Event&, edm::EventSetup const&) override; + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; private: - edm::EDGetTokenT rawDataToken_; - edm::EDPutTokenT> digisEBToken_, - digisEEToken_; + edm::EDGetTokenT rawDataToken_; + edm::EDPutTokenT> digisEBToken_, digisEEToken_; - cms::cuda::ContextState cudaState_; + cms::cuda::ContextState cudaState_; - std::vector fedsToUnpack_; + std::vector fedsToUnpack_; - ecal::raw::ConfigurationParameters config_; - // FIXME move this to use raii - ecal::raw::InputDataCPU inputCPU_; - ecal::raw::InputDataGPU inputGPU_; - ecal::raw::OutputDataGPU outputGPU_; - ecal::raw::ScratchDataGPU scratchGPU_; - ecal::raw::OutputDataCPU outputCPU_; + ecal::raw::ConfigurationParameters config_; + // FIXME move this to use raii + ecal::raw::InputDataCPU inputCPU_; + ecal::raw::InputDataGPU inputGPU_; + ecal::raw::OutputDataGPU outputGPU_; + ecal::raw::ScratchDataGPU scratchGPU_; + ecal::raw::OutputDataCPU outputCPU_; }; -void EcalRawToDigiGPU::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; - - desc.add("InputLabel", edm::InputTag("rawDataCollector")); - std::vector feds(54); - for (uint32_t i=0; i<54; ++i) - feds[i] = i+601; - desc.add>("FEDs", feds); - 
desc.add("maxChannels", 20000); - desc.add("digisLabelEB", "ebDigisGPU"); - desc.add("digisLabelEE", "eeDigisGPU"); - - std::string label = "ecalRawToDigiGPU"; - confDesc.add(label, desc); +void EcalRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("InputLabel", edm::InputTag("rawDataCollector")); + std::vector feds(54); + for (uint32_t i = 0; i < 54; ++i) + feds[i] = i + 601; + desc.add>("FEDs", feds); + desc.add("maxChannels", 20000); + desc.add("digisLabelEB", "ebDigisGPU"); + desc.add("digisLabelEE", "eeDigisGPU"); + + std::string label = "ecalRawToDigiGPU"; + confDesc.add(label, desc); } -EcalRawToDigiGPU::EcalRawToDigiGPU( - const edm::ParameterSet& ps) - : rawDataToken_{consumes(ps.getParameter( - "InputLabel"))} - , digisEBToken_{produces>( - ps.getParameter("digisLabelEB"))} - , digisEEToken_{produces>( - ps.getParameter("digisLabelEE"))} - , fedsToUnpack_{ps.getParameter>("FEDs")} -{ - config_.maxChannels = ps.getParameter("maxChannels"); - - inputCPU_.allocate(); - inputGPU_.allocate(); - outputGPU_.allocate(config_); - scratchGPU_.allocate(config_); - outputCPU_.allocate(); +EcalRawToDigiGPU::EcalRawToDigiGPU(const edm::ParameterSet& ps) + : rawDataToken_{consumes(ps.getParameter("InputLabel"))}, + digisEBToken_{produces>(ps.getParameter("digisLabelEB"))}, + digisEEToken_{produces>(ps.getParameter("digisLabelEE"))}, + fedsToUnpack_{ps.getParameter>("FEDs")} { + config_.maxChannels = ps.getParameter("maxChannels"); + + inputCPU_.allocate(); + inputGPU_.allocate(); + outputGPU_.allocate(config_); + scratchGPU_.allocate(config_); + outputCPU_.allocate(); } EcalRawToDigiGPU::~EcalRawToDigiGPU() { - inputGPU_.deallocate(); - outputGPU_.deallocate(config_); - scratchGPU_.deallocate(config_); + inputGPU_.deallocate(); + outputGPU_.deallocate(config_); + scratchGPU_.deallocate(config_); } -void EcalRawToDigiGPU::acquire( - edm::Event const& event, - edm::EventSetup const& setup, - 
edm::WaitingTaskWithArenaHolder holder) -{ - // raii - cms::cuda::ScopedContextAcquire ctx{ - event.streamID(), std::move(holder), cudaState_}; - - // conditions - edm::ESHandle eMappingHandle; - setup.get().get(eMappingHandle); - auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream()); - - // bundle up conditions - ecal::raw::ConditionsProducts conditions{eMappingProduct}; - - // event data - edm::Handle rawDataHandle; - event.getByToken(rawDataToken_, rawDataHandle); - - // iterate over feds - // TODO: another idea - // - loop over all feds to unpack and enqueue cuda memcpy - // - accumulate the sizes - // - after the loop launch cuda memcpy for sizes - // - enqueue the kernel - uint32_t currentCummOffset = 0; - uint32_t counter = 0; - for (auto const& fed : fedsToUnpack_) { - //std::cout << "fed: " << fed << std::endl; - auto const& data = rawDataHandle->FEDData(fed); - auto const nbytes = data.size(); - - // skip empty feds - if (nbytes < ecal::raw::empty_event_size) - continue; - - // copy raw data into plain buffer - std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes); - // set the offset in bytes from the start - inputCPU_.offsets[counter] = currentCummOffset; - inputCPU_.feds[counter] = fed; - - // this is the current offset into the vector - currentCummOffset += nbytes; - ++counter; - } - - ecal::raw::entryPoint( - inputCPU_, inputGPU_, outputGPU_, scratchGPU_, outputCPU_, - conditions, ctx.stream(), counter, currentCummOffset); +void EcalRawToDigiGPU::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) { + // raii + cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_}; + + // conditions + edm::ESHandle eMappingHandle; + setup.get().get(eMappingHandle); + auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream()); + + // bundle up conditions + ecal::raw::ConditionsProducts conditions{eMappingProduct}; + + // event data + 
edm::Handle rawDataHandle; + event.getByToken(rawDataToken_, rawDataHandle); + + // iterate over feds + // TODO: another idea + // - loop over all feds to unpack and enqueue cuda memcpy + // - accumulate the sizes + // - after the loop launch cuda memcpy for sizes + // - enqueue the kernel + uint32_t currentCummOffset = 0; + uint32_t counter = 0; + for (auto const& fed : fedsToUnpack_) { + //std::cout << "fed: " << fed << std::endl; + auto const& data = rawDataHandle->FEDData(fed); + auto const nbytes = data.size(); + + // skip empty feds + if (nbytes < ecal::raw::empty_event_size) + continue; + + // copy raw data into plain buffer + std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes); + // set the offset in bytes from the start + inputCPU_.offsets[counter] = currentCummOffset; + inputCPU_.feds[counter] = fed; + + // this is the current offset into the vector + currentCummOffset += nbytes; + ++counter; + } + + ecal::raw::entryPoint( + inputCPU_, inputGPU_, outputGPU_, scratchGPU_, outputCPU_, conditions, ctx.stream(), counter, currentCummOffset); } -void EcalRawToDigiGPU::produce( - edm::Event& event, - edm::EventSetup const& setup) -{ - cms::cuda::ScopedContextProduce ctx{cudaState_}; - - // get the number of channels - auto const nchannelsEB = outputCPU_.nchannels[0]; - auto const nchannelsEE = outputCPU_.nchannels[1]; - - ecal::DigisCollection digisEB{outputGPU_.idsEB, - outputGPU_.samplesEB, nchannelsEB}; - ecal::DigisCollection digisEE{outputGPU_.idsEE, - outputGPU_.samplesEE, nchannelsEE}; - - ctx.emplace(event, digisEBToken_, std::move(digisEB)); - ctx.emplace(event, digisEEToken_, std::move(digisEE)); +void EcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup) { + cms::cuda::ScopedContextProduce ctx{cudaState_}; + + // get the number of channels + auto const nchannelsEB = outputCPU_.nchannels[0]; + auto const nchannelsEE = outputCPU_.nchannels[1]; + + ecal::DigisCollection digisEB{outputGPU_.idsEB, 
outputGPU_.samplesEB, nchannelsEB}; + ecal::DigisCollection digisEE{outputGPU_.idsEE, outputGPU_.samplesEE, nchannelsEE}; + + ctx.emplace(event, digisEBToken_, std::move(digisEB)); + ctx.emplace(event, digisEEToken_, std::move(digisEE)); } DEFINE_FWK_MODULE(EcalRawToDigiGPU); diff --git a/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc index c09a963b62a1d..8264c501a896c 100644 --- a/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc +++ b/EventFilter/EcalRawToDigi/src/ElectronicsMappingGPU.cc @@ -5,58 +5,53 @@ #include "DataFormats/EcalDetId/interface/EcalElectronicsId.h" -namespace ecal { namespace raw { - -// TODO: 0x3FFFFF * 4B ~= 16MB -// tmp solution for linear mapping of eid -> did -ElectronicsMappingGPU::ElectronicsMappingGPU(EcalMappingElectronics const& mapping) - : eid2did_(0x3FFFFF) -{ - - // fill in eb - // TODO: EB vector is actually empty - auto const& barrelValues = mapping.barrelItems(); - for (unsigned int i=0; i did + ElectronicsMappingGPU::ElectronicsMappingGPU(EcalMappingElectronics const& mapping) : eid2did_(0x3FFFFF) { + // fill in eb + // TODO: EB vector is actually empty + auto const& barrelValues = mapping.barrelItems(); + for (unsigned int i = 0; i < barrelValues.size(); i++) { EcalElectronicsId eid{barrelValues[i].electronicsid}; EBDetId did{EBDetId::unhashIndex(i)}; eid2did_[eid.linearIndex()] = did.rawId(); - } - - // fill in ee - auto const& endcapValues = mapping.endcapItems(); - for (unsigned int i=0; ieid2did_.size() * sizeof(uint32_t)) ); - - // transfer - cudaCheck( cudaMemcpyAsync(product.eid2did, - this->eid2did_.data(), - this->eid2did_.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; -} - -}} + cudaCheck(cudaMalloc((void**)&product.eid2did, this->eid2did_.size() * sizeof(uint32_t))); + + // transfer + cudaCheck(cudaMemcpyAsync(product.eid2did, + this->eid2did_.data(), + this->eid2did_.size() * sizeof(uint32_t), 
+ cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; + } + + } // namespace raw +} // namespace ecal TYPELOOKUP_DATA_REG(ecal::raw::ElectronicsMappingGPU); diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu index 8c9f05535b70d..a2e5057bbbf6a 100644 --- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu +++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu @@ -1,476 +1,331 @@ -#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" #include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h" +#include "EventFilter/EcalRawToDigi/interface/UnpackGPU.h" -namespace ecal { namespace raw { +namespace ecal { + namespace raw { -__forceinline__ __device__ -void print_raw_buffer( - uint8_t const* const buffer, - uint32_t const nbytes, uint32_t const nbytes_per_row = 20) { - for (uint32_t i=0; i0) - printf("\n"); + __forceinline__ __device__ void print_raw_buffer(uint8_t const* const buffer, + uint32_t const nbytes, + uint32_t const nbytes_per_row = 20) { + for (uint32_t i = 0; i < nbytes; i++) { + if (i % nbytes_per_row == 0 && i > 0) + printf("\n"); printf("%02X ", buffer[i]); + } } -} -__forceinline__ __device__ -void print_first3bits(uint64_t const* buffer, uint32_t size) { - for (uint32_t i=0; i> 61) & 0x1; uint8_t const b62 = (buffer[i] >> 62) & 0x1; uint8_t const b63 = (buffer[i] >> 63) & 0x1; - printf("[word: %u] %u%u%u\n", i, - b63, b62, b61); + printf("[word: %u] %u%u%u\n", i, b63, b62, b61); + } + } + + __forceinline__ __device__ bool is_barrel(uint8_t dccid) { + return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && dccid <= ElectronicsIdGPU::MAX_DCCID_EBP; + } + + __forceinline__ __device__ uint8_t fed2dcc(int fed) { return static_cast(fed - 600); } + + __forceinline__ __device__ int zside_for_eb(ElectronicsIdGPU const& eid) { + int dcc = eid.dccId(); + return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) ? 
-1 : 1; } -} - -__forceinline__ __device__ -bool is_barrel(uint8_t dccid) { - return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && - dccid <= ElectronicsIdGPU::MAX_DCCID_EBP; -} - -__forceinline__ __device__ -uint8_t fed2dcc(int fed) { return static_cast(fed - 600); } - -__forceinline__ __device__ -int zside_for_eb(ElectronicsIdGPU const& eid) { - int dcc = eid.dccId(); - return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && - dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) - ? -1 - : 1; - /* - if ((dcc >= MIN_DCCID_EBP && dcc <= MAX_DCCID_EBP)) - return +1; - */ -} - -__forceinline__ __device__ -bool is_synced_towerblock( - uint16_t const dccbx, - uint16_t const bx, - uint16_t const dccl1, - uint16_t const l1) { - bool const bxsync = (bx==0 && dccbx==3564) || (bx==dccbx && dccbx!=3564); - bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff)); - return bxsync && l1sync; -} - -__forceinline__ __device__ -bool right_tower_for_eb(int tower) { - // for EB, two types of tower (LVRB top/bottom) - if ((tower > 12 && tower < 21) || - (tower > 28 && tower < 37) || - (tower > 44 && tower < 53) || - (tower > 60 && tower < 69)) + + __forceinline__ __device__ bool is_synced_towerblock(uint16_t const dccbx, + uint16_t const bx, + uint16_t const dccl1, + uint16_t const l1) { + bool const bxsync = (bx == 0 && dccbx == 3564) || (bx == dccbx && dccbx != 3564); + bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff)); + return bxsync && l1sync; + } + + __forceinline__ __device__ bool right_tower_for_eb(int tower) { + // for EB, two types of tower (LVRB top/bottom) + if ((tower > 12 && tower < 21) || (tower > 28 && tower < 37) || (tower > 44 && tower < 53) || + (tower > 60 && tower < 69)) return true; - else + else return false; -} - -__forceinline__ __device__ -uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) { - // as in Geometry/EcalMaping/.../EcalElectronicsMapping - auto const dcc = eid.dccId(); - auto const tower = eid.towerId(); - auto const strip = eid.stripId(); - auto const xtal = 
eid.xtalId(); - - int smid = 0; - int iphi = 0; - bool EBPlus = (zside_for_eb(eid) > 0); - bool EBMinus = !EBPlus; - - if (zside_for_eb(eid) < 0) { + } + + __forceinline__ __device__ uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) { + // as in Geometry/EcalMaping/.../EcalElectronicsMapping + auto const dcc = eid.dccId(); + auto const tower = eid.towerId(); + auto const strip = eid.stripId(); + auto const xtal = eid.xtalId(); + + int smid = 0; + int iphi = 0; + bool EBPlus = (zside_for_eb(eid) > 0); + bool EBMinus = !EBPlus; + + if (zside_for_eb(eid) < 0) { smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM; iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi; iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi); - } else { + } else { smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP; iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi; iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1); - } + } - bool RightTower = right_tower_for_eb(tower); - int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1; - if (RightTower) { + bool RightTower = right_tower_for_eb(tower); + int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1; + if (RightTower) { ieta += (strip - 1); if (strip % 2 == 1) { - if (EBMinus) - iphi += (xtal - 1) + 1; - else - iphi += (4 - (xtal - 1)) + 1; + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - (xtal - 1)) + 1; } else { - if (EBMinus) - iphi += (4 - (xtal - 1)) + 1; - else - iphi += (xtal - 1) + 1; + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) + 1; } - } else { + } else { ieta += 4 - (strip - 1); if (strip % 2 == 1) { - if (EBMinus) - iphi += (4 - (xtal - 1)) + 1; - else - iphi += (xtal - 1) + 1; + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) + 1; } else { - if (EBMinus) - iphi += (xtal - 1) + 1; - else - iphi += (4 - (xtal - 1)) + 1; + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - 
(xtal - 1)) + 1; } - } - - if (zside_for_eb(eid) < 0) + } + + if (zside_for_eb(eid) < 0) ieta = -ieta; - DetId did{DetId::Ecal, EcalBarrel}; - return did.rawId() | - ((ieta > 0) - ? (0x10000 | (ieta << 9)) - : ((-ieta) << 9)) | (iphi & 0x1FF); -} - -__forceinline__ __device__ -int adc(uint16_t sample) { return sample & 0xfff; } -__forceinline__ __device__ -int gainId(uint16_t sample) { return (sample>>12) & 0x3; } - -template -__global__ -void kernel_unpack_test( - unsigned char const* __restrict__ data, - uint32_t const* __restrict__ offsets, - int const* __restrict__ feds, - uint16_t *samplesEB, - uint16_t *samplesEE, - uint32_t *idsEB, - uint32_t *idsEE, - uint32_t *pChannelsCounterEBEE, - uint32_t const* eid2did, - uint32_t const nbytesTotal) { - // indices - auto const ifed = blockIdx.x; - - // FIXME: use only the very first fed - //if (ifed!=10) return; - - // offset in bytes - auto const offset = offsets[ifed]; - // fed id - auto const fed = feds[ifed]; - auto const isBarrel = is_barrel(static_cast(fed - 600)); - // size - auto const size = ifed==gridDim.x-1 ? nbytesTotal - offset : offsets[ifed+1] - offset; - auto *samples = isBarrel ? samplesEB : samplesEE; - auto *ids = isBarrel ? idsEB : idsEE; - auto *pChannelsCounter = isBarrel - ? 
&pChannelsCounterEBEE[0] - : &pChannelsCounterEBEE[1]; - - // FIXME: debugging - //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size); - - // offset to the right raw buffer - uint64_t const* buffer = reinterpret_cast(data + offset); - - // dump first 3 bits for each 64-bit word - //print_first3bits(buffer, size / 8); - - // - // fed header - // - //print_raw_buffer(reinterpret_cast(buffer), 8); - //printf("\n"); - auto const fed_header = buffer[0]; - uint32_t fed_id = (fed_header >> 8) & 0xfff; - uint32_t bx = (fed_header >> 20) & 0xfff; - uint32_t lv1 = (fed_header >> 32) & 0xffffff; - uint8_t trigger_type = (fed_header >> 56) & 0xf; - uint8_t const bid_fed_header = (fed_header >> 60) & 0xf; - //printf("fed = %d fed_id = %u bx = %u lv1 = %u tt=%hhu bid = 0x%u\n", - // fed, fed_id, bx, lv1, trigger_type, bid_fed_header); - - // - // dcc header: w1 - // - //print_raw_buffer(reinterpret_cast(buffer + 1), 8); - //printf("\n"); - auto const dcc_header = buffer[1]; - uint32_t event_length = dcc_header & 0xffffff; - uint8_t dcc_errors = (dcc_header >> 24) & 0xff; - uint32_t run_number = (dcc_header >> 32) & 0xffffff; - uint8_t const word_dcc = (dcc_header >> 56) & 0x3f; - uint8_t const bid_dcc_header = (dcc_header >> 62) & 0x3; - //printf("fed = %d size = %u event_length = %u dcc_errors = %u run_number = %u word_dcc = 0x%u bid_dcc_header = 0x%u\n", - // fed, size, 8*event_length, static_cast(dcc_errors), run_number, static_cast(word_dcc), static_cast(bid_dcc_header)); - - // - // dcc header w2 - // - //print_raw_buffer(reinterpret_cast(buffer + 2), 8); - //printf("\n"); - auto const w2 = buffer[2]; - uint32_t const run_type = w2 & 0xffffffff; - uint16_t const det_trigger_type = (w2 >> 32) & 0xffff; - uint8_t w2_dcc = (w2 >> 56) & 0x3f; - uint8_t w2_bid_dcc = (w2 >> 62) & 0x3; - //printf("run_type = %u det_trigger_type = %u w2_dcc = %u w2_bid_dcc = %u\n", - // run_type, det_trigger_type, w2_dcc, w2_bid_dcc); - - // - // dcc header w3 - // - auto 
const w3 = buffer[3]; - //print_raw_buffer(reinterpret_cast(&w3), 8); - //printf("\n"); - uint32_t const orbit_number = w3 & 0xffffffff; - uint8_t const sr = (w3 >> 32) & 0x1; - uint8_t const zs = (w3 >> 33) & 0x1; - uint8_t const tzs = (w3 >> 34) & 0x1; - uint8_t const sr_chstatus = (w3 >> 36) & 0xf; - uint8_t const tcc_chstatus1 = (w3 >> 40) & 0xf; - uint8_t const tcc_chstatus2 = (w3 >> 44) & 0xf; - uint8_t const tcc_chstatus3 = (w3 >> 48) & 0xf; - uint8_t const tcc_chstatus4 = (w3 >> 52) & 0xf; - uint8_t const w3_dcc = (w3 >> 56) & 0x3f; - uint8_t const w3_bid_dcc = (w3 >> 62) & 0x3; - //printf("orbit_number = %u sr = %u zs = %u tzs = %u sr_chstatus = %u\n", - // orbit_number, static_cast(sr), static_cast(zs), - // static_cast(tzs), static_cast(sr_chstatus)); - //printf("tcc_chstatus1 = %u tcc_chstatus2 = %u tcc_chstatus3 = %u tcc_chstatus4 = %u\n", - // static_cast(tcc_chstatus1), static_cast(tcc_chstatus2), - // static_cast(tcc_chstatus3), static_cast(tcc_chstatus4)); - - // - // w4 - w8 (including 5 64-bit words) - // - /* - for (uint32_t i=0; i<5; i++) { - auto const wi = buffer[4 + i]; - for (uint32_t i=0; i<14; i++) { - uint8_t value_i = (wi >> i*4) & 0xf; - printf("fe_chstatus_%u = %u ", i, static_cast(value_i)); - } - uint8_t wi_dcc = (wi >> 56) & 0x3f; - uint8_t wi_bid_dcc = (wi >> 62) & 0x3; - printf("wi_dcc = %u wi_bid-dcc = %u\n", - static_cast(wi_dcc), static_cast(wi_bid_dcc)); - printf("\n"); - } - */ - - // - // TCC block - // - { - auto const w = buffer[9]; - //print_raw_buffer(reinterpret_cast(&w), 8); - //printf("\n"); - uint8_t const tccid = w & 0xff; - uint8_t const bxlocal = (w >> 16) & 0xff; - uint8_t const e0 = (w >> 17) & 0x1; - uint8_t const w_bfield_0 = (w >> 29) & 0x7; - uint16_t const lv1local = (w >> 32) & 0xfff; - uint8_t const e1 = (w >> 44) & 0x1; - uint8_t const ntt = (w >> 48) & 0x7f; - uint8_t const ntimesamples = (w >> 55) & 0xf; - uint8_t const le0 = (w >> 59) & 0x1; - uint8_t const le1 = (w >> 60) & 0x1; - uint8_t const 
w_bfield_1 = (w >> 61) & 0x7; - //printf("tccid = %u bxlocal = %u e0 = %u w_bitfield_0 = %u lv1local = %u\n", - // tccid, bxlocal, e0, w_bfield_0, lv1local); - //printf("e1 = %u ntt = %u ntimesamples = %u le0 = %u le1 = %u w_bfield_1 = %u\n", - // e1, ntt, ntimesamples, le0, le1, w_bfield_1); + DetId did{DetId::Ecal, EcalBarrel}; + return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF); } - // 9 for fed + dcc header - // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block - // 6 for SR block size - //print_first3bits(buffer, size / 8); - //auto const* tower_block_start = buffer + 9 + 36 + 6; - //print_first3bits(tower_block_start, size / 8 - 10 - 36 - 6); - - // - // print Tower block headers - // - uint8_t ntccblockwords = isBarrel ? 18 : 36; - auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6; - auto const* trailer = buffer + (size / 8 - 1); - auto const* current_tower_block = tower_blocks_start; - while (current_tower_block != trailer) { + __forceinline__ __device__ int adc(uint16_t sample) { return sample & 0xfff; } + + __forceinline__ __device__ int gainId(uint16_t sample) { return (sample >> 12) & 0x3; } + + template + __global__ void kernel_unpack_test(unsigned char const* __restrict__ data, + uint32_t const* __restrict__ offsets, + int const* __restrict__ feds, + uint16_t* samplesEB, + uint16_t* samplesEE, + uint32_t* idsEB, + uint32_t* idsEE, + uint32_t* pChannelsCounterEBEE, + uint32_t const* eid2did, + uint32_t const nbytesTotal) { + // indices + auto const ifed = blockIdx.x; + + // FIXME: use only the very first fed + //if (ifed!=10) return; + + // offset in bytes + auto const offset = offsets[ifed]; + // fed id + auto const fed = feds[ifed]; + auto const isBarrel = is_barrel(static_cast(fed - 600)); + // size + auto const size = ifed == gridDim.x - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset; + auto* samples = isBarrel ? samplesEB : samplesEE; + auto* ids = isBarrel ? 
idsEB : idsEE; + auto* pChannelsCounter = isBarrel ? &pChannelsCounterEBEE[0] : &pChannelsCounterEBEE[1]; + + // FIXME: debugging + //printf("ifed = %u fed = %d offset = %u size = %u\n", ifed, fed, offset, size); + + // offset to the right raw buffer + uint64_t const* buffer = reinterpret_cast(data + offset); + + // dump first 3 bits for each 64-bit word + //print_first3bits(buffer, size / 8); + + // + // fed header + // + auto const fed_header = buffer[0]; + uint32_t bx = (fed_header >> 20) & 0xfff; + uint32_t lv1 = (fed_header >> 32) & 0xffffff; + + // 9 for fed + dcc header + // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block + // 6 for SR block size + + // + // print Tower block headers + // + uint8_t ntccblockwords = isBarrel ? 18 : 36; + auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6; + auto const* trailer = buffer + (size / 8 - 1); + auto const* current_tower_block = tower_blocks_start; + while (current_tower_block != trailer) { auto const w = *current_tower_block; uint8_t ttid = w & 0xff; - uint8_t ntimesamples = (w >> 8) & 0x7f; uint16_t bxlocal = (w >> 16) & 0xfff; - uint8_t e0 = (w >> 28) & 0x1; - uint8_t w_bfield_0 = (w >> 30) & 0x3; uint16_t lv1local = (w >> 32) & 0xfff; - uint8_t e1 = (w >> 44) & 0x1; uint16_t block_length = (w >> 48) & 0x1ff; - uint16_t w_bfield_1 = (w >> 62) & 0x3; - // uint16_t const dccbx = bx & 0xfff; uint16_t const dccl1 = lv1 & 0xfff; - //printf("dccbx = %u bxlocal = %u dccl1 = %u l1local = %u\n", - // dccbx, bxlocal, dccl1, lv1local); if (!is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) { - current_tower_block += block_length; - continue; + current_tower_block += block_length; + continue; } - //printf("ttid = %u ntimesamples = %u\ bxlocal = %u e0 = %u w_bfield_0 = %u\n", - // ttid, ntimesamples, bxlocal, e0, w_bfield_0); - //printf("lv1local = %u e1 = %u block_length = %u w_bfield-1 = %u\n", - // lv1local, e1, block_length, w_bfield_1); - - // go thru all the channels + // go through all the channels 
// get the next channel coordinates uint32_t nchannels = (block_length - 1) / 3; // 1 threads per channel in this block - for (uint32_t ich=0; ich leave the loop - if (i_to_access>=nchannels) break; - - // inc the channel's counter and get the pos where to store - auto const wdata = current_tower_block[1 + i_to_access*3]; - uint8_t const stripid = wdata & 0x7; - uint8_t const xtalid = (wdata >> 4) & 0x7; - ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid}; - auto const didraw = isBarrel - ? compute_ebdetid(eid) - : eid2did[eid.linearIndex()]; - // FIXME: what kind of channels are these guys - if (didraw == 0) - continue; - - // get samples - uint16_t sampleValues[10]; - sampleValues[0] = (wdata >> 16) & 0x3fff; - sampleValues[1] = (wdata >> 32) & 0x3fff; - sampleValues[2] = (wdata >> 48) & 0x3fff; - auto const wdata1 = current_tower_block[2+i_to_access*3]; - sampleValues[3] = wdata1 & 0x3fff; - sampleValues[4] = (wdata1 >> 16) & 0x3fff; - sampleValues[5] = (wdata1 >> 32) & 0x3fff; - sampleValues[6] = (wdata1 >> 48) & 0x3fff; - auto const wdata2 = current_tower_block[3+i_to_access*3]; - sampleValues[7] = wdata2 & 0x3fff; - sampleValues[8] = (wdata2 >> 16) & 0x3fff; - sampleValues[9] = (wdata2 >> 32) & 0x3fff; - //printf("stripid = %u xtalid = %u\n", stripid, xtalid); - - // check gain - bool isSaturation = true; - short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1}; - for (uint32_t si=0; si<10; si++) { - if (gainId(sampleValues[si]) == 0) { - firstGainZeroSampID = si; - firstGainZeroSampADC = adc(sampleValues[si]); - break; - } + for (uint32_t ich = 0; ich < nchannels; ich += NTHREADS) { + auto const i_to_access = ich + threadIdx.x; + // threads outside of the range -> leave the loop + if (i_to_access >= nchannels) + break; + + // inc the channel's counter and get the pos where to store + auto const wdata = current_tower_block[1 + i_to_access * 3]; + uint8_t const stripid = wdata & 0x7; + uint8_t const xtalid = (wdata >> 4) & 0x7; + ElectronicsIdGPU 
eid{fed2dcc(fed), ttid, stripid, xtalid}; + auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()]; + // FIXME: what kind of channels are these guys + if (didraw == 0) + continue; + + // get samples + uint16_t sampleValues[10]; + sampleValues[0] = (wdata >> 16) & 0x3fff; + sampleValues[1] = (wdata >> 32) & 0x3fff; + sampleValues[2] = (wdata >> 48) & 0x3fff; + auto const wdata1 = current_tower_block[2 + i_to_access * 3]; + sampleValues[3] = wdata1 & 0x3fff; + sampleValues[4] = (wdata1 >> 16) & 0x3fff; + sampleValues[5] = (wdata1 >> 32) & 0x3fff; + sampleValues[6] = (wdata1 >> 48) & 0x3fff; + auto const wdata2 = current_tower_block[3 + i_to_access * 3]; + sampleValues[7] = wdata2 & 0x3fff; + sampleValues[8] = (wdata2 >> 16) & 0x3fff; + sampleValues[9] = (wdata2 >> 32) & 0x3fff; + //printf("stripid = %u xtalid = %u\n", stripid, xtalid); + + // check gain + bool isSaturation = true; + short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1}; + for (uint32_t si = 0; si < 10; si++) { + if (gainId(sampleValues[si]) == 0) { + firstGainZeroSampID = si; + firstGainZeroSampADC = adc(sampleValues[si]); + break; + } + } + if (firstGainZeroSampID != -1) { + unsigned int plateauEnd = std::min(10u, (unsigned int)(firstGainZeroSampID + 5)); + for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) { + if (gainId(sampleValues[s]) == 0 && adc(sampleValues[s]) == firstGainZeroSampADC) { + ; + } else { + isSaturation = false; + break; + } //it's not saturation + } + // get rid of channels which are stuck in gain0 + if (firstGainZeroSampID < 3) { + isSaturation = false; } - if (firstGainZeroSampID!=-1) { - unsigned int plateauEnd = std::min(10u ,(unsigned int)(firstGainZeroSampID+5)); - for (unsigned int s=firstGainZeroSampID; s gainId(sampleValues[si])) && - numGain<5) gainSwitchError=true; - if (gainId(sampleValues[si-1]) == gainId(sampleValues[si])) numGain++; - else numGain=1; - } - if (gainSwitchError) - continue; + if (!isSaturation) + continue; + 
} else { // there is no zero gainId sample + // gain switch check + short numGain = 1; + bool gainSwitchError = false; + for (unsigned int si = 1; si < 10; si++) { + if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5) + gainSwitchError = true; + if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si])) + numGain++; + else + numGain = 1; } - - auto const pos = atomicAdd(pChannelsCounter, 1); - - // store to global - ids[pos] = didraw; - samples[pos*10] = sampleValues[0]; - samples[pos*10 + 1] = sampleValues[1]; - samples[pos*10 + 2] = sampleValues[2]; - samples[pos*10 + 3] = sampleValues[3]; - samples[pos*10 + 4] = sampleValues[4]; - samples[pos*10 + 5] = sampleValues[5]; - samples[pos*10 + 6] = sampleValues[6]; - samples[pos*10 + 7] = sampleValues[7]; - samples[pos*10 + 8] = sampleValues[8]; - samples[pos*10 + 9] = sampleValues[9]; + if (gainSwitchError) + continue; + } + + auto const pos = atomicAdd(pChannelsCounter, 1); + + // store to global + ids[pos] = didraw; + samples[pos * 10] = sampleValues[0]; + samples[pos * 10 + 1] = sampleValues[1]; + samples[pos * 10 + 2] = sampleValues[2]; + samples[pos * 10 + 3] = sampleValues[3]; + samples[pos * 10 + 4] = sampleValues[4]; + samples[pos * 10 + 5] = sampleValues[5]; + samples[pos * 10 + 6] = sampleValues[6]; + samples[pos * 10 + 7] = sampleValues[7]; + samples[pos * 10 + 8] = sampleValues[8]; + samples[pos * 10 + 9] = sampleValues[9]; } current_tower_block += block_length; + } } -} - -void entryPoint( - InputDataCPU const& inputCPU, - InputDataGPU& inputGPU, - OutputDataGPU& outputGPU, - ScratchDataGPU& scratchGPU, - OutputDataCPU& outputCPU, - ConditionsProducts const& conditions, - cudaStream_t cudaStream, - uint32_t const nfedsWithData, - uint32_t const nbytesTotal) { - // transfer - cudaCheck( cudaMemcpyAsync(inputGPU.data, - inputCPU.data.data(), - nbytesTotal * sizeof(unsigned char), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(inputGPU.offsets, - 
inputCPU.offsets.data(), - nfedsWithData * sizeof(uint32_t), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemsetAsync(scratchGPU.pChannelsCounter, - 0, - sizeof(uint32_t) * 2, // EB + EE - cudaStream) ); - cudaCheck( cudaMemcpyAsync(inputGPU.feds, - inputCPU.feds.data(), - nfedsWithData * sizeof(int), - cudaMemcpyHostToDevice, - cudaStream) ); - - kernel_unpack_test<32><<>>( - inputGPU.data, - inputGPU.offsets, - inputGPU.feds, - outputGPU.samplesEB, - outputGPU.samplesEE, - outputGPU.idsEB, - outputGPU.idsEE, - scratchGPU.pChannelsCounter, - conditions.eMappingProduct.eid2did, - nbytesTotal - ); - cudaCheck( cudaGetLastError() ); - - // transfer the counters for how many eb and ee channels we got - cudaCheck( cudaMemcpyAsync(outputCPU.nchannels.data(), - scratchGPU.pChannelsCounter, - sizeof(uint32_t) * 2, - cudaMemcpyDeviceToHost, - cudaStream) ); -} - -}} + + void entryPoint(InputDataCPU const& inputCPU, + InputDataGPU& inputGPU, + OutputDataGPU& outputGPU, + ScratchDataGPU& scratchGPU, + OutputDataCPU& outputCPU, + ConditionsProducts const& conditions, + cudaStream_t cudaStream, + uint32_t const nfedsWithData, + uint32_t const nbytesTotal) { + // transfer + cudaCheck(cudaMemcpyAsync( + inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync(inputGPU.offsets, + inputCPU.offsets.data(), + nfedsWithData * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounter, + 0, + sizeof(uint32_t) * 2, // EB + EE + cudaStream)); + cudaCheck(cudaMemcpyAsync( + inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); + + kernel_unpack_test<32><<>>(inputGPU.data, + inputGPU.offsets, + inputGPU.feds, + outputGPU.samplesEB, + outputGPU.samplesEE, + outputGPU.idsEB, + outputGPU.idsEE, + scratchGPU.pChannelsCounter, + conditions.eMappingProduct.eid2did, + nbytesTotal); + 
cudaCheck(cudaGetLastError()); + + // transfer the counters for how many eb and ee channels we got + cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(), + scratchGPU.pChannelsCounter, + sizeof(uint32_t) * 2, + cudaMemcpyDeviceToHost, + cudaStream)); + } + + } // namespace raw +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp index e0cca70f93795..4d50b758d39f3 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp @@ -16,268 +16,288 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" int main(int argc, char *argv[]) { - if (argc<3) { - std::cout << "run with: ./validateGPU \n"; - exit(0); - } + if (argc < 3) { + std::cout << "run with: ./validateGPU \n"; + exit(0); + } - edm::Wrapper> *wgpuEB=nullptr; - edm::Wrapper> *wgpuEE=nullptr; - edm::Wrapper *wcpuEB = nullptr; - edm::Wrapper *wcpuEE = nullptr; + edm::Wrapper> *wgpuEB = nullptr; + edm::Wrapper> *wgpuEE = nullptr; + edm::Wrapper *wcpuEB = nullptr; + edm::Wrapper *wcpuEE = nullptr; - std::string fileName = argv[1]; - std::string outFileName = argv[2]; + std::string fileName = argv[1]; + std::string outFileName = argv[2]; - // output - TFile rfout{outFileName.c_str(), "recreate"}; + // output + TFile rfout{outFileName.c_str(), "recreate"}; - int nbins = 300; - float last = 3000.; + int nbins = 300; + float last = 3000.; - int nbins_chi2 = 1000; - float last_chi2 = 1000.; + int nbins_chi2 = 1000; + float last_chi2 = 1000.; - int nbins_delta = 201; // use an odd number to center around 0 - float delta = 0.2; + int nbins_delta = 201; // use an odd number to center around 0 + float delta = 0.2; - auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last); - auto hSOIAmplitudesEEGPU = new 
TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last); - auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last); - auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last); - auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last); + auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last); + auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last); + auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last); + auto hSOIAmplitudesEBGPUCPUratio = + new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hSOIAmplitudesEEGPUCPUratio = + new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2); - auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2); - auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2); - auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2); + auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2); + auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2); + auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2); + auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2); - auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, 
nbins, 0, last); - auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); - auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hSOIAmplitudesEEdeltavsCPU = new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hSOIAmplitudesEBGPUvsCPU = + new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last); + auto hSOIAmplitudesEEGPUvsCPU = + new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); + auto hSOIAmplitudesEBdeltavsCPU = + new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hSOIAmplitudesEEdeltavsCPU = + new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); - auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); - auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + auto hChi2EBGPUvsCPU = + new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); + auto hChi2EEGPUvsCPU = + new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); + auto hChi2EBdeltavsCPU = + new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + auto hChi2EEdeltavsCPU = + new TH2D("hChi2EEdeltavsCPU", 
"hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - // input - std::cout << "validating file " << fileName << std::endl; - TFile rf{fileName.c_str()}; - TTree *rt = (TTree*)rf.Get("Events"); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE); - rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB); - rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE); + // input + std::cout << "validating file " << fileName << std::endl; + TFile rf{fileName.c_str()}; + TTree *rt = (TTree *)rf.Get("Events"); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", + &wgpuEB); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", + &wgpuEE); + rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB); + rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE); - constexpr float eps_diff = 1e-3; + constexpr float eps_diff = 1e-3; - // accumulate - auto const nentries = rt->GetEntries(); - std::cout << "#events to validate over: " << nentries << std::endl; - for (int ie=0; ieGetEntry(ie); + // accumulate + auto const nentries = rt->GetEntries(); + std::cout << "#events to validate over: " << nentries << std::endl; + for (int ie = 0; ie < nentries; ++ie) { + rt->GetEntry(ie); - const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" }; - auto cpu_eb_size = wcpuEB->bareProduct().size(); - auto cpu_ee_size = wcpuEE->bareProduct().size(); - auto gpu_eb_size = 
wgpuEB->bareProduct().amplitude.size(); - auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size(); - if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { - std::cerr << ie << ordinal[ie % 10] << " entry:\n" - << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n" - << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl; - continue; - } + const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"}; + auto cpu_eb_size = wcpuEB->bareProduct().size(); + auto cpu_ee_size = wcpuEE->bareProduct().size(); + auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size(); + auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size(); + if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { + std::cerr << ie << ordinal[ie % 10] << " entry:\n" + << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size + << " (gpu)\n" + << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size + << " (gpu)" << std::endl; + continue; + } - assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size()); - assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size()); - auto const neb = wcpuEB->bareProduct().size(); - auto const nee = wcpuEE->bareProduct().size(); + assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size()); + assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size()); + auto const neb = wcpuEB->bareProduct().size(); + auto const nee = wcpuEE->bareProduct().size(); - for (uint32_t i=0; ibareProduct().did[i]; - auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i]; - auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); - if (cpu_iter == wcpuEB->bareProduct().end()) { - std::cerr << ie << ordinal[ie % 10] << " entry\n" - << " Did not find a DetId " << did_gpu - << " 
in a CPU collection\n"; - continue; - } - auto const soi_amp_cpu = cpu_iter->amplitude(); - auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; - auto const chi2_cpu = cpu_iter->chi2(); + for (uint32_t i = 0; i < neb; ++i) { + auto const did_gpu = wgpuEB->bareProduct().did[i]; + auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i]; + auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); + if (cpu_iter == wcpuEB->bareProduct().end()) { + std::cerr << ie << ordinal[ie % 10] << " entry\n" + << " Did not find a DetId " << did_gpu << " in a CPU collection\n"; + continue; + } + auto const soi_amp_cpu = cpu_iter->amplitude(); + auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; + auto const chi2_cpu = cpu_iter->chi2(); - hSOIAmplitudesEBGPU->Fill(soi_amp_gpu); - hSOIAmplitudesEBCPU->Fill(soi_amp_cpu); - hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); - hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); - hChi2EBGPU->Fill(chi2_gpu); - hChi2EBCPU->Fill(chi2_cpu); - hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + hSOIAmplitudesEBGPU->Fill(soi_amp_gpu); + hSOIAmplitudesEBCPU->Fill(soi_amp_cpu); + hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); + hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); + hSOIAmplitudesEBGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu); + hChi2EBGPU->Fill(chi2_gpu); + hChi2EBCPU->Fill(chi2_cpu); + hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); + hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or - (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)) - { - printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); - if (std::isnan(chi2_gpu)) - printf("*** nan ***\n"); - } - } + if 
((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or + std::isnan(chi2_gpu)) { + printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", + ie, + i, + soi_amp_gpu, + soi_amp_cpu, + chi2_gpu, + chi2_cpu); + if (std::isnan(chi2_gpu)) + printf("*** nan ***\n"); + } + } - for (uint32_t i=0; ibareProduct().did[i]; - auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i]; - auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); - if (cpu_iter == wcpuEE->bareProduct().end()) { - std::cerr << ie << ordinal[ie % 10] << " entry\n" - << " did not find a DetId " << did_gpu - << " in a CPU collection\n"; - continue; - } - auto const soi_amp_cpu = cpu_iter->amplitude(); - auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; - auto const chi2_cpu = cpu_iter->chi2(); + for (uint32_t i = 0; i < nee; ++i) { + auto const did_gpu = wgpuEE->bareProduct().did[i]; + auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i]; + auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); + if (cpu_iter == wcpuEE->bareProduct().end()) { + std::cerr << ie << ordinal[ie % 10] << " entry\n" + << " did not find a DetId " << did_gpu << " in a CPU collection\n"; + continue; + } + auto const soi_amp_cpu = cpu_iter->amplitude(); + auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; + auto const chi2_cpu = cpu_iter->chi2(); - hSOIAmplitudesEEGPU->Fill(soi_amp_gpu); - hSOIAmplitudesEECPU->Fill(soi_amp_cpu); - hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); - hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); - hChi2EEGPU->Fill(chi2_gpu); - hChi2EECPU->Fill(chi2_cpu); - hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + hSOIAmplitudesEEGPU->Fill(soi_amp_gpu); + hSOIAmplitudesEECPU->Fill(soi_amp_cpu); + hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, 
soi_amp_gpu); + hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); + hSOIAmplitudesEEGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu); + hChi2EEGPU->Fill(chi2_gpu); + hChi2EECPU->Fill(chi2_cpu); + hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); + hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or - (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)) - { - printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, static_cast(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); - if (std::isnan(chi2_gpu)) - printf("*** nan ***\n"); - } - } + if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or + std::isnan(chi2_gpu)) { + printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", + ie, + static_cast(neb + i), + soi_amp_gpu, + soi_amp_cpu, + chi2_gpu, + chi2_cpu); + if (std::isnan(chi2_gpu)) + printf("*** nan ***\n"); + } } + } - { - TCanvas c("plots", "plots", 4200, 6200); - c.Divide(2, 4); - - c.cd(1); - { - gPad->SetLogy(); - hSOIAmplitudesEBCPU->SetLineColor(kBlack); - hSOIAmplitudesEBCPU->SetLineWidth(1.); - hSOIAmplitudesEBCPU->Draw(""); - hSOIAmplitudesEBGPU->SetLineColor(kBlue); - hSOIAmplitudesEBGPU->SetLineWidth(1.); - hSOIAmplitudesEBGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hSOIAmplitudesEBGPU->FindObject("stats"); - auto y2 = stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - c.cd(2); - { - gPad->SetLogy(); - hSOIAmplitudesEECPU->SetLineColor(kBlack); - hSOIAmplitudesEECPU->SetLineWidth(1.); - hSOIAmplitudesEECPU->Draw(""); - hSOIAmplitudesEEGPU->SetLineColor(kBlue); - hSOIAmplitudesEEGPU->SetLineWidth(1.); - hSOIAmplitudesEEGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hSOIAmplitudesEEGPU->FindObject("stats"); - auto y2 = 
stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - c.cd(3); - hSOIAmplitudesEBGPUvsCPU->Draw("COLZ"); - c.cd(4); - hSOIAmplitudesEEGPUvsCPU->Draw("COLZ"); - c.cd(5); - hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); - c.cd(6); - hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); - c.cd(7); - { - gPad->SetLogy(); - hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack); - hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.); - hSOIAmplitudesEBGPUCPUratio->Draw(""); - } - c.cd(8); - { - gPad->SetLogy(); - hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack); - hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.); - hSOIAmplitudesEEGPUCPUratio->Draw(""); - } + { + TCanvas c("plots", "plots", 4200, 6200); + c.Divide(2, 4); - c.SaveAs("ecal-amplitudes.pdf"); + c.cd(1); + { + gPad->SetLogy(); + hSOIAmplitudesEBCPU->SetLineColor(kBlack); + hSOIAmplitudesEBCPU->SetLineWidth(1.); + hSOIAmplitudesEBCPU->Draw(""); + hSOIAmplitudesEBGPU->SetLineColor(kBlue); + hSOIAmplitudesEBGPU->SetLineWidth(1.); + hSOIAmplitudesEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats *)hSOIAmplitudesEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(2); + { + gPad->SetLogy(); + hSOIAmplitudesEECPU->SetLineColor(kBlack); + hSOIAmplitudesEECPU->SetLineWidth(1.); + hSOIAmplitudesEECPU->Draw(""); + hSOIAmplitudesEEGPU->SetLineColor(kBlue); + hSOIAmplitudesEEGPU->SetLineWidth(1.); + hSOIAmplitudesEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats *)hSOIAmplitudesEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); } + c.cd(3); + hSOIAmplitudesEBGPUvsCPU->Draw("COLZ"); + c.cd(4); + hSOIAmplitudesEEGPUvsCPU->Draw("COLZ"); + c.cd(5); + hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); + c.cd(6); + hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); + c.cd(7); { - 
TCanvas c("plots", "plots", 4200, 6200); - c.Divide(2, 3); + gPad->SetLogy(); + hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack); + hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.); + hSOIAmplitudesEBGPUCPUratio->Draw(""); + } + c.cd(8); + { + gPad->SetLogy(); + hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack); + hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.); + hSOIAmplitudesEEGPUCPUratio->Draw(""); + } - c.cd(1); - { - gPad->SetLogy(); - hChi2EBCPU->SetLineColor(kBlack); - hChi2EBCPU->SetLineWidth(1.); - hChi2EBCPU->Draw(""); - hChi2EBGPU->SetLineColor(kBlue); - hChi2EBGPU->SetLineWidth(1.); - hChi2EBGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats"); - auto y2 = stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - c.cd(2); - { - gPad->SetLogy(); - hChi2EECPU->SetLineColor(kBlack); - hChi2EECPU->SetLineWidth(1.); - hChi2EECPU->Draw(""); - hChi2EEGPU->SetLineColor(kBlue); - hChi2EEGPU->SetLineWidth(1.); - hChi2EEGPU->Draw("sames"); - gPad->Update(); - auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats"); - auto y2 = stats->GetY2NDC(); - auto y1 = stats->GetY1NDC(); - stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - c.cd(3); - hChi2EBGPUvsCPU->Draw("COLZ"); - c.cd(4); - hChi2EEGPUvsCPU->Draw("COLZ"); - c.cd(5); - hChi2EBdeltavsCPU->Draw("COLZ"); - c.cd(6); - hChi2EEdeltavsCPU->Draw("COLZ"); + c.SaveAs("ecal-amplitudes.pdf"); + } + { + TCanvas c("plots", "plots", 4200, 6200); + c.Divide(2, 3); - c.SaveAs("ecal-chi2.pdf"); + c.cd(1); + { + gPad->SetLogy(); + hChi2EBCPU->SetLineColor(kBlack); + hChi2EBCPU->SetLineWidth(1.); + hChi2EBCPU->Draw(""); + hChi2EBGPU->SetLineColor(kBlue); + hChi2EBGPU->SetLineWidth(1.); + hChi2EBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); } + c.cd(2); 
+ { + gPad->SetLogy(); + hChi2EECPU->SetLineColor(kBlack); + hChi2EECPU->SetLineWidth(1.); + hChi2EECPU->Draw(""); + hChi2EEGPU->SetLineColor(kBlue); + hChi2EEGPU->SetLineWidth(1.); + hChi2EEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(3); + hChi2EBGPUvsCPU->Draw("COLZ"); + c.cd(4); + hChi2EEGPUvsCPU->Draw("COLZ"); + c.cd(5); + hChi2EBdeltavsCPU->Draw("COLZ"); + c.cd(6); + hChi2EEdeltavsCPU->Draw("COLZ"); + + c.SaveAs("ecal-chi2.pdf"); + } - rf.Close(); - rfout.Write(); - rfout.Close(); + rf.Close(); + rfout.Write(); + rfout.Close(); - return 0; + return 0; } diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu index 83a3e2b39ed0b..d095a0f2181ef 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu @@ -16,84 +16,68 @@ #include "inplace_fnnls.h" #include "KernelHelpers.h" -namespace ecal { namespace multifit { - -/// -/// assume kernel launch configuration is -/// (MAXSAMPLES * nchannels, blocks) -/// -__global__ -void kernel_prep_1d_and_initialize( - EcalPulseShape const* shapes_in, - uint16_t const* digis_in_eb, - uint32_t const* dids_eb, - uint16_t const* digis_in_ee, - uint32_t const* dids_ee, - SampleVector* amplitudes, - SampleVector* amplitudesForMinimization, - SampleGainVector* gainsNoise, - float const* mean_x1, - float const* mean_x12, - float const* rms_x12, - float const* mean_x6, - float const* gain6Over1, - float const* gain12Over6, - bool* hasSwitchToGain6, - bool* hasSwitchToGain1, - bool* isSaturated, - ::ecal::reco::StorageScalarType* energies, - ::ecal::reco::StorageScalarType* chi2, - ::ecal::reco::StorageScalarType* g_pedestal, - uint32_t *dids_out, - uint32_t 
*flags, - char* acState, - BXVectorType *bxs, - uint32_t const offsetForHashes, - uint32_t const offsetForInputs, - bool const gainSwitchUseMaxSampleEB, - bool const gainSwitchUseMaxSampleEE, - int const nchannels) { - constexpr bool dynamicPedestal = false; //---- default to false, ok - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - constexpr int sample_max = 5; - constexpr int full_pulse_max = 9; - int const tx = threadIdx.x + blockIdx.x*blockDim.x; - int const nchannels_per_block = blockDim.x / nsamples; - int const total_threads = nchannels * nsamples; - int const ch = tx / nsamples; - // for accessing input arrays - int const inputCh = ch >= offsetForInputs - ? ch - offsetForInputs - : ch; - int const inputTx = ch >= offsetForInputs - ? tx - offsetForInputs*10 - : tx; - // eb is first and then ee - auto const* digis_in = ch >= offsetForInputs - ? digis_in_ee - : digis_in_eb; - auto const* dids = ch >= offsetForInputs - ? dids_ee - : dids_eb; - int const sample = threadIdx.x % nsamples; - - if (ch < nchannels) { +namespace ecal { + namespace multifit { + + /// + /// assume kernel launch configuration is + /// (MAXSAMPLES * nchannels, blocks) + /// + __global__ void kernel_prep_1d_and_initialize(EcalPulseShape const* shapes_in, + uint16_t const* digis_in_eb, + uint32_t const* dids_eb, + uint16_t const* digis_in_ee, + uint32_t const* dids_ee, + SampleVector* amplitudes, + SampleVector* amplitudesForMinimization, + SampleGainVector* gainsNoise, + float const* mean_x1, + float const* mean_x12, + float const* rms_x12, + float const* mean_x6, + float const* gain6Over1, + float const* gain12Over6, + bool* hasSwitchToGain6, + bool* hasSwitchToGain1, + bool* isSaturated, + ::ecal::reco::StorageScalarType* energies, + ::ecal::reco::StorageScalarType* chi2, + ::ecal::reco::StorageScalarType* g_pedestal, + uint32_t* dids_out, + uint32_t* flags, + char* acState, + BXVectorType* bxs, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + bool const 
gainSwitchUseMaxSampleEB, + bool const gainSwitchUseMaxSampleEE, + int const nchannels) { + constexpr bool dynamicPedestal = false; //---- default to false, ok + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + constexpr int sample_max = 5; + constexpr int full_pulse_max = 9; + int const tx = threadIdx.x + blockIdx.x * blockDim.x; + int const nchannels_per_block = blockDim.x / nsamples; + int const ch = tx / nsamples; + // for accessing input arrays + int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * 10 : tx; + // eb is first and then ee + auto const* digis_in = ch >= offsetForInputs ? digis_in_ee : digis_in_eb; + auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + int const sample = threadIdx.x % nsamples; + + if (ch < nchannels) { // array of 10 x channels per block // TODO: any other way of doing simple reduction // assume bool is 1 byte, should be quite safe extern __shared__ char shared_mem[]; - bool* shr_hasSwitchToGain6 = reinterpret_cast( - shared_mem); - bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + - nchannels_per_block*nsamples; - bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + - nchannels_per_block*nsamples; - bool* shr_isSaturated = shr_hasSwitchToGain0 + - nchannels_per_block*nsamples; - bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + - nchannels_per_block*nsamples; - char* shr_counts = reinterpret_cast( - shr_hasSwitchToGain0_tmp) + nchannels_per_block*nsamples; + bool* shr_hasSwitchToGain6 = reinterpret_cast(shared_mem); + bool* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + nchannels_per_block * nsamples; + bool* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + nchannels_per_block * nsamples; + bool* shr_isSaturated = shr_hasSwitchToGain0 + nchannels_per_block * nsamples; + bool* shr_hasSwitchToGain0_tmp = shr_isSaturated + nchannels_per_block * nsamples; + char* shr_counts = reinterpret_cast(shr_hasSwitchToGain0_tmp) + 
nchannels_per_block * nsamples; // // indices @@ -101,10 +85,7 @@ void kernel_prep_1d_and_initialize( auto const did = DetId{dids[inputCh]}; auto const isBarrel = did.subdetId() == EcalBarrel; // TODO offset for ee, 0 for eb - auto const hashedId = isBarrel - ? hashedIndexEB(did.rawId()) - : offsetForHashes + hashedIndexEE(did.rawId()); - + auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); // // pulse shape template @@ -113,7 +94,7 @@ void kernel_prep_1d_and_initialize( isample+=nsamples) shapes_out[ch](isample + 7) = shapes_in[hashedId].pdfval[isample]; */ - + // will be used in the future for setting state auto const rmsForChecking = rms_x12[hashedId]; @@ -133,13 +114,12 @@ void kernel_prep_1d_and_initialize( shr_hasSwitchToGain0[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x]; shr_counts[threadIdx.x] = 0; __syncthreads(); - + // non-divergent branch (except for the last 4 threads) - if (threadIdx.x<=blockDim.x-5) { - #pragma unroll - for (int i=0; i<5; i++) - shr_counts[threadIdx.x] += - shr_hasSwitchToGain0[threadIdx.x+i]; + if (threadIdx.x <= blockDim.x - 5) { +#pragma unroll + for (int i = 0; i < 5; i++) + shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i]; } shr_isSaturated[threadIdx.x] = shr_counts[threadIdx.x] == 5; @@ -148,102 +128,89 @@ void kernel_prep_1d_and_initialize( // TODO // if (sample < 5) { - shr_hasSwitchToGain6[threadIdx.x] = - shr_hasSwitchToGain6[threadIdx.x] || - shr_hasSwitchToGain6[threadIdx.x + 5]; - shr_hasSwitchToGain1[threadIdx.x] = - shr_hasSwitchToGain1[threadIdx.x] || - shr_hasSwitchToGain1[threadIdx.x + 5]; - - // duplication of hasSwitchToGain0 in order not to - // introduce another syncthreads - shr_hasSwitchToGain0_tmp[threadIdx.x] = - shr_hasSwitchToGain0_tmp[threadIdx.x] || - shr_hasSwitchToGain0_tmp[threadIdx.x+5]; + shr_hasSwitchToGain6[threadIdx.x] = + shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 5]; + 
shr_hasSwitchToGain1[threadIdx.x] = + shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 5]; + + // duplication of hasSwitchToGain0 in order not to + // introduce another syncthreads + shr_hasSwitchToGain0_tmp[threadIdx.x] = + shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 5]; } __syncthreads(); - - if (sample<2) { - // note, both threads per channel take value [3] twice to avoid another if - shr_hasSwitchToGain6[threadIdx.x] = - shr_hasSwitchToGain6[threadIdx.x] || - shr_hasSwitchToGain6[threadIdx.x+2] || - shr_hasSwitchToGain6[threadIdx.x+3]; - shr_hasSwitchToGain1[threadIdx.x] = - shr_hasSwitchToGain1[threadIdx.x] || - shr_hasSwitchToGain1[threadIdx.x+2] || - shr_hasSwitchToGain1[threadIdx.x+3]; - - shr_hasSwitchToGain0_tmp[threadIdx.x] = - shr_hasSwitchToGain0_tmp[threadIdx.x] || - shr_hasSwitchToGain0_tmp[threadIdx.x+2] || - shr_hasSwitchToGain0_tmp[threadIdx.x+3]; - - // sample < 2 -> first 2 threads of each channel will be used here - // => 0 -> will compare 3 and 4 and put into 0 - // => 1 -> will compare 4 and 5 and put into 1 - shr_isSaturated[threadIdx.x] = - shr_isSaturated[threadIdx.x+3] || shr_isSaturated[threadIdx.x+4]; + + if (sample < 2) { + // note, both threads per channel take value [3] twice to avoid another if + shr_hasSwitchToGain6[threadIdx.x] = shr_hasSwitchToGain6[threadIdx.x] || + shr_hasSwitchToGain6[threadIdx.x + 2] || + shr_hasSwitchToGain6[threadIdx.x + 3]; + shr_hasSwitchToGain1[threadIdx.x] = shr_hasSwitchToGain1[threadIdx.x] || + shr_hasSwitchToGain1[threadIdx.x + 2] || + shr_hasSwitchToGain1[threadIdx.x + 3]; + + shr_hasSwitchToGain0_tmp[threadIdx.x] = shr_hasSwitchToGain0_tmp[threadIdx.x] || + shr_hasSwitchToGain0_tmp[threadIdx.x + 2] || + shr_hasSwitchToGain0_tmp[threadIdx.x + 3]; + + // sample < 2 -> first 2 threads of each channel will be used here + // => 0 -> will compare 3 and 4 and put into 0 + // => 1 -> will compare 4 and 5 and put into 1 + 
shr_isSaturated[threadIdx.x] = shr_isSaturated[threadIdx.x + 3] || shr_isSaturated[threadIdx.x + 4]; } __syncthreads(); bool check_hasSwitchToGain0 = false; - if (sample==0) { - shr_hasSwitchToGain6[threadIdx.x] = - shr_hasSwitchToGain6[threadIdx.x] || - shr_hasSwitchToGain6[threadIdx.x+1]; - shr_hasSwitchToGain1[threadIdx.x] = - shr_hasSwitchToGain1[threadIdx.x] || - shr_hasSwitchToGain1[threadIdx.x+1]; - shr_hasSwitchToGain0_tmp[threadIdx.x] = - shr_hasSwitchToGain0_tmp[threadIdx.x] || - shr_hasSwitchToGain0_tmp[threadIdx.x+1]; - - hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x]; - hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x]; - - // set only for the threadIdx.x corresponding to sample==0 - check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x]; - - shr_isSaturated[threadIdx.x+3] = - shr_isSaturated[threadIdx.x] || - shr_isSaturated[threadIdx.x+1]; - isSaturated[ch] = shr_isSaturated[threadIdx.x+3]; + if (sample == 0) { + shr_hasSwitchToGain6[threadIdx.x] = + shr_hasSwitchToGain6[threadIdx.x] || shr_hasSwitchToGain6[threadIdx.x + 1]; + shr_hasSwitchToGain1[threadIdx.x] = + shr_hasSwitchToGain1[threadIdx.x] || shr_hasSwitchToGain1[threadIdx.x + 1]; + shr_hasSwitchToGain0_tmp[threadIdx.x] = + shr_hasSwitchToGain0_tmp[threadIdx.x] || shr_hasSwitchToGain0_tmp[threadIdx.x + 1]; + + hasSwitchToGain6[ch] = shr_hasSwitchToGain6[threadIdx.x]; + hasSwitchToGain1[ch] = shr_hasSwitchToGain1[threadIdx.x]; + + // set only for the threadIdx.x corresponding to sample==0 + check_hasSwitchToGain0 = shr_hasSwitchToGain0_tmp[threadIdx.x]; + + shr_isSaturated[threadIdx.x + 3] = shr_isSaturated[threadIdx.x] || shr_isSaturated[threadIdx.x + 1]; + isSaturated[ch] = shr_isSaturated[threadIdx.x + 3]; } // TODO: w/o this sync, there is a race // if (threadIdx == sample_max) below uses max sample thread, not for 0 sample // check if we can remove it __syncthreads(); - + // TODO: divergent branch - if (gainId==0 || gainId==3) { - pedestal = mean_x1[hashedId]; - 
gainratio = gain6Over1[hashedId] * gain12Over6[hashedId]; - gainsNoise[ch](sample) = 2; - } else if (gainId==1) { - pedestal = mean_x12[hashedId]; - gainratio = 1.; - gainsNoise[ch](sample) = 0; - } else if (gainId==2) { - pedestal = mean_x6[hashedId]; - gainratio = gain12Over6[hashedId]; - gainsNoise[ch](sample) = 1; + if (gainId == 0 || gainId == 3) { + pedestal = mean_x1[hashedId]; + gainratio = gain6Over1[hashedId] * gain12Over6[hashedId]; + gainsNoise[ch](sample) = 2; + } else if (gainId == 1) { + pedestal = mean_x12[hashedId]; + gainratio = 1.; + gainsNoise[ch](sample) = 0; + } else if (gainId == 2) { + pedestal = mean_x6[hashedId]; + gainratio = gain12Over6[hashedId]; + gainsNoise[ch](sample) = 1; } - + // TODO: compile time constant -> branch should be non-divergent if (dynamicPedestal) - amplitude = static_cast(adc) * gainratio; + amplitude = static_cast(adc) * gainratio; else - amplitude = (static_cast(adc) - pedestal) * gainratio; + amplitude = (static_cast(adc) - pedestal) * gainratio; amplitudes[ch][sample] = amplitude; #ifdef ECAL_RECO_CUDA_DEBUG - printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, - pedestal, gainratio); - if (adc==0) - printf("adc is zero\n"); + printf("%d %d %d %d %f %f %f\n", tx, ch, sample, adc, amplitude, pedestal, gainratio); + if (adc == 0) + printf("adc is zero\n"); #endif // @@ -252,325 +219,287 @@ void kernel_prep_1d_and_initialize( amplitudesForMinimization[ch](sample) = 0; bxs[ch](sample) = sample - 5; - // select the thread for the max sample + // select the thread for the max sample //---> hardcoded above to be 5th sample, ok if (sample == sample_max) { - // - // initialization - // - acState[ch] = static_cast(MinimizationState::NotFinished); - energies[ch] = 0; - chi2[ch] = 0; - g_pedestal[ch] = 0; - uint32_t flag = 0; - dids_out[ch] = did.rawId(); - - // start of this channel in shared mem - int const chStart = threadIdx.x - sample_max; - // thread for the max sample in shared mem - int const threadMax = 
threadIdx.x; - auto const gainSwitchUseMaxSample = isBarrel - ? gainSwitchUseMaxSampleEB - : gainSwitchUseMaxSampleEE; - - // this flag setting is applied to all of the cases - if (shr_hasSwitchToGain6[chStart]) - flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6; - if (shr_hasSwitchToGain1[chStart]) - flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1; - - // this corresponds to cpu branching on lastSampleBeforeSaturation - // likely false - if (check_hasSwitchToGain0) { - // assign for the case some sample having gainId == 0 - //energies[ch] = amplitudes[ch][sample_max]; - energies[ch] = amplitude; - - // check if samples before sample_max have true - bool saturated_before_max = false; - #pragma unroll - for (char ii=0; ii<5; ii++) - saturated_before_max = saturated_before_max || - shr_hasSwitchToGain0[chStart + ii]; - - // if saturation is in the max sample and not in the first 5 - if (!saturated_before_max && - shr_hasSwitchToGain0[threadMax]) - energies[ch] = 49140; // 4095 * 12 - //---- AM FIXME : no pedestal subtraction??? - //It should be "(4095. 
- pedestal) * gainratio" - - // set state flag to terminate further processing of this channel - acState[ch] = static_cast(MinimizationState::Precomputed); - flag |= 0x1 << EcalUncalibratedRecHit::kSaturated; - flags[ch] = flag; - return; - } - - // according to cpu version -// auto max_amplitude = amplitudes[ch][sample_max]; - auto const max_amplitude = amplitude; - // according to cpu version - auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max-7]; - // note, no syncing as the same thread will be accessing here - bool hasGainSwitch = shr_hasSwitchToGain6[chStart] - || shr_hasSwitchToGain1[chStart] - || shr_isSaturated[chStart+3]; - - // pedestal is final unconditionally - g_pedestal[ch] = pedestal; - if (hasGainSwitch && gainSwitchUseMaxSample) { - // thread for sample=0 will access the right guys - energies[ch] = max_amplitude / shape_value; - acState[ch] = static_cast(MinimizationState::Precomputed); - flags[ch] = flag; - return; - } - - // this happens cause sometimes rms_x12 is 0... - // needs to be checkec why this is the case - // general case here is that noisecov is a Zero matrix - if (rmsForChecking == 0) { - acState[ch] = static_cast(MinimizationState::Precomputed); - flags[ch] = flag; - return; - } - - // for the case when no shortcuts were taken + // + // initialization + // + acState[ch] = static_cast(MinimizationState::NotFinished); + energies[ch] = 0; + chi2[ch] = 0; + g_pedestal[ch] = 0; + uint32_t flag = 0; + dids_out[ch] = did.rawId(); + + // start of this channel in shared mem + int const chStart = threadIdx.x - sample_max; + // thread for the max sample in shared mem + int const threadMax = threadIdx.x; + auto const gainSwitchUseMaxSample = isBarrel ? 
gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE; + + // this flag setting is applied to all of the cases + if (shr_hasSwitchToGain6[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6; + if (shr_hasSwitchToGain1[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1; + + // this corresponds to cpu branching on lastSampleBeforeSaturation + // likely false + if (check_hasSwitchToGain0) { + // assign for the case some sample having gainId == 0 + //energies[ch] = amplitudes[ch][sample_max]; + energies[ch] = amplitude; + + // check if samples before sample_max have true + bool saturated_before_max = false; +#pragma unroll + for (char ii = 0; ii < 5; ii++) + saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii]; + + // if saturation is in the max sample and not in the first 5 + if (!saturated_before_max && shr_hasSwitchToGain0[threadMax]) + energies[ch] = 49140; // 4095 * 12 + //---- AM FIXME : no pedestal subtraction??? + //It should be "(4095. 
- pedestal) * gainratio" + + // set state flag to terminate further processing of this channel + acState[ch] = static_cast(MinimizationState::Precomputed); + flag |= 0x1 << EcalUncalibratedRecHit::kSaturated; flags[ch] = flag; + return; + } + + // according to cpu version + // auto max_amplitude = amplitudes[ch][sample_max]; + auto const max_amplitude = amplitude; + // according to cpu version + auto shape_value = shapes_in[hashedId].pdfval[full_pulse_max - 7]; + // note, no syncing as the same thread will be accessing here + bool hasGainSwitch = + shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3]; + + // pedestal is final unconditionally + g_pedestal[ch] = pedestal; + if (hasGainSwitch && gainSwitchUseMaxSample) { + // thread for sample=0 will access the right guys + energies[ch] = max_amplitude / shape_value; + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[ch] = flag; + return; + } + + // this happens cause sometimes rms_x12 is 0... 
+ // needs to be checkec why this is the case + // general case here is that noisecov is a Zero matrix + if (rmsForChecking == 0) { + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[ch] = flag; + return; + } + + // for the case when no shortcuts were taken + flags[ch] = flag; } + } } -} -/// -/// assume kernel launch configuration is -/// ([MAXSAMPLES, MAXSAMPLES], nchannels) -/// -__global__ -void kernel_prep_2d(SampleGainVector const* gainNoise, - uint32_t const* dids_eb, - uint32_t const* dids_ee, - float const* rms_x12, - float const* rms_x6, - float const* rms_x1, - float const* gain12Over6, - float const* gain6Over1, - double const* G12SamplesCorrelationEB, - double const* G6SamplesCorrelationEB, - double const* G1SamplesCorrelationEB, - double const* G12SamplesCorrelationEE, - double const* G6SamplesCorrelationEE, - double const* G1SamplesCorrelationEE, - SampleMatrix* noisecov, - PulseMatrixType* pulse_matrix, - EcalPulseShape const* pulse_shape, - bool const* hasSwitchToGain6, - bool const* hasSwitchToGain1, - bool const* isSaturated, - uint32_t const offsetForHashes, - uint32_t const offsetForInputs) { - int const ch = blockIdx.x; - int const tx = threadIdx.x; - int const ty = threadIdx.y; - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - constexpr float addPedestalUncertainty = 0.f; - constexpr bool dynamicPedestal = false; - constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true - constexpr int template_samples = EcalPulseShape::TEMPLATESAMPLES; - - // to access input arrays (ids and digis only) - int const inputCh = ch >= offsetForInputs - ? ch - offsetForInputs - : ch; - auto const* dids = ch >= offsetForInputs - ? dids_ee - : dids_eb; - - bool tmp0 = hasSwitchToGain6[ch]; - bool tmp1 = hasSwitchToGain1[ch]; - auto const did = DetId{dids[inputCh]}; - auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel - ? 
hashedIndexEB(did.rawId()) - : offsetForHashes + hashedIndexEE(did.rawId()); - auto const G12SamplesCorrelation = isBarrel - ? G12SamplesCorrelationEB - : G12SamplesCorrelationEE; - auto const* G6SamplesCorrelation = isBarrel - ? G6SamplesCorrelationEB - : G6SamplesCorrelationEE; - auto const* G1SamplesCorrelation = isBarrel - ? G1SamplesCorrelationEB - : G1SamplesCorrelationEE; - bool tmp2 = isSaturated[ch]; - bool hasGainSwitch = tmp0 || tmp1 || tmp2; - auto const vidx = ecal::abs(ty - tx); - - // non-divergent branch for all threads per block - if (hasGainSwitch) { + /// + /// assume kernel launch configuration is + /// ([MAXSAMPLES, MAXSAMPLES], nchannels) + /// + __global__ void kernel_prep_2d(SampleGainVector const* gainNoise, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* gain12Over6, + float const* gain6Over1, + double const* G12SamplesCorrelationEB, + double const* G6SamplesCorrelationEB, + double const* G1SamplesCorrelationEB, + double const* G12SamplesCorrelationEE, + double const* G6SamplesCorrelationEE, + double const* G1SamplesCorrelationEE, + SampleMatrix* noisecov, + PulseMatrixType* pulse_matrix, + EcalPulseShape const* pulse_shape, + bool const* hasSwitchToGain6, + bool const* hasSwitchToGain1, + bool const* isSaturated, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs) { + int const ch = blockIdx.x; + int const tx = threadIdx.x; + int const ty = threadIdx.y; + constexpr float addPedestalUncertainty = 0.f; + constexpr bool dynamicPedestal = false; + constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true + + // to access input arrays (ids and digis only) + int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + auto const* dids = ch >= offsetForInputs ? 
dids_ee : dids_eb; + + bool tmp0 = hasSwitchToGain6[ch]; + bool tmp1 = hasSwitchToGain1[ch]; + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE; + auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE; + auto const* G1SamplesCorrelation = isBarrel ? G1SamplesCorrelationEB : G1SamplesCorrelationEE; + bool tmp2 = isSaturated[ch]; + bool hasGainSwitch = tmp0 || tmp1 || tmp2; + auto const vidx = ecal::abs(ty - tx); + + // non-divergent branch for all threads per block + if (hasGainSwitch) { // TODO: did not include simplified noise model float noise_value = 0; // non-divergent branch - all threads per block - // TODO: all of these constants indicate that - // that these parts could be splitted into completely different + // TODO: all of these constants indicate that + // that these parts could be splitted into completely different // kernels and run one of them only depending on the config if (simplifiedNoiseModelForGainSwitch) { - int isample_max = 5; // according to cpu defs - int gainidx = gainNoise[ch][isample_max]; - - // non-divergent branches - if (gainidx==0) - //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx); - noise_value = rms_x12[hashedId]*rms_x12[hashedId] - * G12SamplesCorrelation[vidx]; - if (gainidx==1) -// noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch] -// *noisecorrs[1](ty, tx); - noise_value = gain12Over6[hashedId]*gain12Over6[hashedId] - * rms_x6[hashedId]*rms_x6[hashedId] - * G6SamplesCorrelation[vidx]; - if (gainidx==2) -// noise_value = gain12Over6[ch]*gain12Over6[ch] -// * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch] -// * noisecorrs[2](ty, tx); - noise_value = gain12Over6[hashedId]*gain12Over6[hashedId] - * 
gain6Over1[hashedId]*gain6Over1[hashedId] - * rms_x1[hashedId]*rms_x1[hashedId] - * G1SamplesCorrelation[vidx]; - if (!dynamicPedestal && addPedestalUncertainty>0.f) - noise_value += addPedestalUncertainty*addPedestalUncertainty; + int isample_max = 5; // according to cpu defs + int gainidx = gainNoise[ch][isample_max]; + + // non-divergent branches + if (gainidx == 0) + //noise_value = rms_x12[ch]*rms_x12[ch]*noisecorrs[0](ty, tx); + noise_value = rms_x12[hashedId] * rms_x12[hashedId] * G12SamplesCorrelation[vidx]; + if (gainidx == 1) + // noise_value = gain12Over6[ch]*gain12Over6[ch] * rms_x6[ch]*rms_x6[ch] + // *noisecorrs[1](ty, tx); + noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] * + G6SamplesCorrelation[vidx]; + if (gainidx == 2) + // noise_value = gain12Over6[ch]*gain12Over6[ch] + // * gain6Over1[ch]*gain6Over1[ch] * rms_x1[ch]*rms_x1[ch] + // * noisecorrs[2](ty, tx); + noise_value = gain12Over6[hashedId] * gain12Over6[hashedId] * gain6Over1[hashedId] * gain6Over1[hashedId] * + rms_x1[hashedId] * rms_x1[hashedId] * G1SamplesCorrelation[vidx]; + if (!dynamicPedestal && addPedestalUncertainty > 0.f) + noise_value += addPedestalUncertainty * addPedestalUncertainty; } else { - int gainidx=0; - char mask = gainidx; - int pedestal = gainNoise[ch][ty] == mask ? 1 : 0; -// noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch] -// *pedestal*noisecorrs[0](ty, tx); - noise_value += /* gainratio is 1*/ rms_x12[hashedId]*rms_x12[hashedId] - * pedestal* G12SamplesCorrelation[vidx]; - // non-divergent branch - if (!dynamicPedestal && addPedestalUncertainty>0.f) { - noise_value += /* gainratio is 1 */ - addPedestalUncertainty*addPedestalUncertainty*pedestal; - } - - // - gainidx=1; - mask = gainidx; - pedestal = gainNoise[ch][ty] == mask ? 
1 : 0; -// noise_value += gain12Over6[ch]*gain12Over6[ch] -// *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx); - noise_value += gain12Over6[hashedId]*gain12Over6[hashedId] - *rms_x6[hashedId]*rms_x6[hashedId]*pedestal - * G6SamplesCorrelation[vidx]; - // non-divergent branch - if (!dynamicPedestal && addPedestalUncertainty>0.f) { - noise_value += gain12Over6[hashedId]*gain12Over6[hashedId] - *addPedestalUncertainty*addPedestalUncertainty - *pedestal; - } - - // - gainidx=2; - mask = gainidx; - pedestal = gainNoise[ch][ty] == mask ? 1 : 0; - float tmp = gain6Over1[hashedId] * gain12Over6[hashedId]; -// noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch] -// *pedestal*noisecorrs[2](ty, tx); - noise_value += tmp*tmp * rms_x1[hashedId]*rms_x1[hashedId] - *pedestal* G1SamplesCorrelation[vidx]; - // non-divergent branch - if (!dynamicPedestal && addPedestalUncertainty>0.f) { - noise_value += tmp*tmp * addPedestalUncertainty*addPedestalUncertainty - * pedestal; - } + int gainidx = 0; + char mask = gainidx; + int pedestal = gainNoise[ch][ty] == mask ? 1 : 0; + // noise_value += /* gainratio is 1*/ rms_x12[ch]*rms_x12[ch] + // *pedestal*noisecorrs[0](ty, tx); + noise_value += + /* gainratio is 1*/ rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += /* gainratio is 1 */ + addPedestalUncertainty * addPedestalUncertainty * pedestal; + } + + // + gainidx = 1; + mask = gainidx; + pedestal = gainNoise[ch][ty] == mask ? 
1 : 0; + // noise_value += gain12Over6[ch]*gain12Over6[ch] + // *rms_x6[ch]*rms_x6[ch]*pedestal*noisecorrs[1](ty, tx); + noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * rms_x6[hashedId] * rms_x6[hashedId] * + pedestal * G6SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += gain12Over6[hashedId] * gain12Over6[hashedId] * addPedestalUncertainty * + addPedestalUncertainty * pedestal; + } + + // + gainidx = 2; + mask = gainidx; + pedestal = gainNoise[ch][ty] == mask ? 1 : 0; + float tmp = gain6Over1[hashedId] * gain12Over6[hashedId]; + // noise_value += tmp*tmp * rms_x1[ch]*rms_x1[ch] + // *pedestal*noisecorrs[2](ty, tx); + noise_value += tmp * tmp * rms_x1[hashedId] * rms_x1[hashedId] * pedestal * G1SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += tmp * tmp * addPedestalUncertainty * addPedestalUncertainty * pedestal; + } } noisecov[ch](ty, tx) = noise_value; - } else { + } else { auto rms = rms_x12[hashedId]; - float noise_value = rms*rms * G12SamplesCorrelation[vidx]; - if (!dynamicPedestal && addPedestalUncertainty>0.f) { - //---- add fully correlated component to noise covariance to inflate pedestal uncertainty - noise_value += addPedestalUncertainty*addPedestalUncertainty; + float noise_value = rms * rms * G12SamplesCorrelation[vidx]; + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + //---- add fully correlated component to noise covariance to inflate pedestal uncertainty + noise_value += addPedestalUncertainty * addPedestalUncertainty; } noisecov[ch](ty, tx) = noise_value; + } + + // pulse matrix + // int const bx = tx - 5; // -5 -4 -3 ... 3 4 + // int bx = (*bxs)(tx); + // int const offset = 7 - 3 - bx; + int const posToAccess = 9 - tx + ty; // see cpu for reference + float const value = posToAccess >= 7 ? 
pulse_shape[hashedId].pdfval[posToAccess - 7] : 0; + pulse_matrix[ch](ty, tx) = value; } - // pulse matrix -// int const bx = tx - 5; // -5 -4 -3 ... 3 4 -// int bx = (*bxs)(tx); -// int const offset = 7 - 3 - bx; - int const posToAccess = 9 - tx + ty; // see cpu for reference - float const value = posToAccess>=7 - ? pulse_shape[hashedId].pdfval[posToAccess-7] - : 0; - pulse_matrix[ch](ty, tx) = value; -} - -__global__ -void kernel_permute_results( - SampleVector *amplitudes, - BXVectorType const*activeBXs, - ::ecal::reco::StorageScalarType *energies, - char const* acState, - int const nchannels) { - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const tx = threadIdx.x + blockIdx.x * blockDim.x; - int const ch = tx / nsamples; - int const iii = tx % nsamples; // this is to address activeBXs - - if (ch >= nchannels) return; - - // channels that have amplitude precomputed do not need results to be permuted - auto const state = static_cast(acState[ch]); - if (static_cast(acState[ch]) == - MinimizationState::Precomputed) + __global__ void kernel_permute_results(SampleVector* amplitudes, + BXVectorType const* activeBXs, + ::ecal::reco::StorageScalarType* energies, + char const* acState, + int const nchannels) { + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int const tx = threadIdx.x + blockIdx.x * blockDim.x; + int const ch = tx / nsamples; + int const iii = tx % nsamples; // this is to address activeBXs + + if (ch >= nchannels) return; - // configure shared memory and cp into it - extern __shared__ char smem[]; - SampleVector::Scalar* values = reinterpret_cast( - smem); - values[threadIdx.x] = amplitudes[ch](iii); - __syncthreads(); + // channels that have amplitude precomputed do not need results to be permuted + auto const state = static_cast(acState[ch]); + if (state == MinimizationState::Precomputed) + return; - // get the sample for this bx - auto const sample = 
static_cast(activeBXs[ch](iii)) + 5; + // configure shared memory and cp into it + extern __shared__ char smem[]; + SampleVector::Scalar* values = reinterpret_cast(smem); + values[threadIdx.x] = amplitudes[ch](iii); + __syncthreads(); - // store back to global - amplitudes[ch](sample) = values[threadIdx.x]; + // get the sample for this bx + auto const sample = static_cast(activeBXs[ch](iii)) + 5; - // store sample 5 separately - // only for the case when minimization was performed - // not for cases with precomputed amplitudes - if (sample == 5) + // store back to global + amplitudes[ch](sample) = values[threadIdx.x]; + + // store sample 5 separately + // only for the case when minimization was performed + // not for cases with precomputed amplitudes + if (sample == 5) energies[ch] = values[threadIdx.x]; -} + } /// /// Build an Ecal RecHit. /// TODO: Use SoA data structures on the host directly -/// the reason for removing this from minimize kernel is to isolate the minimize + +/// the reason for removing this from minimize kernel is to isolate the minimize + /// again, building an aos rec hit involves strides... 
-> bad memory access pattern /// #ifdef RUN_BUILD_AOS_RECHIT -__global__ -void kernel_build_rechit( - float const* energies, - float const* chi2s, - uint32_t* dids, - EcalUncalibratedRecHit* rechits, - int nchannels) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < nchannels) { - rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], - 0, 0, chi2s[idx], 0}; + __global__ void kernel_build_rechit( + float const* energies, float const* chi2s, uint32_t* dids, EcalUncalibratedRecHit* rechits, int nchannels) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < nchannels) { + rechits[idx] = EcalUncalibratedRecHit{dids[idx], energies[idx], 0, 0, chi2s[idx], 0}; + } } -} #endif -}} + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu index fb6b396089151..ddcfa254e43e1 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu @@ -16,179 +16,163 @@ #include "AmplitudeComputationKernels.h" #include "AmplitudeComputationCommonKernels.h" -namespace ecal { namespace multifit { - -void eigen_solve_submatrix(SampleMatrix& mat, - SampleVector& invec, - SampleVector& outvec, unsigned NP) { - using namespace Eigen; - switch( NP ) { // pulse matrix is always square. 
- case 10: { - Matrix temp = mat.topLeftCorner<10,10>(); - outvec.head<10>() = temp.ldlt().solve(invec.head<10>()); - break; - } - case 9: { - Matrix temp = mat.topLeftCorner<9,9>(); - outvec.head<9>() = temp.ldlt().solve(invec.head<9>()); - break; - } - case 8: { - Matrix temp = mat.topLeftCorner<8,8>(); - outvec.head<8>() = temp.ldlt().solve(invec.head<8>()); - break; - } - case 7: { - Matrix temp = mat.topLeftCorner<7,7>(); - outvec.head<7>() = temp.ldlt().solve(invec.head<7>()); - break; - } - case 6: { - Matrix temp = mat.topLeftCorner<6,6>(); - outvec.head<6>() = temp.ldlt().solve(invec.head<6>()); - break; - } - case 5: { - Matrix temp = mat.topLeftCorner<5,5>(); - outvec.head<5>() = temp.ldlt().solve(invec.head<5>()); - break; - } - case 4: { - Matrix temp = mat.topLeftCorner<4,4>(); - outvec.head<4>() = temp.ldlt().solve(invec.head<4>()); - break; - } - case 3: { - Matrix temp = mat.topLeftCorner<3,3>(); - outvec.head<3>() = temp.ldlt().solve(invec.head<3>()); - break; - } - case 2: { - Matrix temp = mat.topLeftCorner<2,2>(); - outvec.head<2>() = temp.ldlt().solve(invec.head<2>()); - break; - } - case 1: { - Matrix temp = mat.topLeftCorner<1,1>(); - outvec.head<1>() = temp.ldlt().solve(invec.head<1>()); - break; - } - default: - return; +namespace ecal { + namespace multifit { + + void eigen_solve_submatrix(SampleMatrix& mat, SampleVector& invec, SampleVector& outvec, unsigned NP) { + using namespace Eigen; + switch (NP) { // pulse matrix is always square. 
+ case 10: { + Matrix temp = mat.topLeftCorner<10, 10>(); + outvec.head<10>() = temp.ldlt().solve(invec.head<10>()); + break; + } + case 9: { + Matrix temp = mat.topLeftCorner<9, 9>(); + outvec.head<9>() = temp.ldlt().solve(invec.head<9>()); + break; + } + case 8: { + Matrix temp = mat.topLeftCorner<8, 8>(); + outvec.head<8>() = temp.ldlt().solve(invec.head<8>()); + break; + } + case 7: { + Matrix temp = mat.topLeftCorner<7, 7>(); + outvec.head<7>() = temp.ldlt().solve(invec.head<7>()); + break; + } + case 6: { + Matrix temp = mat.topLeftCorner<6, 6>(); + outvec.head<6>() = temp.ldlt().solve(invec.head<6>()); + break; + } + case 5: { + Matrix temp = mat.topLeftCorner<5, 5>(); + outvec.head<5>() = temp.ldlt().solve(invec.head<5>()); + break; + } + case 4: { + Matrix temp = mat.topLeftCorner<4, 4>(); + outvec.head<4>() = temp.ldlt().solve(invec.head<4>()); + break; + } + case 3: { + Matrix temp = mat.topLeftCorner<3, 3>(); + outvec.head<3>() = temp.ldlt().solve(invec.head<3>()); + break; + } + case 2: { + Matrix temp = mat.topLeftCorner<2, 2>(); + outvec.head<2>() = temp.ldlt().solve(invec.head<2>()); + break; + } + case 1: { + Matrix temp = mat.topLeftCorner<1, 1>(); + outvec.head<1>() = temp.ldlt().solve(invec.head<1>()); + break; + } + default: + return; + } } -} - -template -__device__ __forceinline__ -bool update_covariance( - EcalPulseCovariance const& pulse_covariance, - MatrixType& inverse_cov, - SampleVector const& amplitudes) { - constexpr int nsamples = SampleVector::RowsAtCompileTime; - constexpr int npulses = BXVectorType::RowsAtCompileTime; - - #pragma unroll - for (unsigned int ipulse=0; ipulse + __device__ __forceinline__ bool update_covariance(EcalPulseCovariance const& pulse_covariance, + MatrixType& inverse_cov, + SampleVector const& amplitudes) { + constexpr int nsamples = SampleVector::RowsAtCompileTime; + constexpr int npulses = BXVectorType::RowsAtCompileTime; + +#pragma unroll + for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) { auto 
const amplitude = amplitudes.coeff(ipulse); - if (amplitude == 0) - continue; + if (amplitude == 0) + continue; // FIXME: ipulse - 5 -> ipulse - firstOffset int bx = ipulse - 5; - int first_sample_t = std::max(0, bx+3); + int first_sample_t = std::max(0, bx + 3); int offset = -3 - bx; auto const value_sq = amplitude * amplitude; - unsigned int nsample_pulse = nsamples - first_sample_t; - - for (int col=first_sample_t; col solution vector, what we are fitting for -/// - samples -> raw detector responses -/// - passive constraint - satisfied constraint -/// - active constraint - unsatisfied (yet) constraint -/// -__global__ -void kernel_minimize( - uint32_t const* dids_eb, - uint32_t const* dids_ee, - SampleMatrix const* __restrict__ noisecov, - EcalPulseCovariance const* __restrict__ pulse_covariance, - BXVectorType *bxs, - SampleVector const* __restrict__ samples, - SampleVector* amplitudes, - PulseMatrixType const* __restrict__ pulse_matrix, - ::ecal::reco::StorageScalarType* chi2s, - ::ecal::reco::StorageScalarType* energies, - char *acState, - int nchannels, - int max_iterations, - uint32_t const offsetForHashes, - uint32_t const offsetForInputs) { - // FIXME: ecal has 10 samples and 10 pulses.... 
- // but this needs to be properly treated and renamed everywhere - constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime; - constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime; - static_assert(NSAMPLES == NPULSES); - - using DataType = SampleVector::Scalar; - - extern __shared__ char shrmem[]; - DataType *shrMatrixLForFnnlsStorage = - reinterpret_cast(shrmem) + MapSymM::total * threadIdx.x; - DataType *shrAtAStorage = - reinterpret_cast(shrmem) + MapSymM::total * ( - threadIdx.x + blockDim.x); - - // FIXME: remove eitehr idx or ch -> they are teh same thing - int idx = threadIdx.x + blockDim.x*blockIdx.x; - auto const ch = idx; - if (idx < nchannels) { - if (static_cast(acState[idx]) == - MinimizationState::Precomputed) - return; + /// + /// launch ctx parameters are (nchannels / block, blocks) + /// TODO: trivial impl for now, there must be a way to improve + /// + /// Conventions: + /// - amplitudes -> solution vector, what we are fitting for + /// - samples -> raw detector responses + /// - passive constraint - satisfied constraint + /// - active constraint - unsatisfied (yet) constraint + /// + __global__ void kernel_minimize(uint32_t const* dids_eb, + uint32_t const* dids_ee, + SampleMatrix const* __restrict__ noisecov, + EcalPulseCovariance const* __restrict__ pulse_covariance, + BXVectorType* bxs, + SampleVector const* __restrict__ samples, + SampleVector* amplitudes, + PulseMatrixType const* __restrict__ pulse_matrix, + ::ecal::reco::StorageScalarType* chi2s, + ::ecal::reco::StorageScalarType* energies, + char* acState, + int nchannels, + int max_iterations, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs) { + // FIXME: ecal has 10 samples and 10 pulses.... 
+ // but this needs to be properly treated and renamed everywhere + constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime; + constexpr auto NPULSES = SampleMatrix::RowsAtCompileTime; + static_assert(NSAMPLES == NPULSES); + + using DataType = SampleVector::Scalar; + + extern __shared__ char shrmem[]; + DataType* shrMatrixLForFnnlsStorage = + reinterpret_cast(shrmem) + MapSymM::total * threadIdx.x; + DataType* shrAtAStorage = + reinterpret_cast(shrmem) + MapSymM::total * (threadIdx.x + blockDim.x); + + // FIXME: remove eitehr idx or ch -> they are teh same thing + int idx = threadIdx.x + blockDim.x * blockIdx.x; + auto const ch = idx; + if (idx < nchannels) { + if (static_cast(acState[idx]) == MinimizationState::Precomputed) + return; // get the hash - int const inputCh = ch >= offsetForInputs - ? ch - offsetForInputs - : ch; - auto const* dids = ch >= offsetForInputs - ? dids_ee - : dids_eb; + int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb; auto const did = DetId{dids[inputCh]}; auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel - ? hashedIndexEB(did.rawId()) - : offsetForHashes + hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel ? 
hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); // inits int iter = 0; int npassive = 0; ColumnVector pulseOffsets; - #pragma unroll - for (int i=0; i resultAmplitudes; - #pragma unroll - for (int counter=0; counter= max_iterations) - break; - - //inverse_cov = noisecov[idx]; - //DataType covMatrixStorage[MapSymM::total]; - DataType* covMatrixStorage = shrMatrixLForFnnlsStorage; - MapSymM covMatrix{covMatrixStorage}; - int counter = 0; - #pragma unroll - for (int col=0; col::total]; - MapSymM matrixL{matrixLStorage}; - compute_decomposition_unrolled(matrixL, covMatrix); - - // L * A = P - ColMajorMatrix A; - solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL); - - // L b = s - float reg_b[NSAMPLES]; - solve_forward_subst_vector(reg_b, samples[idx], matrixL); - - // FIXME: shared mem - //DataType AtAStorage[MapSymM::total]; - MapSymM AtA{shrAtAStorage}; - //SampleMatrix AtA; - SampleVector Atb; - #pragma unroll - for (int icol=0; icol= max_iterations) + break; + + //inverse_cov = noisecov[idx]; + //DataType covMatrixStorage[MapSymM::total]; + DataType* covMatrixStorage = shrMatrixLForFnnlsStorage; + MapSymM covMatrix{covMatrixStorage}; + int counter = 0; +#pragma unroll + for (int col = 0; col < NSAMPLES; col++) +#pragma unroll + for (int row = col; row < NSAMPLES; row++) + covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col)); + + update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes); + + // compute actual covariance decomposition + //covariance_decomposition.compute(inverse_cov); + //auto const& matrixL = covariance_decomposition.matrixL(); + DataType matrixLStorage[MapSymM::total]; + MapSymM matrixL{matrixLStorage}; + compute_decomposition_unrolled(matrixL, covMatrix); + + // L * A = P + ColMajorMatrix A; + solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL); + + // L b = s + float reg_b[NSAMPLES]; + solve_forward_subst_vector(reg_b, samples[idx], matrixL); + + // FIXME: shared mem + 
//DataType AtAStorage[MapSymM::total]; + MapSymM AtA{shrAtAStorage}; + //SampleMatrix AtA; + SampleVector Atb; +#pragma unroll + for (int icol = 0; icol < NPULSES; icol++) { + float reg_ai[NSAMPLES]; + +// load column icol +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + reg_ai[counter] = A(counter, icol); + + // compute diagoanl + float sum = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum += reg_ai[counter] * reg_ai[counter]; + + // store + AtA(icol, icol) = sum; + +// go thru the other columns +#pragma unroll + for (int j = icol + 1; j < NPULSES; j++) { + // load column j + float reg_aj[NSAMPLES]; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + reg_aj[counter] = A(counter, j); + + // accum + float sum = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum += reg_aj[counter] * reg_ai[counter]; + + // store + //AtA(icol, j) = sum; + AtA(j, icol) = sum; } - - // FIXME: shared mem - //DataType matrixLForFnnlsStorage[MapSymM::total]; - MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage}; - fnnls( - AtA, + // Atb accum + float sum_atb = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum_atb += reg_ai[counter] * reg_b[counter]; + + // store atb + Atb(icol) = sum_atb; + } + + // FIXME: shared mem + //DataType matrixLForFnnlsStorage[MapSymM::total]; + MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage}; + + fnnls(AtA, Atb, //amplitudes[idx], resultAmplitudes, @@ -298,128 +277,124 @@ void kernel_minimize( pulseOffsets, matrixLForFnnls, 1e-11, - 500 - ); - - { - DataType accum[NSAMPLES]; - // load accum - #pragma unroll - for (int counter=0; counter totalChannels - ? 
1 - : (totalChannels + threads_min - 1) / threads_min; - uint32_t const offsetForHashes = conditions.offsetForHashes; - uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; - auto const nbytesShared = 2 * threads_min * - MapSymM::total * sizeof(DataType); - kernel_minimize<<>>( - eventInputGPU.ebDigis.ids, - eventInputGPU.eeDigis.ids, - scratch.noisecov, - conditions.pulseCovariances.values, - scratch.activeBXs, - scratch.samples, - (SampleVector*)eventOutputGPU.amplitudesAll, - scratch.pulse_matrix, - eventOutputGPU.chi2, - eventOutputGPU.amplitude, - scratch.acState, - totalChannels, - 50, - offsetForHashes, - offsetForInputs); - cudaCheck(cudaGetLastError()); -} - -} - -}} + + namespace v1 { + + void minimization_procedure(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + EventDataForScratchGPU& scratch, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + cudaStream_t cudaStream) { + using DataType = SampleVector::Scalar; + unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis; + // unsigned int threads_min = conf.threads.x; + // TODO: configure from python + unsigned int threads_min = configParameters.kernelMinimizeThreads[0]; + unsigned int blocks_min = threads_min > totalChannels ? 
1 : (totalChannels + threads_min - 1) / threads_min; + uint32_t const offsetForHashes = conditions.offsetForHashes; + uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; + auto const nbytesShared = + 2 * threads_min * MapSymM::total * sizeof(DataType); + kernel_minimize<<>>( + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + scratch.noisecov, + conditions.pulseCovariances.values, + scratch.activeBXs, + scratch.samples, + (SampleVector*)eventOutputGPU.amplitudesAll, + scratch.pulse_matrix, + eventOutputGPU.chi2, + eventOutputGPU.amplitude, + scratch.acState, + totalChannels, + 50, + offsetForHashes, + offsetForInputs); + cudaCheck(cudaGetLastError()); + } + + } // namespace v1 + + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc index bcb199b133c0d..d5980d8a757aa 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalGainRatiosGPU.cc @@ -3,57 +3,50 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalGainRatiosGPU::EcalGainRatiosGPU(EcalGainRatios const& values) - : gain12Over6_(values.size()) - , gain6Over1_(values.size()) -{ - // fill in eb - auto const& barrelValues = values.barrelItems(); - for (unsigned int i=0; igain12Over6_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.gain6Over1, - this->gain6Over1_.size() * sizeof(float)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.gain12Over6, - this->gain12Over6_.data(), - this->gain12Over6_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.gain6Over1, - this->gain6Over1_.data(), - this->gain6Over1_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); +EcalGainRatiosGPU::Product const& EcalGainRatiosGPU::getProduct(cudaStream_t cudaStream) const { + auto const& 
product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalGainRatiosGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.gain12Over6, this->gain12Over6_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.gain6Over1, this->gain6Over1_.size() * sizeof(float))); + // transfer + cudaCheck(cudaMemcpyAsync(product.gain12Over6, + this->gain12Over6_.data(), + this->gain12Over6_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.gain6Over1, + this->gain6Over1_.data(), + this->gain6Over1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalGainRatiosGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc index 401ad8c454737..9e3284cd9c7c8 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPedestalsGPU.cc @@ -3,103 +3,92 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals) - : mean_x12_(pedestals.size()) - , rms_x12_(pedestals.size()) - , mean_x6_(pedestals.size()) - , rms_x6_(pedestals.size()) - , mean_x1_(pedestals.size()) - , rms_x1_(pedestals.size()) -{ +EcalPedestalsGPU::EcalPedestalsGPU(EcalPedestals const& pedestals) + : mean_x12_(pedestals.size()), + rms_x12_(pedestals.size()), + mean_x6_(pedestals.size()), + rms_x6_(pedestals.size()), + mean_x1_(pedestals.size()), + rms_x1_(pedestals.size()) { + // fill in eb + auto const& barrelValues = pedestals.barrelItems(); + for (unsigned int i = 0; i < barrelValues.size(); i++) { + mean_x12_[i] = barrelValues[i].mean_x12; + rms_x12_[i] = barrelValues[i].rms_x12; + mean_x6_[i] = barrelValues[i].mean_x6; + rms_x6_[i] = barrelValues[i].rms_x6; + mean_x1_[i] = 
barrelValues[i].mean_x1; + rms_x1_[i] = barrelValues[i].rms_x1; + } - // fill in eb - auto const& barrelValues = pedestals.barrelItems(); - for (unsigned int i=0; imean_x12_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.rms_x12, - this->mean_x12_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.mean_x6, - this->mean_x12_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.rms_x6, - this->mean_x12_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.mean_x1, - this->mean_x12_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.rms_x1, - this->mean_x12_.size() * sizeof(float)) ); +EcalPedestalsGPU::Product const& EcalPedestalsGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalPedestalsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.mean_x12, this->mean_x12_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.rms_x12, this->mean_x12_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.mean_x6, this->mean_x12_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.rms_x6, this->mean_x12_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.mean_x1, this->mean_x12_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.rms_x1, this->mean_x12_.size() * sizeof(float))); - // transfer - cudaCheck( cudaMemcpyAsync(product.mean_x12, - this->mean_x12_.data(), - this->mean_x12_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.rms_x12, - this->rms_x12_.data(), - this->rms_x12_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.mean_x6, - this->mean_x6_.data(), - this->mean_x6_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.rms_x6, - 
this->rms_x6_.data(), - this->rms_x6_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.mean_x1, - this->mean_x1_.data(), - this->mean_x1_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.rms_x1, - this->rms_x1_.data(), - this->rms_x1_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); + // transfer + cudaCheck(cudaMemcpyAsync(product.mean_x12, + this->mean_x12_.data(), + this->mean_x12_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.rms_x12, + this->rms_x12_.data(), + this->rms_x12_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.mean_x6, + this->mean_x6_.data(), + this->mean_x6_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.rms_x6, + this->rms_x6_.data(), + this->rms_x6_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.mean_x1, + this->mean_x1_.data(), + this->mean_x1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.rms_x1, + this->rms_x1_.data(), + this->rms_x1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalPedestalsGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc index 121a5b9e684f7..bbeda99652e22 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseCovariancesGPU.cc @@ -3,48 +3,40 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values) - : valuesEB_{values.barrelItems()} - , 
valuesEE_{values.endcapItems()} -{} +EcalPulseCovariancesGPU::EcalPulseCovariancesGPU(EcalPulseCovariances const& values) + : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} EcalPulseCovariancesGPU::Product::~Product() { - // deallocation - cudaCheck( cudaFree(values) ); + // deallocation + cudaCheck(cudaFree(values)); } -EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct( - cudaStream_t cudaStream) const -{ - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.values, - (this->valuesEE_.size() + this->valuesEB_.size()) - * sizeof(EcalPulseCovariance)) ); - - // offset in terms of sizeof(EcalPulseCovariance) - uint32_t offset = this->valuesEB_.size(); - - // transfer eb - cudaCheck( cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * - sizeof(EcalPulseCovariance), - cudaMemcpyHostToDevice, - cudaStream) ); - - // transfer ee starting at values + offset - cudaCheck( cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * - sizeof(EcalPulseCovariance), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; +EcalPulseCovariancesGPU::Product const& EcalPulseCovariancesGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalPulseCovariancesGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.values, + (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseCovariance))); + + // offset in terms of sizeof(EcalPulseCovariance) + uint32_t offset = this->valuesEB_.size(); + + // transfer eb + cudaCheck(cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(EcalPulseCovariance), + cudaMemcpyHostToDevice, + cudaStream)); + + // 
transfer ee starting at values + offset + cudaCheck(cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(EcalPulseCovariance), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; } TYPELOOKUP_DATA_REG(EcalPulseCovariancesGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc index 8e8f00795d225..aee122a01627d 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalPulseShapesGPU.cc @@ -3,48 +3,40 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values) - : valuesEB_{values.barrelItems()} - , valuesEE_{values.endcapItems()} -{} +EcalPulseShapesGPU::EcalPulseShapesGPU(EcalPulseShapes const& values) + : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} EcalPulseShapesGPU::Product::~Product() { - // deallocation - cudaCheck( cudaFree(values) ); + // deallocation + cudaCheck(cudaFree(values)); } -EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct( - cudaStream_t cudaStream) const -{ - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.values, - (this->valuesEE_.size() + this->valuesEB_.size()) - * sizeof(EcalPulseShape)) ); - - // offset in terms of sizeof(EcalPulseShape) - plain c array - uint32_t offset = this->valuesEB_.size(); - - // transfer eb - cudaCheck( cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * - sizeof(EcalPulseShape), - cudaMemcpyHostToDevice, - cudaStream) ); - - // transfer ee starting at values + offset - cudaCheck( cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * - 
sizeof(EcalPulseShape), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; +EcalPulseShapesGPU::Product const& EcalPulseShapesGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalPulseShapesGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.values, + (this->valuesEE_.size() + this->valuesEB_.size()) * sizeof(EcalPulseShape))); + + // offset in terms of sizeof(EcalPulseShape) - plain c array + uint32_t offset = this->valuesEB_.size(); + + // transfer eb + cudaCheck(cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(EcalPulseShape), + cudaMemcpyHostToDevice, + cudaStream)); + + // transfer ee starting at values + offset + cudaCheck(cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(EcalPulseShape), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; } TYPELOOKUP_DATA_REG(EcalPulseShapesGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc index 7294c759aaa0d..2a98067f51d9e 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalSamplesCorrelationGPU.cc @@ -3,91 +3,74 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU( - EcalSamplesCorrelation const& values) - : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation} - , EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation} - , EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation} - , EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation} - , EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation} - , EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation} -{} 
+EcalSamplesCorrelationGPU::EcalSamplesCorrelationGPU(EcalSamplesCorrelation const& values) + : EBG12SamplesCorrelation_{values.EBG12SamplesCorrelation}, + EBG6SamplesCorrelation_{values.EBG6SamplesCorrelation}, + EBG1SamplesCorrelation_{values.EBG1SamplesCorrelation}, + EEG12SamplesCorrelation_{values.EEG12SamplesCorrelation}, + EEG6SamplesCorrelation_{values.EEG6SamplesCorrelation}, + EEG1SamplesCorrelation_{values.EEG1SamplesCorrelation} {} EcalSamplesCorrelationGPU::Product::~Product() { - // deallocation - cudaCheck( cudaFree(EBG12SamplesCorrelation) ); - cudaCheck( cudaFree(EBG6SamplesCorrelation) ); - cudaCheck( cudaFree(EBG1SamplesCorrelation) ); - cudaCheck( cudaFree(EEG12SamplesCorrelation) ); - cudaCheck( cudaFree(EEG6SamplesCorrelation) ); - cudaCheck( cudaFree(EEG1SamplesCorrelation) ); + // deallocation + cudaCheck(cudaFree(EBG12SamplesCorrelation)); + cudaCheck(cudaFree(EBG6SamplesCorrelation)); + cudaCheck(cudaFree(EBG1SamplesCorrelation)); + cudaCheck(cudaFree(EEG12SamplesCorrelation)); + cudaCheck(cudaFree(EEG6SamplesCorrelation)); + cudaCheck(cudaFree(EEG1SamplesCorrelation)); } -EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct( - cudaStream_t cudaStream) const -{ - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.EBG12SamplesCorrelation, - this->EBG12SamplesCorrelation_.size() * - sizeof(double)) ); - cudaCheck( cudaMalloc((void**)&product.EBG6SamplesCorrelation, - this->EBG6SamplesCorrelation_.size() * - sizeof(double)) ); - cudaCheck( cudaMalloc((void**)&product.EBG1SamplesCorrelation, - this->EBG1SamplesCorrelation_.size() * - sizeof(double)) ); - cudaCheck( cudaMalloc((void**)&product.EEG12SamplesCorrelation, - this->EEG12SamplesCorrelation_.size() * - sizeof(double)) ); - cudaCheck( cudaMalloc((void**)&product.EEG6SamplesCorrelation, - 
this->EEG6SamplesCorrelation_.size() * - sizeof(double)) ); - cudaCheck( cudaMalloc((void**)&product.EEG1SamplesCorrelation, - this->EEG1SamplesCorrelation_.size() * - sizeof(double)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.EBG12SamplesCorrelation, - this->EBG12SamplesCorrelation_.data(), - this->EBG12SamplesCorrelation_.size() * - sizeof(double), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EBG6SamplesCorrelation, - this->EBG6SamplesCorrelation_.data(), - this->EBG6SamplesCorrelation_.size() * - sizeof(double), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EBG1SamplesCorrelation, - this->EBG1SamplesCorrelation_.data(), - this->EBG1SamplesCorrelation_.size() * - sizeof(double), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EEG12SamplesCorrelation, - this->EEG12SamplesCorrelation_.data(), - this->EEG12SamplesCorrelation_.size() * - sizeof(double), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EEG6SamplesCorrelation, - this->EEG6SamplesCorrelation_.data(), - this->EEG6SamplesCorrelation_.size() * - sizeof(double), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EEG1SamplesCorrelation, - this->EEG1SamplesCorrelation_.data(), - this->EEG1SamplesCorrelation_.size() * - sizeof(double), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); +EcalSamplesCorrelationGPU::Product const& EcalSamplesCorrelationGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalSamplesCorrelationGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.EBG12SamplesCorrelation, + this->EBG12SamplesCorrelation_.size() * sizeof(double))); + cudaCheck( + cudaMalloc((void**)&product.EBG6SamplesCorrelation, this->EBG6SamplesCorrelation_.size() * sizeof(double))); + cudaCheck( + 
cudaMalloc((void**)&product.EBG1SamplesCorrelation, this->EBG1SamplesCorrelation_.size() * sizeof(double))); + cudaCheck(cudaMalloc((void**)&product.EEG12SamplesCorrelation, + this->EEG12SamplesCorrelation_.size() * sizeof(double))); + cudaCheck( + cudaMalloc((void**)&product.EEG6SamplesCorrelation, this->EEG6SamplesCorrelation_.size() * sizeof(double))); + cudaCheck( + cudaMalloc((void**)&product.EEG1SamplesCorrelation, this->EEG1SamplesCorrelation_.size() * sizeof(double))); + // transfer + cudaCheck(cudaMemcpyAsync(product.EBG12SamplesCorrelation, + this->EBG12SamplesCorrelation_.data(), + this->EBG12SamplesCorrelation_.size() * sizeof(double), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EBG6SamplesCorrelation, + this->EBG6SamplesCorrelation_.data(), + this->EBG6SamplesCorrelation_.size() * sizeof(double), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EBG1SamplesCorrelation, + this->EBG1SamplesCorrelation_.data(), + this->EBG1SamplesCorrelation_.size() * sizeof(double), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EEG12SamplesCorrelation, + this->EEG12SamplesCorrelation_.data(), + this->EEG12SamplesCorrelation_.size() * sizeof(double), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EEG6SamplesCorrelation, + this->EEG6SamplesCorrelation_.data(), + this->EEG6SamplesCorrelation_.size() * sizeof(double), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EEG1SamplesCorrelation, + this->EEG1SamplesCorrelation_.data(), + this->EEG1SamplesCorrelation_.size() * sizeof(double), + cudaMemcpyHostToDevice, + cudaStream)); + }); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalSamplesCorrelationGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc index 277661b030c68..9ab0a6302a9c4 100644 --- 
a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeBiasCorrectionsGPU.cc @@ -3,76 +3,59 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU( - EcalTimeBiasCorrections const& values) - : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins} - , EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins} - , EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins} - , EETimeCorrShiftBins_{values.EETimeCorrShiftBins} -{} +EcalTimeBiasCorrectionsGPU::EcalTimeBiasCorrectionsGPU(EcalTimeBiasCorrections const& values) + : EBTimeCorrAmplitudeBins_{values.EBTimeCorrAmplitudeBins}, + EBTimeCorrShiftBins_{values.EBTimeCorrShiftBins}, + EETimeCorrAmplitudeBins_{values.EETimeCorrAmplitudeBins}, + EETimeCorrShiftBins_{values.EETimeCorrShiftBins} {} EcalTimeBiasCorrectionsGPU::Product::~Product() { - // deallocation - cudaCheck( cudaFree(EBTimeCorrAmplitudeBins) ); - cudaCheck( cudaFree(EBTimeCorrShiftBins) ); - cudaCheck( cudaFree(EETimeCorrAmplitudeBins) ); - cudaCheck( cudaFree(EETimeCorrShiftBins) ); + // deallocation + cudaCheck(cudaFree(EBTimeCorrAmplitudeBins)); + cudaCheck(cudaFree(EBTimeCorrShiftBins)); + cudaCheck(cudaFree(EETimeCorrAmplitudeBins)); + cudaCheck(cudaFree(EETimeCorrShiftBins)); } -EcalTimeBiasCorrectionsGPU::Product const& EcalTimeBiasCorrectionsGPU::getProduct( - cudaStream_t cudaStream) const -{ - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) { - // to get the size of vectors later on - // should be removed and host conditions' objects used directly - product.EBTimeCorrAmplitudeBinsSize = - this->EBTimeCorrAmplitudeBins_.size(); - product.EETimeCorrAmplitudeBinsSize = - this->EETimeCorrAmplitudeBins_.size(); +EcalTimeBiasCorrectionsGPU::Product const& 
EcalTimeBiasCorrectionsGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalTimeBiasCorrectionsGPU::Product& product, cudaStream_t cudaStream) { + // to get the size of vectors later on + // should be removed and host conditions' objects used directly + product.EBTimeCorrAmplitudeBinsSize = this->EBTimeCorrAmplitudeBins_.size(); + product.EETimeCorrAmplitudeBinsSize = this->EETimeCorrAmplitudeBins_.size(); - // malloc - cudaCheck( cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins, - this->EBTimeCorrAmplitudeBins_.size() * - sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.EBTimeCorrShiftBins, - this->EBTimeCorrShiftBins_.size() * - sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.EETimeCorrAmplitudeBins, - this->EETimeCorrAmplitudeBins_.size() * - sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.EETimeCorrShiftBins, - this->EETimeCorrShiftBins_.size() * - sizeof(float)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins, - this->EBTimeCorrAmplitudeBins_.data(), - this->EBTimeCorrAmplitudeBins_.size() * - sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EBTimeCorrShiftBins, - this->EBTimeCorrShiftBins_.data(), - this->EBTimeCorrShiftBins_.size() * - sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EETimeCorrAmplitudeBins, - this->EETimeCorrAmplitudeBins_.data(), - this->EETimeCorrAmplitudeBins_.size() * - sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.EETimeCorrShiftBins, - this->EETimeCorrShiftBins_.data(), - this->EETimeCorrShiftBins_.size() * - sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); + // malloc + cudaCheck(cudaMalloc((void**)&product.EBTimeCorrAmplitudeBins, + this->EBTimeCorrAmplitudeBins_.size() * sizeof(float))); + 
cudaCheck(cudaMalloc((void**)&product.EBTimeCorrShiftBins, this->EBTimeCorrShiftBins_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.EETimeCorrAmplitudeBins, + this->EETimeCorrAmplitudeBins_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.EETimeCorrShiftBins, this->EETimeCorrShiftBins_.size() * sizeof(float))); + // transfer + cudaCheck(cudaMemcpyAsync(product.EBTimeCorrAmplitudeBins, + this->EBTimeCorrAmplitudeBins_.data(), + this->EBTimeCorrAmplitudeBins_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EBTimeCorrShiftBins, + this->EBTimeCorrShiftBins_.data(), + this->EBTimeCorrShiftBins_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EETimeCorrAmplitudeBins, + this->EETimeCorrAmplitudeBins_.data(), + this->EETimeCorrAmplitudeBins_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.EETimeCorrShiftBins, + this->EETimeCorrShiftBins_.data(), + this->EETimeCorrShiftBins_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); - return product; + return product; } TYPELOOKUP_DATA_REG(EcalTimeBiasCorrectionsGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc index 1da155b2539f2..d724a33f1d4e1 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalTimeCalibConstantsGPU.cc @@ -3,47 +3,38 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU( - EcalTimeCalibConstants const& values) - : valuesEB_{values.barrelItems()} - , valuesEE_{values.endcapItems()} -{} +EcalTimeCalibConstantsGPU::EcalTimeCalibConstantsGPU(EcalTimeCalibConstants const& values) + : valuesEB_{values.barrelItems()}, 
valuesEE_{values.endcapItems()} {} EcalTimeCalibConstantsGPU::Product::~Product() { - // deallocation - cudaCheck( cudaFree(values) ); + // deallocation + cudaCheck(cudaFree(values)); } -EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct( - cudaStream_t cudaStream) const -{ - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.values, - (this->valuesEB_.size() + this->valuesEE_.size()) * - sizeof(float)) ); - - // offset in floats, not bytes - auto const offset = this->valuesEB_.size(); - - // transfer - cudaCheck( cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * - sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * - sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; +EcalTimeCalibConstantsGPU::Product const& EcalTimeCalibConstantsGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalTimeCalibConstantsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( + cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float))); + + // offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck(cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; } TYPELOOKUP_DATA_REG(EcalTimeCalibConstantsGPU); diff --git 
a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu index b67bb74235e4a..c8d2926b29afc 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu @@ -25,107 +25,99 @@ //#define ECAL_RECO_CUDA_DEBUG -namespace ecal { namespace multifit { - -void entryPoint( - EventInputDataGPU const& eventInputGPU, - EventOutputDataGPU& eventOutputGPU, EventDataForScratchGPU& scratch, - ConditionsProducts const& conditions, - ConfigurationParameters const& configParameters, - cudaStream_t cudaStream) { - using digis_type = std::vector; - using dids_type = std::vector; - // accodring to the cpu setup //----> hardcoded - bool const gainSwitchUseMaxSampleEB = true; - // accodring to the cpu setup //----> hardcoded - bool const gainSwitchUseMaxSampleEE = false; - - uint32_t const offsetForHashes = conditions.offsetForHashes; - uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; - unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + - eventInputGPU.eeDigis.ndigis; - - // - // 1d preparation kernel - // - unsigned int nchannels_per_block = 32; - unsigned int threads_1d = 10 * nchannels_per_block; - unsigned int blocks_1d = threads_1d > 10*totalChannels - ? 
1 : (totalChannels*10 + threads_1d - 1) / threads_1d; - int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES * ( - sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) - + sizeof(bool) - ); - kernel_prep_1d_and_initialize<<>>( - conditions.pulseShapes.values, - eventInputGPU.ebDigis.data, - eventInputGPU.ebDigis.ids, - eventInputGPU.eeDigis.data, - eventInputGPU.eeDigis.ids, - scratch.samples, - (SampleVector*)eventOutputGPU.amplitudesAll, - scratch.gainsNoise, - conditions.pedestals.mean_x1, - conditions.pedestals.mean_x12, - conditions.pedestals.rms_x12, - conditions.pedestals.mean_x6, - conditions.gainRatios.gain6Over1, - conditions.gainRatios.gain12Over6, - scratch.hasSwitchToGain6, - scratch.hasSwitchToGain1, - scratch.isSaturated, - eventOutputGPU.amplitude, - eventOutputGPU.chi2, - eventOutputGPU.pedestal, - eventOutputGPU.did, - eventOutputGPU.flags, - scratch.acState, - scratch.activeBXs, - offsetForHashes, - offsetForInputs, - gainSwitchUseMaxSampleEB, - gainSwitchUseMaxSampleEE, - totalChannels); - cudaCheck(cudaGetLastError()); +namespace ecal { + namespace multifit { - // - // 2d preparation kernel - // - int blocks_2d = totalChannels; - dim3 threads_2d{10, 10}; - kernel_prep_2d<<>>( - scratch.gainsNoise, - eventInputGPU.ebDigis.ids, - eventInputGPU.eeDigis.ids, - conditions.pedestals.rms_x12, - conditions.pedestals.rms_x6, - conditions.pedestals.rms_x1, - conditions.gainRatios.gain12Over6, - conditions.gainRatios.gain6Over1, - conditions.samplesCorrelation.EBG12SamplesCorrelation, - conditions.samplesCorrelation.EBG6SamplesCorrelation, - conditions.samplesCorrelation.EBG1SamplesCorrelation, - conditions.samplesCorrelation.EEG12SamplesCorrelation, - conditions.samplesCorrelation.EEG6SamplesCorrelation, - conditions.samplesCorrelation.EEG1SamplesCorrelation, - scratch.noisecov, - scratch.pulse_matrix, - conditions.pulseShapes.values, - scratch.hasSwitchToGain6, - scratch.hasSwitchToGain1, - scratch.isSaturated, - 
offsetForHashes, - offsetForInputs); - cudaCheck(cudaGetLastError()); - - // run minimization kernels - v1::minimization_procedure( - eventInputGPU, eventOutputGPU, - scratch, conditions, configParameters, cudaStream); + void entryPoint(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + EventDataForScratchGPU& scratch, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + cudaStream_t cudaStream) { + using digis_type = std::vector; + using dids_type = std::vector; + // accodring to the cpu setup //----> hardcoded + bool const gainSwitchUseMaxSampleEB = true; + // accodring to the cpu setup //----> hardcoded + bool const gainSwitchUseMaxSampleEE = false; - if (configParameters.shouldRunTimingComputation) { - + uint32_t const offsetForHashes = conditions.offsetForHashes; + uint32_t const offsetForInputs = eventInputGPU.ebDigis.ndigis; + unsigned int totalChannels = eventInputGPU.ebDigis.ndigis + eventInputGPU.eeDigis.ndigis; + + // + // 1d preparation kernel + // + unsigned int nchannels_per_block = 32; + unsigned int threads_1d = 10 * nchannels_per_block; + unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 
1 : (totalChannels * 10 + threads_1d - 1) / threads_1d; + int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES * + (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool)); + kernel_prep_1d_and_initialize<<>>( + conditions.pulseShapes.values, + eventInputGPU.ebDigis.data, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.data, + eventInputGPU.eeDigis.ids, + scratch.samples, + (SampleVector*)eventOutputGPU.amplitudesAll, + scratch.gainsNoise, + conditions.pedestals.mean_x1, + conditions.pedestals.mean_x12, + conditions.pedestals.rms_x12, + conditions.pedestals.mean_x6, + conditions.gainRatios.gain6Over1, + conditions.gainRatios.gain12Over6, + scratch.hasSwitchToGain6, + scratch.hasSwitchToGain1, + scratch.isSaturated, + eventOutputGPU.amplitude, + eventOutputGPU.chi2, + eventOutputGPU.pedestal, + eventOutputGPU.did, + eventOutputGPU.flags, + scratch.acState, + scratch.activeBXs, + offsetForHashes, + offsetForInputs, + gainSwitchUseMaxSampleEB, + gainSwitchUseMaxSampleEE, + totalChannels); + cudaCheck(cudaGetLastError()); + + // + // 2d preparation kernel + // + int blocks_2d = totalChannels; + dim3 threads_2d{10, 10}; + kernel_prep_2d<<>>(scratch.gainsNoise, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + conditions.pedestals.rms_x12, + conditions.pedestals.rms_x6, + conditions.pedestals.rms_x1, + conditions.gainRatios.gain12Over6, + conditions.gainRatios.gain6Over1, + conditions.samplesCorrelation.EBG12SamplesCorrelation, + conditions.samplesCorrelation.EBG6SamplesCorrelation, + conditions.samplesCorrelation.EBG1SamplesCorrelation, + conditions.samplesCorrelation.EEG12SamplesCorrelation, + conditions.samplesCorrelation.EEG6SamplesCorrelation, + conditions.samplesCorrelation.EEG1SamplesCorrelation, + scratch.noisecov, + scratch.pulse_matrix, + conditions.pulseShapes.values, + scratch.hasSwitchToGain6, + scratch.hasSwitchToGain1, + scratch.isSaturated, + offsetForHashes, + offsetForInputs); + 
cudaCheck(cudaGetLastError()); + + // run minimization kernels + v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream); + + if (configParameters.shouldRunTimingComputation) { // // TODO: this guy can run concurrently with other kernels, // there is no dependence on the order of execution @@ -133,9 +125,8 @@ void entryPoint( unsigned int threads_time_init = threads_1d; unsigned int blocks_time_init = blocks_1d; int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar); - kernel_time_computation_init<<>>( - eventInputGPU.ebDigis.data, + kernel_time_computation_init<<>>( + eventInputGPU.ebDigis.data, eventInputGPU.ebDigis.ids, eventInputGPU.eeDigis.data, eventInputGPU.eeDigis.ids, @@ -156,24 +147,21 @@ void entryPoint( offsetForInputs, conditions.sampleMask.getEcalSampleMaskRecordEB(), conditions.sampleMask.getEcalSampleMaskRecordEE(), - totalChannels - ); + totalChannels); cudaCheck(cudaGetLastError()); - // - // TODO: small kernel only for EB. It needs to be checked if + // + // TODO: small kernel only for EB. It needs to be checked if /// fusing such small kernels is beneficial in here // // we are running only over EB digis // therefore we need to create threads/blocks only for that unsigned int const threadsFixMGPA = threads_1d; - unsigned int const blocksFixMGPA = + unsigned int const blocksFixMGPA = threadsFixMGPA > 10 * eventInputGPU.ebDigis.ndigis ? 
1 - : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) - / threadsFixMGPA; - kernel_time_compute_fixMGPAslew<<>>( + : (10 * eventInputGPU.ebDigis.ndigis + threadsFixMGPA - 1) / threadsFixMGPA; + kernel_time_compute_fixMGPAslew<<>>( eventInputGPU.ebDigis.data, eventInputGPU.eeDigis.data, scratch.sample_values, @@ -181,37 +169,32 @@ void entryPoint( scratch.useless_sample_values, conditions.sampleMask.getEcalSampleMaskRecordEB(), totalChannels, - offsetForInputs - ); + offsetForInputs); cudaCheck(cudaGetLastError()); // - // // - int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * - 4 * sizeof(SampleVector::Scalar); + // + int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar); auto const threads_nullhypot = threads_1d; auto const blocks_nullhypot = blocks_1d; - kernel_time_compute_nullhypot<<>>( + kernel_time_compute_nullhypot<<>>( scratch.sample_values, scratch.sample_value_errors, scratch.useless_sample_values, scratch.chi2sNullHypot, scratch.sum0sNullHypot, scratch.sumAAsNullHypot, - totalChannels - ); + totalChannels); cudaCheck(cudaGetLastError()); unsigned int nchannels_per_block_makeratio = 10; unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio; unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels - ? 1 - : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio; + ? 
1 + : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio; int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar); - kernel_time_compute_makeratio<<>>( + kernel_time_compute_makeratio<<>>( scratch.sample_values, scratch.sample_value_errors, eventInputGPU.ebDigis.ids, @@ -229,15 +212,14 @@ void entryPoint( scratch.accTimeMax, scratch.accTimeWgt, scratch.tcState, - configParameters.timeFitParametersSizeEB, + configParameters.timeFitParametersSizeEB, configParameters.timeFitParametersSizeEE, configParameters.timeFitLimitsFirstEB, configParameters.timeFitLimitsFirstEE, configParameters.timeFitLimitsSecondEB, configParameters.timeFitLimitsSecondEE, totalChannels, - offsetForInputs - ); + offsetForInputs); cudaCheck(cudaGetLastError()); // @@ -245,43 +227,40 @@ void entryPoint( // auto const threads_findamplchi2 = threads_1d; auto const blocks_findamplchi2 = blocks_1d; - int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * - sizeof(SampleVector::Scalar); + int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar); kernel_time_compute_findamplchi2_and_finish<<>>( - scratch.sample_values, - scratch.sample_value_errors, - eventInputGPU.ebDigis.ids, - eventInputGPU.eeDigis.ids, - scratch.useless_sample_values, - scratch.tMaxAlphaBetas, - scratch.tMaxErrorAlphaBetas, - scratch.accTimeMax, - scratch.accTimeWgt, - configParameters.amplitudeFitParametersEB, - configParameters.amplitudeFitParametersEE, - scratch.sumAAsNullHypot, - scratch.sum0sNullHypot, - scratch.chi2sNullHypot, - scratch.tcState, - scratch.ampMaxAlphaBeta, - scratch.ampMaxError, - scratch.timeMax, - scratch.timeError, - totalChannels, - offsetForInputs - ); + threads_findamplchi2, + sharedBytesFindAmplChi2, + cudaStream>>>(scratch.sample_values, + scratch.sample_value_errors, + eventInputGPU.ebDigis.ids, + eventInputGPU.eeDigis.ids, + scratch.useless_sample_values, + scratch.tMaxAlphaBetas, + scratch.tMaxErrorAlphaBetas, + 
scratch.accTimeMax, + scratch.accTimeWgt, + configParameters.amplitudeFitParametersEB, + configParameters.amplitudeFitParametersEE, + scratch.sumAAsNullHypot, + scratch.sum0sNullHypot, + scratch.chi2sNullHypot, + scratch.tcState, + scratch.ampMaxAlphaBeta, + scratch.ampMaxError, + scratch.timeMax, + scratch.timeError, + totalChannels, + offsetForInputs); cudaCheck(cudaGetLastError()); - + // // // auto const threads_timecorr = 32; - auto const blocks_timecorr = threads_timecorr > totalChannels - ? 1 : (totalChannels + threads_timecorr-1) / threads_timecorr; - kernel_time_correction_and_finalize<<>>( + auto const blocks_timecorr = + threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr; + kernel_time_correction_and_finalize<<>>( eventOutputGPU.amplitude, eventInputGPU.ebDigis.data, eventInputGPU.ebDigis.ids, @@ -318,18 +297,18 @@ void entryPoint( configParameters.outOfTimeThreshG61mEE, offsetForHashes, offsetForInputs, - totalChannels - ); + totalChannels); cudaCheck(cudaGetLastError()); - } + } - /* + /* cudaEventRecord(end_event, 0); cudaEventSynchronize(end_event); float ms; cudaEventElapsedTime(&ms, start_event, end_event); std::cout << "elapsed time = " << ms << std::endl; */ -} + } -}} + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu index 6b60f4fc35560..b85f002464f65 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu @@ -3,88 +3,74 @@ #include "DataFormats/EcalDetId/interface/EBDetId.h" #include "DataFormats/EcalDetId/interface/EEDetId.h" -namespace ecal { namespace multifit { - -namespace internal { - -namespace barrel { - -__device__ -__forceinline__ -bool positiveZ(uint32_t id) { return id & 0x10000; } - -__device__ -__forceinline__ -uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; } - -__device__ -__forceinline__ -uint32_t 
iphi(uint32_t id) { return id & 0x1FF; } - -} - -} - -__device__ -uint32_t hashedIndexEB(uint32_t id) { - using namespace internal::barrel; - return (EBDetId::MAX_IETA + - (positiveZ(id) ? ietaAbs(id)-1 : -ietaAbs(id)) ) * EBDetId::MAX_IPHI + - iphi(id)-1; -} - -namespace internal { - -namespace endcap { - -__device__ -__forceinline__ -uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; } - -__device__ -__forceinline__ -uint32_t iy(uint32_t id) { return id & 0x7F; } - -__device__ -__forceinline__ -bool positiveZ(uint32_t id) { return id & 0x4000; } - -// these constants come from EE Det Id -__constant__ -const unsigned short kxf[] = { - 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 16, 51, 16, - 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 6, 51, 6, 51, 6, 51, 6, 51, - 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, - 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, - 1, 59, 1, 58, 4, 56, 4, 51, 4, 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, - 51, 6, 51, 6, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, - 21, 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; - -__constant__ -const unsigned short kdi[] = { - 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270, 300, 330, - 360, 390, 420, 450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821, 858, 895, 932, 969, - 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, 1545, 1590, 1635, 1680, 1725, 1770, - 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, - 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, - 3467, 3506, 3545, 
3584, 3623, 3662, 3701, 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, - 4212, 4253, 4294, 4336, 4378, 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, - 5059, 5104, 5149, 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, - 5908, 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, 6614, - 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, 7129, 7154, 7179, - 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314}; - -} - -} - -__device__ -uint32_t hashedIndexEE(uint32_t id) { - using namespace internal::endcap; - - const uint32_t jx ( ix(id) ) ; - const uint32_t jd ( 2*( iy(id) - 1 ) + ( jx - 1 )/50 ) ; - return ( ( positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd] ) ; -} - -}} +namespace ecal { + namespace multifit { + + namespace internal { + + namespace barrel { + + __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; } + + __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; } + + __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; } + + } // namespace barrel + + } // namespace internal + + __device__ uint32_t hashedIndexEB(uint32_t id) { + using namespace internal::barrel; + return (EBDetId::MAX_IETA + (positiveZ(id) ? 
ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1; + } + + namespace internal { + + namespace endcap { + + __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; } + + __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; } + + __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; } + + // these constants come from EE Det Id + __constant__ const unsigned short kxf[] = { + 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, + 51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, + 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, + 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, + 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, 1, 59, 1, 58, 4, 56, 4, 51, 4, + 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, + 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21, + 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; + + __constant__ const unsigned short kdi[] = { + 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270, + 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821, + 858, 895, 932, 969, 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, + 1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, + 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, + 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701, + 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378, + 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 
4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149, + 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908, + 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, + 6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, + 7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314}; + + } // namespace endcap + + } // namespace internal + + __device__ uint32_t hashedIndexEE(uint32_t id) { + using namespace internal::endcap; + + const uint32_t jx(ix(id)); + const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50); + return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]); + } + + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu index 4c538a2e352ad..5089676ed0c9f 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu @@ -17,50 +17,46 @@ //#define ECAL_RECO_CUDA_DEBUG -namespace ecal { namespace multifit { - -__device__ -__forceinline__ -bool use_sample(unsigned int sample_mask, unsigned int sample) { - return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1))); -} - -__global__ -void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - bool const* useless_sample_values, - SampleVector::Scalar* chi2s, - SampleVector::Scalar* sum0s, - SampleVector::Scalar* sumAAs, - int const nchannels) { - using ScalarType = SampleVector::Scalar; - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int tx = threadIdx.x + blockDim.x*blockIdx.x; - int ltx = threadIdx.x; - int ch = tx / nsamples; - int nchannels_per_block = blockDim.x / nsamples; - - // TODO: make sure that this branch plays nicely with __syncthreads 
inside - // can there be a deadlock even if the thread is inactive - if (ch < nchannels) { - // +namespace ecal { + namespace multifit { + + __device__ __forceinline__ bool use_sample(unsigned int sample_mask, unsigned int sample) { + return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1))); + } + + __global__ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + bool const* useless_sample_values, + SampleVector::Scalar* chi2s, + SampleVector::Scalar* sum0s, + SampleVector::Scalar* sumAAs, + const int nchannels) { + using ScalarType = SampleVector::Scalar; + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + int tx = threadIdx.x + blockDim.x * blockIdx.x; + int ltx = threadIdx.x; + int ch = tx / nsamples; + int nchannels_per_block = blockDim.x / nsamples; + + // TODO: make sure that this branch plays nicely with __syncthreads inside + // can there be a deadlock even if the thread is inactive + if (ch < nchannels) { + // int sample = tx % nsamples; // shared mem inits extern __shared__ char sdata[]; char* s_sum0 = sdata; - SampleVector::Scalar* s_sum1 = reinterpret_cast( - s_sum0 + nchannels_per_block*nsamples); - SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block*nsamples; - SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block*nsamples; + SampleVector::Scalar* s_sum1 = reinterpret_cast(s_sum0 + nchannels_per_block * nsamples); + SampleVector::Scalar* s_sumA = s_sum1 + nchannels_per_block * nsamples; + SampleVector::Scalar* s_sumAA = s_sumA + nchannels_per_block * nsamples; // TODO make sure no div by 0 - auto const inv_error = useless_sample_values[tx] - ? 0.0 - : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]); - auto const sample_value = sample_values[tx]; + const auto inv_error = + useless_sample_values[tx] ? 
0.0 : 1.0 / (sample_value_errors[tx] * sample_value_errors[tx]); + const auto sample_value = sample_values[tx]; s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1; s_sum1[ltx] = inv_error; s_sumA[ltx] = sample_value * inv_error; @@ -68,380 +64,349 @@ void kernel_time_compute_nullhypot(SampleVector::Scalar const* sample_values, __syncthreads(); // 5 threads for [0, 4] samples - if (sample<5) { - s_sum0[ltx] += s_sum0[ltx+5]; - s_sum1[ltx] += s_sum1[ltx+5]; - s_sumA[ltx] += s_sumA[ltx+5]; - s_sumAA[ltx] += s_sumAA[ltx+5]; + if (sample < 5) { + s_sum0[ltx] += s_sum0[ltx + 5]; + s_sum1[ltx] += s_sum1[ltx + 5]; + s_sumA[ltx] += s_sumA[ltx + 5]; + s_sumAA[ltx] += s_sumAA[ltx + 5]; } __syncthreads(); - if (sample<2) { - // note double counting of sample 3 - s_sum0[ltx] += s_sum0[ltx+2] + s_sum0[ltx+3]; - s_sum1[ltx] += s_sum1[ltx+2] + s_sum1[ltx+3]; - s_sumA[ltx] += s_sumA[ltx+2] + s_sumA[ltx+3]; - s_sumAA[ltx] += s_sumAA[ltx+2] + s_sumAA[ltx+3]; + if (sample < 2) { + // note double counting of sample 3 + s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3]; + s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3]; + s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3]; + s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3]; } __syncthreads(); if (sample == 0) { - // note, subtract to remove the double counting of sample == 3 - //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3]; - //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3]; - //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3]; - //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3]; - auto const sum0 = s_sum0[ltx] + s_sum0[ltx+1] - s_sum0[ltx+3]; - auto const sum1 = s_sum1[ltx] + s_sum1[ltx+1] - s_sum1[ltx+3]; - auto const sumA = s_sumA[ltx] + s_sumA[ltx+1] - s_sumA[ltx+3]; - auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx+1] - s_sumAA[ltx+3]; - auto const chi2 = sum0>0 - ? 
(sumAA - sumA * sumA / sum1) / sum0 - : static_cast(0); - chi2s[ch] = chi2; - sum0s[ch] = sum0; - sumAAs[ch] = sumAA; + // note, subtract to remove the double counting of sample == 3 + //s_sum0[ltx] += s_sum0[ltx+1] - s_sum0[ltx+3]; + //s_sum1[ltx] += s_sum1[ltx+1] - s_sum1[ltx+3]; + //s_sumA[ltx] += s_sumA[ltx+1] - s_sumA[ltx+3]; + //s_sumAA[ltx] += s_sumAA[ltx+1] - s_sumAA[ltx+3]; + const auto sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3]; + const auto sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3]; + const auto sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3]; + const auto sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3]; + const auto chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast(0); + chi2s[ch] = chi2; + sum0s[ch] = sum0; + sumAAs[ch] = sumAA; #ifdef DEBUG_TC_NULLHYPOT - if (ch == 0) { - printf("chi2 = %f sum0 = %d sumAA = %f\n", - chi2, static_cast(sum0), sumAA); - } + if (ch == 0) { + printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast(sum0), sumAA); + } #endif } + } } -} - -constexpr float fast_expf(float x) { return unsafe_expf<6>(x); } -constexpr float fast_logf(float x) { return unsafe_logf<7>(x); } - -//#define DEBUG_TC_MAKERATIO -// -// launch ctx parameters are -// 45 threads per channel, X channels per block, Y blocks -// 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 -// TODO: it might be much beter to use 32 threads per channel instead of 45 -// to simplify the synchronization -// -__global__ -void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids_eb, - uint32_t const* dids_ee, - bool const* useless_sample_values, - char const* pedestal_nums, - ConfigurationParameters::type const* amplitudeFitParametersEB, - ConfigurationParameters::type const* amplitudeFitParametersEE, - ConfigurationParameters::type const* timeFitParametersEB, - ConfigurationParameters::type const* 
timeFitParametersEE, - SampleVector::Scalar const* sumAAsNullHypot, - SampleVector::Scalar const* sum0sNullHypot, - SampleVector::Scalar* tMaxAlphaBetas, - SampleVector::Scalar* tMaxErrorAlphaBetas, - SampleVector::Scalar* g_accTimeMax, - SampleVector::Scalar* g_accTimeWgt, - TimeComputationState* g_state, - unsigned int const timeFitParameters_sizeEB, - unsigned int const timeFitParameters_sizeEE, - ConfigurationParameters::type const timeFitLimits_firstEB, - ConfigurationParameters::type const timeFitLimits_firstEE, - ConfigurationParameters::type const timeFitLimits_secondEB, - ConfigurationParameters::type const timeFitLimits_secondEE, - int const nchannels, - uint32_t const offsetForInputs) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2 - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockDim.x*blockIdx.x; - int const ch = gtx / nthreads_per_channel; - int const lch = threadIdx.x / nthreads_per_channel; - int const ltx = threadIdx.x % nthreads_per_channel; - int const ch_start = ch*nsamples; - int const lch_start = lch*nthreads_per_channel; - int const nchannels_per_block = blockDim.x / nthreads_per_channel; - auto const* dids = ch >= offsetForInputs - ? dids_ee - : dids_eb; - int const inputCh = ch >= offsetForInputs - ? ch - offsetForInputs - : ch; - - // rmeove inactive threads - // TODO: need to understand if this is 100% safe in presence of syncthreads - if (ch >= nchannels) return; - - auto const did = DetId{dids[inputCh]}; - auto const isBarrel = did.subdetId() == EcalBarrel; - auto const* amplitudeFitParameters = isBarrel - ? amplitudeFitParametersEB - : amplitudeFitParametersEE; - auto const* timeFitParameters = isBarrel - ? timeFitParametersEB - : timeFitParametersEE; - auto const timeFitParameters_size = isBarrel - ? timeFitParameters_sizeEB - : timeFitParameters_sizeEE; - auto const timeFitLimits_first = isBarrel - ? 
timeFitLimits_firstEB - : timeFitLimits_firstEE; - auto const timeFitLimits_second = isBarrel - ? timeFitLimits_secondEB - : timeFitLimits_secondEE; - - extern __shared__ char smem[]; - ScalarType* shr_chi2s = reinterpret_cast(smem); - ScalarType* shr_time_wgt = shr_chi2s + blockDim.x; - ScalarType* shr_time_max = shr_time_wgt + blockDim.x; - ScalarType* shrTimeMax = shr_time_max + blockDim.x; - ScalarType* shrTimeWgt = shrTimeMax + blockDim.x; - - // map tx -> (sample_i, sample_j) - int sample_i, sample_j = 0; - if (ltx>=0 && ltx<=8) { + + constexpr float fast_expf(float x) { return unsafe_expf<6>(x); } + constexpr float fast_logf(float x) { return unsafe_logf<7>(x); } + + //#define DEBUG_TC_MAKERATIO + // + // launch ctx parameters are + // 45 threads per channel, X channels per block, Y blocks + // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 + // TODO: it might be much beter to use 32 threads per channel instead of 45 + // to simplify the synchronization + // + __global__ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids_eb, + uint32_t const* dids_ee, + bool const* useless_sample_values, + char const* pedestal_nums, + ConfigurationParameters::type const* amplitudeFitParametersEB, + ConfigurationParameters::type const* amplitudeFitParametersEE, + ConfigurationParameters::type const* timeFitParametersEB, + ConfigurationParameters::type const* timeFitParametersEE, + SampleVector::Scalar const* sumAAsNullHypot, + SampleVector::Scalar const* sum0sNullHypot, + SampleVector::Scalar* tMaxAlphaBetas, + SampleVector::Scalar* tMaxErrorAlphaBetas, + SampleVector::Scalar* g_accTimeMax, + SampleVector::Scalar* g_accTimeWgt, + TimeComputationState* g_state, + unsigned const int timeFitParameters_sizeEB, + unsigned const int timeFitParameters_sizeEE, + ConfigurationParameters::type const timeFitLimits_firstEB, + ConfigurationParameters::type const 
timeFitLimits_firstEE, + ConfigurationParameters::type const timeFitLimits_secondEB, + ConfigurationParameters::type const timeFitLimits_secondEE, + const int nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nthreads_per_channel = 45; // n=10, n(n-1)/2 + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockDim.x * blockIdx.x; + const int ch = gtx / nthreads_per_channel; + const int ltx = threadIdx.x % nthreads_per_channel; + const int ch_start = ch * nsamples; + const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + + // rmeove inactive threads + // TODO: need to understand if this is 100% safe in presence of syncthreads + if (ch >= nchannels) + return; + + const auto did = DetId{dids[inputCh]}; + const auto isBarrel = did.subdetId() == EcalBarrel; + const auto* amplitudeFitParameters = isBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE; + const auto* timeFitParameters = isBarrel ? timeFitParametersEB : timeFitParametersEE; + const auto timeFitParameters_size = isBarrel ? timeFitParameters_sizeEB : timeFitParameters_sizeEE; + const auto timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE; + const auto timeFitLimits_second = isBarrel ? 
timeFitLimits_secondEB : timeFitLimits_secondEE; + + extern __shared__ char smem[]; + ScalarType* shr_chi2s = reinterpret_cast(smem); + ScalarType* shr_time_wgt = shr_chi2s + blockDim.x; + ScalarType* shr_time_max = shr_time_wgt + blockDim.x; + ScalarType* shrTimeMax = shr_time_max + blockDim.x; + ScalarType* shrTimeWgt = shrTimeMax + blockDim.x; + + // map tx -> (sample_i, sample_j) + int sample_i, sample_j = 0; + if (ltx >= 0 && ltx <= 8) { sample_i = 0; - sample_j = 1+ltx; - } else if (ltx<=16) { + sample_j = 1 + ltx; + } else if (ltx <= 16) { sample_i = 1; - sample_j = 2+ltx-9; - } else if (ltx<=23) { + sample_j = 2 + ltx - 9; + } else if (ltx <= 23) { sample_i = 2; sample_j = 3 + ltx - 17; - } else if (ltx<=29) { + } else if (ltx <= 29) { sample_i = 3; sample_j = 4 + ltx - 24; - } else if (ltx<=34) { + } else if (ltx <= 34) { sample_i = 4; sample_j = 5 + ltx - 30; - } else if (ltx<=38) { + } else if (ltx <= 38) { sample_i = 5; sample_j = 6 + ltx - 35; - } else if (ltx<=41) { + } else if (ltx <= 41) { sample_i = 6; sample_j = 7 + ltx - 39; - } else if (ltx<=43) { + } else if (ltx <= 43) { sample_i = 7; sample_j = 8 + ltx - 42; - } else if (ltx <= 44) { + } else if (ltx <= 44) { sample_i = 8; sample_j = 9; - } else + } else assert(false); - auto const tx_i = ch_start + sample_i; - auto const tx_j = ch_start + sample_j; + const auto tx_i = ch_start + sample_i; + const auto tx_j = ch_start + sample_j; - // - // note, given the way we partition the block, with 45 threads per channel - // we will end up with inactive threads which need to be dragged along - // through the synching point - // - /* + // + // note, given the way we partition the block, with 45 threads per channel + // we will end up with inactive threads which need to be dragged along + // through the synching point + // + /* bool const condToExit = ch >= nchannels ? 
true : useless_sample_values[tx_i] || useless_sample_values[tx_j] || sample_values[tx_i]<=1 || sample_values[tx_j]<=1; */ - bool const condForUselessSamples = useless_sample_values[tx_i] - || useless_sample_values[tx_j] - || sample_values[tx_i]<=1 || sample_values[tx_j]<=1; - - // - // see cpu implementation for explanation - // - ScalarType chi2 = std::numeric_limits::max(); - ScalarType tmax = 0; - ScalarType tmaxerr = 0; - shrTimeMax[threadIdx.x] = 0; - shrTimeWgt[threadIdx.x] = 0; - bool internalCondForSkipping1 = true; - bool internalCondForSkipping2 = true; - if (!condForUselessSamples) { - auto const rtmp = sample_values[tx_i] / sample_values[tx_j]; - auto const invampl_i = 1.0 / sample_values[tx_i]; - auto const relErr2_i = sample_value_errors[tx_i]*sample_value_errors[tx_i]* - invampl_i*invampl_i; - auto const invampl_j = 1.0 / sample_values[tx_j]; - auto const relErr2_j = sample_value_errors[tx_j]*sample_value_errors[tx_j]* - invampl_j*invampl_j; - auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j); - auto err2 = sample_value_errors[tx_j]* - (sample_values[tx_i] - sample_values[tx_j])*(invampl_j*invampl_j); + bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] || + sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1; + + // + // see cpu implementation for explanation + // + ScalarType chi2 = std::numeric_limits::max(); + ScalarType tmax = 0; + ScalarType tmaxerr = 0; + shrTimeMax[threadIdx.x] = 0; + shrTimeWgt[threadIdx.x] = 0; + bool internalCondForSkipping1 = true; + bool internalCondForSkipping2 = true; + if (!condForUselessSamples) { + const auto rtmp = sample_values[tx_i] / sample_values[tx_j]; + const auto invampl_i = 1.0 / sample_values[tx_i]; + const auto relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i; + const auto invampl_j = 1.0 / sample_values[tx_j]; + const auto relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j; + 
const auto err1 = rtmp * rtmp * (relErr2_i + relErr2_j); + auto err2 = sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j); // TODO non-divergent branch for a block if each block has 1 channel // otherwise non-divergent for groups of 45 threads // at this point, pedestal_nums[ch] can be either 0, 1 or 2 - if (pedestal_nums[ch]==2) - err2 *= err2 * 0.5; - auto const err3 = (0.289*0.289) * (invampl_j*invampl_j); - auto const total_error = std::sqrt(err1 + err2 + err3); + if (pedestal_nums[ch] == 2) + err2 *= err2 * 0.5; + const auto err3 = (0.289 * 0.289) * (invampl_j * invampl_j); + const auto total_error = std::sqrt(err1 + err2 + err3); - auto const alpha = amplitudeFitParameters[0]; - auto const beta = amplitudeFitParameters[1]; - auto const alphabeta = alpha * beta; - auto const invalphabeta = 1.0 / alphabeta; + const auto alpha = amplitudeFitParameters[0]; + const auto beta = amplitudeFitParameters[1]; + const auto alphabeta = alpha * beta; + const auto invalphabeta = 1.0 / alphabeta; // variables instead of a struct - auto const ratio_index = sample_i; - auto const ratio_step = sample_j - sample_i; - auto const ratio_value = rtmp; - auto const ratio_error = total_error; - - auto const rlim_i_j = fast_expf( - static_cast(sample_j - sample_i) / beta) - 0.001; - internalCondForSkipping1 = !(total_error<1.0 && rtmp>0.001 && rtmp(sample_j - sample_i) / beta) - 0.001; + internalCondForSkipping1 = !(total_error < 1.0 && rtmp > 0.001 && rtmp < rlim_i_j); if (!internalCondForSkipping1) { - // - // precompute. 
- // in cpu version this was done conditionally - // however easier to do it here (precompute) and then just filter out - // if not needed - // - auto const l_timeFitLimits_first = timeFitLimits_first; - auto const l_timeFitLimits_second = timeFitLimits_second; - if (ratio_step == 1 - && ratio_value >= l_timeFitLimits_first - && ratio_value <= l_timeFitLimits_second) { - - auto const time_max_i = static_cast(ratio_index); - auto u = timeFitParameters[timeFitParameters_size - 1]; + // + // precompute. + // in cpu version this was done conditionally + // however easier to do it here (precompute) and then just filter out + // if not needed + // + const auto l_timeFitLimits_first = timeFitLimits_first; + const auto l_timeFitLimits_second = timeFitLimits_second; + if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) { + const auto time_max_i = static_cast(ratio_index); + auto u = timeFitParameters[timeFitParameters_size - 1]; #pragma unroll - for (int k=timeFitParameters_size-2; k>=0; k--) - u = u*ratio_value + timeFitParameters[k]; - - auto du = (timeFitParameters_size - 1) * - (timeFitParameters[timeFitParameters_size - 1]); - for (int k=timeFitParameters_size - 2; k>=1; k--) - du = du*ratio_value + k*timeFitParameters[k]; - - auto const error2 = ratio_error * ratio_error * du * du; - auto const time_max = error2 > 0 - ? (time_max_i - u) / error2 - : static_cast(0); - auto const time_wgt = error2 > 0 - ? 1.0 / error2 - : static_cast(0); - - // store into shared mem - // note, this name is essentially identical to the one used - // below. - shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0; - shrTimeWgt[threadIdx.x] = error2 > 0 ? 
time_wgt : 0; - } else { - shrTimeMax[threadIdx.x] = 0; - shrTimeWgt[threadIdx.x] = 0; - } - - // continue with ratios - auto const stepOverBeta = static_cast(ratio_step) / beta; - auto const offset = static_cast(ratio_index) + alphabeta; - auto const rmin = std::max(ratio_value - ratio_error, 0.001); - auto const rmax = std::min(ratio_value + ratio_error, - fast_expf(static_cast(ratio_step) / beta) - - 0.001); - auto const time1 = - offset - - ratio_step / - (fast_expf((stepOverBeta - fast_logf(rmin)) / - alpha) - 1.0); - auto const time2 = - offset - - ratio_step / - (fast_expf((stepOverBeta - fast_logf(rmax)) / - alpha) - 1.0); - - // set these guys - tmax = 0.5 * (time1 + time2); - tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2)); + for (int k = timeFitParameters_size - 2; k >= 0; k--) + u = u * ratio_value + timeFitParameters[k]; + + auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]); + for (int k = timeFitParameters_size - 2; k >= 1; k--) + du = du * ratio_value + k * timeFitParameters[k]; + + const auto error2 = ratio_error * ratio_error * du * du; + const auto time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast(0); + const auto time_wgt = error2 > 0 ? 1.0 / error2 : static_cast(0); + + // store into shared mem + // note, this name is essentially identical to the one used + // below. + shrTimeMax[threadIdx.x] = error2 > 0 ? time_max : 0; + shrTimeWgt[threadIdx.x] = error2 > 0 ? 
time_wgt : 0; + } else { + shrTimeMax[threadIdx.x] = 0; + shrTimeWgt[threadIdx.x] = 0; + } + + // continue with ratios + const auto stepOverBeta = static_cast(ratio_step) / beta; + const auto offset = static_cast(ratio_index) + alphabeta; + const auto rmin = std::max(ratio_value - ratio_error, 0.001); + const auto rmax = std::min(ratio_value + ratio_error, + fast_expf(static_cast(ratio_step) / beta) - 0.001); + const auto time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.0); + const auto time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.0); + + // set these guys + tmax = 0.5 * (time1 + time2); + tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2)); #ifdef DEBUG_TC_MAKERATIO - if (ch == 1 || ch == 0) - printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n", - ch, ltx, tmax, tmaxerr, time1, time2, offset, rmin, rmax); + if (ch == 1 || ch == 0) + printf("ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = %f\n", + ch, + ltx, + tmax, + tmaxerr, + time1, + time2, + offset, + rmin, + rmax); #endif - SampleVector::Scalar sumAf = 0; - SampleVector::Scalar sumff = 0; - int const itmin = std::max(-1, static_cast(std::floor(tmax - alphabeta))); - auto loffset = (static_cast(itmin) - tmax) * invalphabeta; - // TODO: data dependence - for (int it = itmin+1; it 1e-6) - ? 
fast_expf(alpha * (fast_logf(term1) - loffset)) - : 0; - sumAf += sample_values[ch_start+it] * (f * inverr2); - sumff += f*(f*inverr2); - } - - auto const sumAA = sumAAsNullHypot[ch]; - auto const sum0 = sum0sNullHypot[ch]; - chi2 = sumAA; - ScalarType amp = 0; - // TODO: sum0 can not be 0 below, need to introduce the check upfront - if (sumff > 0) { - chi2 = sumAA - sumAf * (sumAf / sumff); - amp = sumAf / sumff; - } - chi2 /= sum0; + SampleVector::Scalar sumAf = 0; + SampleVector::Scalar sumff = 0; + const int itmin = std::max(-1, static_cast(std::floor(tmax - alphabeta))); + auto loffset = (static_cast(itmin) - tmax) * invalphabeta; + // TODO: data dependence + for (int it = itmin + 1; it < nsamples; it++) { + loffset += invalphabeta; + if (useless_sample_values[ch_start + it]) + continue; + const auto inverr2 = 1.0 / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]); + const auto term1 = 1.0 + loffset; + const auto f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0; + sumAf += sample_values[ch_start + it] * (f * inverr2); + sumff += f * (f * inverr2); + } + + const auto sumAA = sumAAsNullHypot[ch]; + const auto sum0 = sum0sNullHypot[ch]; + chi2 = sumAA; + // TODO: sum0 can not be 0 below, need to introduce the check upfront + if (sumff > 0) { + chi2 = sumAA - sumAf * (sumAf / sumff); + } + chi2 /= sum0; #ifdef DEBUG_TC_MAKERATIO - if (ch == 1 || ch == 0) - printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n", - ch, ltx, sumAf, sumff, sumAA, static_cast(sum0), tmax, tmaxerr, chi2); + if (ch == 1 || ch == 0) + printf("ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = %f\n", + ch, + ltx, + sumAf, + sumff, + sumAA, + static_cast(sum0), + tmax, + tmaxerr, + chi2); #endif - if (chi2>0 && tmax>0 && tmaxerr>0) - internalCondForSkipping2 = false; - else - chi2 = std::numeric_limits::max(); + if (chi2 > 0 && tmax > 0 && tmaxerr > 0) + 
internalCondForSkipping2 = false; + else + chi2 = std::numeric_limits::max(); } - } + } - // store into smem - shr_chi2s[threadIdx.x] = chi2; - __syncthreads(); + // store into smem + shr_chi2s[threadIdx.x] = chi2; + __syncthreads(); - // find min chi2 - quite crude for now - // TODO validate/check - char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; - bool oddElements = nthreads_per_channel % 2; + // find min chi2 - quite crude for now + // TODO validate/check + char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + bool oddElements = nthreads_per_channel % 2; #pragma unroll - while (iter>=1) { + while (iter >= 1) { if (ltx < iter) - // for odd ns, the last guy will just store itself - // exception is for ltx == 0 and iter==1 - shr_chi2s[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) - ? shr_chi2s[threadIdx.x] - : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x+iter]); + // for odd ns, the last guy will just store itself + // exception is for ltx == 0 and iter==1 + shr_chi2s[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shr_chi2s[threadIdx.x] + : std::min(shr_chi2s[threadIdx.x], shr_chi2s[threadIdx.x + iter]); __syncthreads(); oddElements = iter % 2; - iter = iter==1 ? iter/2 : iter/2 + iter%2; - } + iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; + } - // filter out inactive or useless samples threads - if (!condForUselessSamples && !internalCondForSkipping1 - && !internalCondForSkipping2) { + // filter out inactive or useless samples threads + if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) { // min chi2, now compute weighted average of tmax measurements // see cpu version for more explanation - auto const chi2min = shr_chi2s[threadIdx.x - ltx]; - auto const chi2Limit = chi2min + 1.0; - auto const inverseSigmaSquared = - chi2 < chi2Limit - ? 
1.0 / (tmaxerr * tmaxerr) - : 0.0; + const auto chi2min = shr_chi2s[threadIdx.x - ltx]; + const auto chi2Limit = chi2min + 1.0; + const auto inverseSigmaSquared = chi2 < chi2Limit ? 1.0 / (tmaxerr * tmaxerr) : 0.0; #ifdef DEBUG_TC_MAKERATIO if (ch == 1 || ch == 0) - printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n", - ch, ltx, chi2min, chi2Limit, inverseSigmaSquared); + printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n", + ch, + ltx, + chi2min, + chi2Limit, + inverseSigmaSquared); #endif // store into shared mem and run reduction @@ -449,53 +414,53 @@ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, // TODO: check if shuffling intrinsics are better shr_time_wgt[threadIdx.x] = inverseSigmaSquared; shr_time_max[threadIdx.x] = tmax * inverseSigmaSquared; - } else { + } else { shr_time_wgt[threadIdx.x] = 0; shr_time_max[threadIdx.x] = 0; - } - __syncthreads(); + } + __syncthreads(); - // reduce to compute time_max and time_wgt - iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; - oddElements = nthreads_per_channel % 2; + // reduce to compute time_max and time_wgt + iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + oddElements = nthreads_per_channel % 2; #pragma unroll - while (iter>=1) { + while (iter >= 1) { if (ltx < iter) { - shr_time_wgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) - ? shr_time_wgt[threadIdx.x] - : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x+iter]; - shr_time_max[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) - ? shr_time_max[threadIdx.x] - : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x+iter]; - shrTimeMax[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) - ? shrTimeMax[threadIdx.x] - : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x+iter]; - shrTimeWgt[threadIdx.x] = oddElements && (ltx==iter-1 && ltx>0) - ? 
shrTimeWgt[threadIdx.x] - : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x+iter]; + shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shr_time_wgt[threadIdx.x] + : shr_time_wgt[threadIdx.x] + shr_time_wgt[threadIdx.x + iter]; + shr_time_max[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shr_time_max[threadIdx.x] + : shr_time_max[threadIdx.x] + shr_time_max[threadIdx.x + iter]; + shrTimeMax[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shrTimeMax[threadIdx.x] + : shrTimeMax[threadIdx.x] + shrTimeMax[threadIdx.x + iter]; + shrTimeWgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0) + ? shrTimeWgt[threadIdx.x] + : shrTimeWgt[threadIdx.x] + shrTimeWgt[threadIdx.x + iter]; } - + __syncthreads(); oddElements = iter % 2; - iter = iter==1 ? iter/2 : iter/2 + iter%2; - } + iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; + } - // load from shared memory the 0th guy (will contain accumulated values) - // compute - // store into global mem - if (ltx == 0) { - auto const tmp_time_max = shr_time_max[threadIdx.x]; - auto const tmp_time_wgt = shr_time_wgt[threadIdx.x]; + // load from shared memory the 0th guy (will contain accumulated values) + // compute + // store into global mem + if (ltx == 0) { + const auto tmp_time_max = shr_time_max[threadIdx.x]; + const auto tmp_time_wgt = shr_time_wgt[threadIdx.x]; // we are done if there number of time ratios is 0 - if (tmp_time_wgt==0 && tmp_time_max==0) { - g_state[ch] = TimeComputationState::Finished; - return ; + if (tmp_time_wgt == 0 && tmp_time_max == 0) { + g_state[ch] = TimeComputationState::Finished; + return; } // no div by 0 - auto const tMaxAlphaBeta = tmp_time_max / tmp_time_wgt; - auto const tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt); + const auto tMaxAlphaBeta = tmp_time_max / tmp_time_wgt; + const auto tMaxErrorAlphaBeta = 1.0 / std::sqrt(tmp_time_wgt); tMaxAlphaBetas[ch] = tMaxAlphaBeta; tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta; @@ 
-504,22 +469,22 @@ void kernel_time_compute_makeratio(SampleVector::Scalar const* sample_values, g_state[ch] = TimeComputationState::NotFinished; #ifdef DEBUG_TC_MAKERATIO - printf("ch = %d time_max = %f time_wgt = %f\n", - ch, tmp_time_max, tmp_time_wgt); - printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n", - ch, tMaxAlphaBeta, tMaxErrorAlphaBeta, - shrTimeMax[threadIdx.x], - shrTimeWgt[threadIdx.x]); + printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt); + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n", + ch, + tMaxAlphaBeta, + tMaxErrorAlphaBeta, + shrTimeMax[threadIdx.x], + shrTimeWgt[threadIdx.x]); #endif + } } -} - -/// launch ctx parameters are -/// 10 threads per channel, N channels per block, Y blocks -/// TODO: do we need to keep the state around or can be removed?! -//#define DEBUG_FINDAMPLCHI2_AND_FINISH -__global__ -void kernel_time_compute_findamplchi2_and_finish( + + /// launch ctx parameters are + /// 10 threads per channel, N channels per block, Y blocks + /// TODO: do we need to keep the state around or can be removed?! + //#define DEBUG_FINDAMPLCHI2_AND_FINISH + __global__ void kernel_time_compute_findamplchi2_and_finish( SampleVector::Scalar const* sample_values, SampleVector::Scalar const* sample_value_errors, uint32_t const* dids_eb, @@ -539,451 +504,387 @@ void kernel_time_compute_findamplchi2_and_finish( SampleVector::Scalar* g_ampMaxError, SampleVector::Scalar* g_timeMax, SampleVector::Scalar* g_timeError, - int const nchannels, + const int nchannels, uint32_t const offsetForInputs) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x*blockDim.x; - int const ch = gtx / nsamples; - int const sample = threadIdx.x % nsamples; - int const ch_start = ch * nsamples; - auto const* dids = ch >= offsetForInputs - ? 
dids_ee - : dids_eb; - int const inputCh = ch >= offsetForInputs - ? ch - offsetForInputs - : ch; - - // configure shared mem - // per block, we need #threads per block * 2 * sizeof(ScalarType) - // we run with N channels per block - extern __shared__ char smem[]; - ScalarType* shr_sumAf = reinterpret_cast(smem); - ScalarType* shr_sumff = shr_sumAf + blockDim.x; - - if (ch >= nchannels) return; - - auto state = g_state[ch]; - auto const did = DetId{dids[inputCh]}; - auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel - ? amplitudeFitParametersEB - : amplitudeFitParametersEE; - - - // TODO is that better than storing into global and launching another kernel - // for the first 10 threads - if (state == TimeComputationState::NotFinished) { - auto const alpha = amplitudeFitParameters[0]; - auto const beta = amplitudeFitParameters[1]; - auto const alphabeta = alpha * beta; - auto const invalphabeta = 1.0 / alphabeta; - auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; - auto const sample_value = sample_values[gtx]; - auto const sample_value_error = sample_value_errors[gtx]; - auto const inverr2 = useless_samples[gtx] - ? static_cast(0) - : 1.0 / (sample_value_error * sample_value_error); - auto const offset = (static_cast(sample) - tMaxAlphaBeta) - * invalphabeta; - auto const term1 = 1.0 + offset; - auto const f = term1 > 1e-6 - ? fast_expf(alpha * (fast_logf(term1) - offset)) - : static_cast(0.0); - auto const sumAf = sample_value * (f * inverr2); - auto const sumff = f * (f * inverr2); + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int ch = gtx / nsamples; + const int sample = threadIdx.x % nsamples; + const auto* dids = ch >= offsetForInputs ? dids_ee : dids_eb; + const int inputCh = ch >= offsetForInputs ? 
ch - offsetForInputs : ch; + + // configure shared mem + // per block, we need #threads per block * 2 * sizeof(ScalarType) + // we run with N channels per block + extern __shared__ char smem[]; + ScalarType* shr_sumAf = reinterpret_cast(smem); + ScalarType* shr_sumff = shr_sumAf + blockDim.x; + + if (ch >= nchannels) + return; + + auto state = g_state[ch]; + const auto did = DetId{dids[inputCh]}; + const auto* amplitudeFitParameters = + did.subdetId() == EcalBarrel ? amplitudeFitParametersEB : amplitudeFitParametersEE; + + // TODO is that better than storing into global and launching another kernel + // for the first 10 threads + if (state == TimeComputationState::NotFinished) { + const auto alpha = amplitudeFitParameters[0]; + const auto beta = amplitudeFitParameters[1]; + const auto alphabeta = alpha * beta; + const auto invalphabeta = 1.0 / alphabeta; + const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; + const auto sample_value = sample_values[gtx]; + const auto sample_value_error = sample_value_errors[gtx]; + const auto inverr2 = + useless_samples[gtx] ? static_cast(0) : 1.0 / (sample_value_error * sample_value_error); + const auto offset = (static_cast(sample) - tMaxAlphaBeta) * invalphabeta; + const auto term1 = 1.0 + offset; + const auto f = term1 > 1e-6 ? 
fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast(0.0); + const auto sumAf = sample_value * (f * inverr2); + const auto sumff = f * (f * inverr2); // store into shared mem shr_sumAf[threadIdx.x] = sumAf; shr_sumff[threadIdx.x] = sumff; - } else { + } else { shr_sumAf[threadIdx.x] = 0; shr_sumff[threadIdx.x] = 0; - } - __syncthreads(); - - // reduce - // unroll completely here (but hardcoded) - if (sample<5) { - shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+5]; - shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+5]; - } - __syncthreads(); - - if (sample<2) { + } + __syncthreads(); + + // reduce + // unroll completely here (but hardcoded) + if (sample < 5) { + shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 5]; + shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 5]; + } + __syncthreads(); + + if (sample < 2) { // will need to subtract for ltx = 3, we double count here - shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x+2] - + shr_sumAf[threadIdx.x+3]; - shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x+2] - + shr_sumff[threadIdx.x+3]; - } - __syncthreads(); + shr_sumAf[threadIdx.x] += shr_sumAf[threadIdx.x + 2] + shr_sumAf[threadIdx.x + 3]; + shr_sumff[threadIdx.x] += shr_sumff[threadIdx.x + 2] + shr_sumff[threadIdx.x + 3]; + } + __syncthreads(); - if (sample==0) { + if (sample == 0) { // exit if the state is done // note, we do not exit before all __synchtreads are finished if (state == TimeComputationState::Finished) { - g_timeMax[ch] = 5; - g_timeError[ch] = -999; - return; + g_timeMax[ch] = 5; + g_timeError[ch] = -999; + return; } // subtract to avoid double counting - auto const sumff = shr_sumff[threadIdx.x] - + shr_sumff[threadIdx.x+1] - - shr_sumff[threadIdx.x+3]; - auto const sumAf = shr_sumAf[threadIdx.x] - + shr_sumAf[threadIdx.x+1] - - shr_sumAf[threadIdx.x+3]; - - auto const ampMaxAlphaBeta = sumff>0 ? 
sumAf / sumff : 0; - auto const sumAA = sumAAsNullHypot[ch]; - auto const sum0 = sum0sNullHypot[ch]; - auto const nullChi2 = chi2sNullHypot[ch]; + const auto sumff = shr_sumff[threadIdx.x] + shr_sumff[threadIdx.x + 1] - shr_sumff[threadIdx.x + 3]; + const auto sumAf = shr_sumAf[threadIdx.x] + shr_sumAf[threadIdx.x + 1] - shr_sumAf[threadIdx.x + 3]; + + const auto ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0; + const auto sumAA = sumAAsNullHypot[ch]; + const auto sum0 = sum0sNullHypot[ch]; + const auto nullChi2 = chi2sNullHypot[ch]; if (sumff > 0) { - auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0; - if (chi2AlphaBeta > nullChi2) { - // null hypothesis is better - state = TimeComputationState::Finished; + const auto chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0; + if (chi2AlphaBeta > nullChi2) { + // null hypothesis is better + state = TimeComputationState::Finished; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n", - ch, chi2AlphaBeta, nullChi2, sumAA, sumAf, sumff, sum0); + printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n", + ch, + chi2AlphaBeta, + nullChi2, + sumAA, + sumAf, + sumff, + sum0); #endif - } + } - // store to global - g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta; + // store to global + g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta; } else { #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", - ch, sum0, sumAA, sumff, sumAf); + printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf); #endif - state = TimeComputationState::Finished; + state = TimeComputationState::Finished; } // store the state to global and finish calcs g_state[ch] = state; if (state == TimeComputationState::Finished) { - // store default values into global - g_timeMax[ch] = 5; - g_timeError[ch] = -999; + // store default values into global + 
g_timeMax[ch] = 5; + g_timeError[ch] = -999; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d finished state\n", ch); + printf("ch = %d finished state\n", ch); #endif - return; + return; } - auto const ampMaxError = g_ampMaxError[ch]; - auto const test_ratio = ampMaxAlphaBeta / ampMaxError; - auto const accTimeMax = g_accTimeMax[ch]; - auto const accTimeWgt = g_accTimeWgt[ch]; - auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; - auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch]; + const auto ampMaxError = g_ampMaxError[ch]; + const auto test_ratio = ampMaxAlphaBeta / ampMaxError; + const auto accTimeMax = g_accTimeMax[ch]; + const auto accTimeWgt = g_accTimeWgt[ch]; + const auto tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; + const auto tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch]; // branch to separate large vs small pulses // see cpu version for more info - if (test_ratio > 5.0 && accTimeWgt>0) { - auto const tMaxRatio = accTimeWgt>0 - ? accTimeMax / accTimeWgt - : static_cast(0); - auto const tMaxErrorRatio = accTimeWgt>0 - ? 1.0 / std::sqrt(accTimeWgt) - : static_cast(0); - - if (test_ratio > 10.0) { - g_timeMax[ch] = tMaxRatio; - g_timeError[ch] = tMaxErrorRatio; - -#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", - ch, tMaxRatio, tMaxErrorRatio); -#endif - } else { - auto const timeMax = - (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + - tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0; - auto const timeError = - (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + - tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / 5.0; - state = TimeComputationState::Finished; - g_state[ch] = state; - g_timeMax[ch] = timeMax; - g_timeError[ch] = timeError; + if (test_ratio > 5.0 && accTimeWgt > 0) { + const auto tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast(0); + const auto tMaxErrorRatio = accTimeWgt > 0 ? 
1.0 / std::sqrt(accTimeWgt) : static_cast(0); + + if (test_ratio > 10.0) { + g_timeMax[ch] = tMaxRatio; + g_timeError[ch] = tMaxErrorRatio; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d timeMax = %f timeError = %f\n", - ch, timeMax, timeError); + printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio); #endif - } - } - else { + } else { + const auto timeMax = (tMaxAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + + tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / + 5.0; + const auto timeError = (tMaxErrorAlphaBeta * (10.0 - ampMaxAlphaBeta / ampMaxError) + + tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.0)) / + 5.0; state = TimeComputationState::Finished; g_state[ch] = state; - g_timeMax[ch] = tMaxAlphaBeta; - g_timeError[ch] = tMaxErrorAlphaBeta; + g_timeMax[ch] = timeMax; + g_timeError[ch] = timeError; + +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError); +#endif + } + } else { + state = TimeComputationState::Finished; + g_state[ch] = state; + g_timeMax[ch] = tMaxAlphaBeta; + g_timeError[ch] = tMaxErrorAlphaBeta; #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH - printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", - ch, tMaxAlphaBeta, tMaxErrorAlphaBeta); + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta); #endif } + } } -} - -__global__ -void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb, - uint16_t const* digis_ee, - SampleVector::Scalar* sample_values, - SampleVector::Scalar* sample_value_errors, - bool* useless_sample_values, - unsigned int const sample_mask, - int const nchannels, - uint32_t const offsetForInputs) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x * blockDim.x; - int const ch = gtx / nsamples; - int const sample = threadIdx.x % nsamples; - int 
const inputCh = ch >= offsetForInputs - ? ch - offsetForInputs - : ch; - int const inputGtx = ch >= offsetForInputs - ? gtx - offsetForInputs*nsamples - : gtx; - auto const* digis = ch >= offsetForInputs - ? digis_ee - : digis_eb; - - // remove thread for sample 0, oversubscribing is easier than .... - if (ch >= nchannels || sample==0) return; - - if (!use_sample(sample_mask, sample)) return; - - auto const gainIdPrev = ecal::mgpa::gainId(digis[inputGtx-1]); - auto const gainIdNext = ecal::mgpa::gainId(digis[inputGtx]); - if (gainIdPrev>=1 && gainIdPrev<=3 && - gainIdNext>=1 && gainIdNext<=3 && gainIdPrev < gainIdNext) { - sample_values[gtx-1] = 0; - sample_value_errors[gtx-1] = 1e+9; - useless_sample_values[gtx-1] = true; - } -} - -__global__ -void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, - SampleVector::Scalar const* sample_value_errors, - uint32_t const* dids, - bool const* useless_samples, - SampleVector::Scalar const* g_timeMax, - SampleVector::Scalar const* amplitudeFitParametersEB, - SampleVector::Scalar const* amplitudeFitParametersEE, - SampleVector::Scalar *g_amplitudeMax, - int const nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr ScalarType corr4 = 1.; - constexpr ScalarType corr6 = 1.; - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x * blockDim.x; - int const ch = gtx / nsamples; - int const sample = threadIdx.x % nsamples; - - if (ch >= nchannels) return; - - auto const did = DetId{dids[ch]}; - auto const* amplitudeFitParameters = did.subdetId() == EcalBarrel - ? 
amplitudeFitParametersEB - : amplitudeFitParametersEE; - - // configure shared mem - extern __shared__ char smem[]; - ScalarType* shr_sum1 = reinterpret_cast(smem); - auto *shr_sumA = shr_sum1 + blockDim.x; - auto *shr_sumF = shr_sumA + blockDim.x; - auto *shr_sumAF = shr_sumF + blockDim.x; - auto *shr_sumFF = shr_sumAF + blockDim.x; - - auto const alpha = amplitudeFitParameters[0]; - auto const beta = amplitudeFitParameters[1]; - auto const timeMax = g_timeMax[ch]; - auto const pedestalLimit = timeMax - (alpha * beta) - 1.0; - auto const sample_value = sample_values[gtx]; - auto const sample_value_error = sample_value_errors[gtx]; - auto const inverr2 = sample_value_error > 0 - ? 1. / (sample_value_error * sample_value_error) - : static_cast(0); - auto const termOne = 1 + (sample - timeMax) / (alpha * beta); - auto const f = termOne > 1.e-5 - ? fast_expf(alpha * fast_logf(termOne) - - (sample - timeMax) / beta) - : static_cast(0.); - - bool const cond = ((sample < pedestalLimit) || - (f>0.6*corr6 && sample<=timeMax) || - (f>0.4*corr4 && sample>=timeMax)) && !useless_samples[gtx]; - - // store into shared mem - shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast(0); - shr_sumA[threadIdx.x] = cond - ? sample_value * inverr2 - : static_cast(0); - shr_sumF[threadIdx.x] = cond - ? f * inverr2 - : static_cast(0); - shr_sumAF[threadIdx.x] = cond - ? (f*inverr2)*sample_value - : static_cast(0); - shr_sumFF[threadIdx.x] = cond - ? 
f*(f*inverr2) - : static_cast(0); - - // reduction - if (sample <= 4) { - shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+5]; - shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+5]; - shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+5]; - shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+5]; - shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+5]; + + __global__ void kernel_time_compute_fixMGPAslew(uint16_t const* digis_eb, + uint16_t const* digis_ee, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + bool* useless_sample_values, + unsigned const int sample_mask, + const int nchannels, + uint32_t const offsetForInputs) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int ch = gtx / nsamples; + const int sample = threadIdx.x % nsamples; + const int inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx; + const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb; + + // remove thread for sample 0, oversubscribing is easier than .... 
+ if (ch >= nchannels || sample == 0) + return; + + if (!use_sample(sample_mask, sample)) + return; + + const auto gainIdPrev = ecal::mgpa::gainId(digis[inputGtx - 1]); + const auto gainIdNext = ecal::mgpa::gainId(digis[inputGtx]); + if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) { + sample_values[gtx - 1] = 0; + sample_value_errors[gtx - 1] = 1e+9; + useless_sample_values[gtx - 1] = true; + } } - __syncthreads(); - if (sample < 2) { + __global__ void kernel_time_compute_ampl(SampleVector::Scalar const* sample_values, + SampleVector::Scalar const* sample_value_errors, + uint32_t const* dids, + bool const* useless_samples, + SampleVector::Scalar const* g_timeMax, + SampleVector::Scalar const* amplitudeFitParametersEB, + SampleVector::Scalar const* amplitudeFitParametersEE, + SampleVector::Scalar* g_amplitudeMax, + const int nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr ScalarType corr4 = 1.; + constexpr ScalarType corr6 = 1.; + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int ch = gtx / nsamples; + const int sample = threadIdx.x % nsamples; + + if (ch >= nchannels) + return; + + const auto did = DetId{dids[ch]}; + const auto* amplitudeFitParameters = + did.subdetId() == EcalBarrel ? 
amplitudeFitParametersEB : amplitudeFitParametersEE; + + // configure shared mem + extern __shared__ char smem[]; + ScalarType* shr_sum1 = reinterpret_cast(smem); + auto* shr_sumA = shr_sum1 + blockDim.x; + auto* shr_sumF = shr_sumA + blockDim.x; + auto* shr_sumAF = shr_sumF + blockDim.x; + auto* shr_sumFF = shr_sumAF + blockDim.x; + + const auto alpha = amplitudeFitParameters[0]; + const auto beta = amplitudeFitParameters[1]; + const auto timeMax = g_timeMax[ch]; + const auto pedestalLimit = timeMax - (alpha * beta) - 1.0; + const auto sample_value = sample_values[gtx]; + const auto sample_value_error = sample_value_errors[gtx]; + const auto inverr2 = + sample_value_error > 0 ? 1. / (sample_value_error * sample_value_error) : static_cast(0); + const auto termOne = 1 + (sample - timeMax) / (alpha * beta); + const auto f = termOne > 1.e-5 ? fast_expf(alpha * fast_logf(termOne) - (sample - timeMax) / beta) + : static_cast(0.); + + bool const cond = ((sample < pedestalLimit) || (f > 0.6 * corr6 && sample <= timeMax) || + (f > 0.4 * corr4 && sample >= timeMax)) && + !useless_samples[gtx]; + + // store into shared mem + shr_sum1[threadIdx.x] = cond ? inverr2 : static_cast(0); + shr_sumA[threadIdx.x] = cond ? sample_value * inverr2 : static_cast(0); + shr_sumF[threadIdx.x] = cond ? f * inverr2 : static_cast(0); + shr_sumAF[threadIdx.x] = cond ? (f * inverr2) * sample_value : static_cast(0); + shr_sumFF[threadIdx.x] = cond ? 
f * (f * inverr2) : static_cast(0); + + // reduction + if (sample <= 4) { + shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 5]; + shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 5]; + shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 5]; + shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 5]; + shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 5]; + } + __syncthreads(); + + if (sample < 2) { // note: we double count sample 3 - shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x+2] + shr_sum1[threadIdx.x+3]; - shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x+2] + shr_sumA[threadIdx.x+3]; - shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x+2] + shr_sumF[threadIdx.x+3]; - shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x+2] - + shr_sumAF[threadIdx.x+3]; - shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x+2] - + shr_sumFF[threadIdx.x+3]; - } - __syncthreads(); - - if (sample == 0) { - auto const sum1 = shr_sum1[threadIdx.x] - + shr_sum1[threadIdx.x+1] - shr_sum1[threadIdx.x+3]; - auto const sumA = shr_sumA[threadIdx.x] - + shr_sumA[threadIdx.x+1] - shr_sumA[threadIdx.x+3]; - auto const sumF = shr_sumF[threadIdx.x] - + shr_sumF[threadIdx.x+1] - shr_sumF[threadIdx.x+3]; - auto const sumAF = shr_sumAF[threadIdx.x] - + shr_sumAF[threadIdx.x+1] - shr_sumAF[threadIdx.x+3]; - auto const sumFF = shr_sumFF[threadIdx.x] - + shr_sumFF[threadIdx.x+1] - shr_sumFF[threadIdx.x+3]; - - auto const denom = sumFF * sum1 - sumF*sumF; - auto const condForDenom = sum1 > 0 && ecal::abs(denom)>1.e-20; - auto const amplitudeMax = condForDenom - ? 
(sumAF * sum1 - sumA * sumF) / denom - : static_cast(0.); + shr_sum1[threadIdx.x] += shr_sum1[threadIdx.x + 2] + shr_sum1[threadIdx.x + 3]; + shr_sumA[threadIdx.x] += shr_sumA[threadIdx.x + 2] + shr_sumA[threadIdx.x + 3]; + shr_sumF[threadIdx.x] += shr_sumF[threadIdx.x + 2] + shr_sumF[threadIdx.x + 3]; + shr_sumAF[threadIdx.x] += shr_sumAF[threadIdx.x + 2] + shr_sumAF[threadIdx.x + 3]; + shr_sumFF[threadIdx.x] += shr_sumFF[threadIdx.x + 2] + shr_sumFF[threadIdx.x + 3]; + } + __syncthreads(); + + if (sample == 0) { + const auto sum1 = shr_sum1[threadIdx.x] + shr_sum1[threadIdx.x + 1] - shr_sum1[threadIdx.x + 3]; + const auto sumA = shr_sumA[threadIdx.x] + shr_sumA[threadIdx.x + 1] - shr_sumA[threadIdx.x + 3]; + const auto sumF = shr_sumF[threadIdx.x] + shr_sumF[threadIdx.x + 1] - shr_sumF[threadIdx.x + 3]; + const auto sumAF = shr_sumAF[threadIdx.x] + shr_sumAF[threadIdx.x + 1] - shr_sumAF[threadIdx.x + 3]; + const auto sumFF = shr_sumFF[threadIdx.x] + shr_sumFF[threadIdx.x + 1] - shr_sumFF[threadIdx.x + 3]; + + const auto denom = sumFF * sum1 - sumF * sumF; + const auto condForDenom = sum1 > 0 && ecal::abs(denom) > 1.e-20; + const auto amplitudeMax = condForDenom ? 
(sumAF * sum1 - sumA * sumF) / denom : static_cast(0.); // store into global mem g_amplitudeMax[ch] = amplitudeMax; + } } -} - -//#define ECAL_RECO_CUDA_TC_INIT_DEBUG -__global__ -void kernel_time_computation_init(uint16_t const* digis_eb, - uint32_t const* dids_eb, - uint16_t const* digis_ee, - uint32_t const* dids_ee, - float const* rms_x12, - float const* rms_x6, - float const* rms_x1, - float const* mean_x12, - float const* mean_x6, - float const* mean_x1, - float const* gain12Over6, - float const* gain6Over1, - SampleVector::Scalar* sample_values, - SampleVector::Scalar* sample_value_errors, - SampleVector::Scalar* ampMaxError, - bool* useless_sample_values, - char* pedestal_nums, - uint32_t const offsetForHashes, - uint32_t const offsetForInputs, - unsigned int const sample_maskEB, - unsigned int const sample_maskEE, - int nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const tx = threadIdx.x + blockDim.x*blockIdx.x; - int const ch = tx/nsamples; - int const inputTx = ch >= offsetForInputs - ? tx - offsetForInputs*nsamples - : tx; - int const inputCh = ch >= offsetForInputs - ? ch - offsetForInputs - : ch; - auto const* digis = ch >= offsetForInputs - ? digis_ee - : digis_eb; - auto const* dids = ch >= offsetForInputs - ? 
dids_ee - : dids_eb; - - if (ch < nchannels) { + + //#define ECAL_RECO_CUDA_TC_INIT_DEBUG + __global__ void kernel_time_computation_init(uint16_t const* digis_eb, + uint32_t const* dids_eb, + uint16_t const* digis_ee, + uint32_t const* dids_ee, + float const* rms_x12, + float const* rms_x6, + float const* rms_x1, + float const* mean_x12, + float const* mean_x6, + float const* mean_x1, + float const* gain12Over6, + float const* gain6Over1, + SampleVector::Scalar* sample_values, + SampleVector::Scalar* sample_value_errors, + SampleVector::Scalar* ampMaxError, + bool* useless_sample_values, + char* pedestal_nums, + uint32_t const offsetForHashes, + uint32_t const offsetForInputs, + unsigned const int sample_maskEB, + unsigned const int sample_maskEE, + int nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int tx = threadIdx.x + blockDim.x * blockIdx.x; + const int ch = tx / nsamples; + const int inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx; + const int inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + const auto* digis = ch >= offsetForInputs ? digis_ee : digis_eb; + const auto* dids = ch >= offsetForInputs ? 
dids_ee : dids_eb; + + if (ch < nchannels) { // indices/inits - int const sample = tx % nsamples; - int const ch_start = ch*nsamples; - int const input_ch_start = inputCh*nsamples; + const int sample = tx % nsamples; + const int input_ch_start = inputCh * nsamples; SampleVector::Scalar pedestal = 0.; int num = 0; // configure shared mem extern __shared__ char smem[]; - ScalarType* shrSampleValues = - reinterpret_cast(smem); + ScalarType* shrSampleValues = reinterpret_cast(smem); ScalarType* shrSampleValueErrors = shrSampleValues + blockDim.x; // 0 and 1 sample values - auto const adc0 = ecal::mgpa::adc(digis[input_ch_start]); - auto const gainId0 = ecal::mgpa::gainId(digis[input_ch_start]); - auto const adc1 = ecal::mgpa::adc(digis[input_ch_start+1]); - auto const gainId1 = ecal::mgpa::gainId(digis[input_ch_start+1]); - auto const did = DetId{dids[inputCh]}; - auto const isBarrel = did.subdetId() == EcalBarrel; - auto const sample_mask = did.subdetId() == EcalBarrel - ? sample_maskEB - : sample_maskEE; - auto const hashedId = isBarrel - ? hashedIndexEB(did.rawId()) - : offsetForHashes + hashedIndexEE(did.rawId()); + const auto adc0 = ecal::mgpa::adc(digis[input_ch_start]); + const auto gainId0 = ecal::mgpa::gainId(digis[input_ch_start]); + const auto adc1 = ecal::mgpa::adc(digis[input_ch_start + 1]); + const auto gainId1 = ecal::mgpa::gainId(digis[input_ch_start + 1]); + const auto did = DetId{dids[inputCh]}; + const auto isBarrel = did.subdetId() == EcalBarrel; + const auto sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE; + const auto hashedId = isBarrel ? 
hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); // set pedestal // TODO this branch is non-divergent for a group of 10 threads if (gainId0 == 1 && use_sample(sample_mask, 0)) { - pedestal = static_cast(adc0); - num=1; - - auto const diff = adc1 - adc0; - if (gainId1 == 1 && use_sample(sample_mask, 1) - && std::abs(diff) < 3*rms_x12[hashedId]) { - pedestal = - (pedestal + static_cast(adc1)) / 2.0; - num=2; - } + pedestal = static_cast(adc0); + num = 1; + + const auto diff = adc1 - adc0; + if (gainId1 == 1 && use_sample(sample_mask, 1) && std::abs(diff) < 3 * rms_x12[hashedId]) { + pedestal = (pedestal + static_cast(adc1)) / 2.0; + num = 2; + } } else { - pedestal = mean_x12[ch]; + pedestal = mean_x12[ch]; } // ped subtracted and gain-renormalized samples. - auto const gainId = ecal::mgpa::gainId(digis[inputTx]); - auto const adc = ecal::mgpa::adc(digis[inputTx]); + const auto gainId = ecal::mgpa::gainId(digis[inputTx]); + const auto adc = ecal::mgpa::adc(digis[inputTx]); bool bad = false; SampleVector::Scalar sample_value, sample_value_error; @@ -991,29 +892,27 @@ void kernel_time_computation_init(uint16_t const* digis_eb, // TODO: piece below is general both for amplitudes and timing // potentially there is a way to reduce the amount of code... 
if (!use_sample(sample_mask, sample)) { - bad = true; - sample_value = 0; - sample_value_error = 0; + bad = true; + sample_value = 0; + sample_value_error = 0; } else if (gainId == 1) { - sample_value = static_cast(adc) - pedestal; - sample_value_error = rms_x12[hashedId]; + sample_value = static_cast(adc) - pedestal; + sample_value_error = rms_x12[hashedId]; } else if (gainId == 2) { - sample_value = (static_cast(adc) - - mean_x6[hashedId]) * gain12Over6[hashedId]; - sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId]; + sample_value = (static_cast(adc) - mean_x6[hashedId]) * gain12Over6[hashedId]; + sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId]; } else if (gainId == 3) { - sample_value = (static_cast(adc) - - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId]; - sample_value_error = rms_x1[hashedId] - * gain6Over1[hashedId] * gain12Over6[hashedId]; + sample_value = (static_cast(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] * + gain12Over6[hashedId]; + sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId]; } else { - sample_value = 0; - sample_value_error = 0; - bad = true; + sample_value = 0; + sample_value_error = 0; + bad = true; } // TODO: make sure we save things correctly when sample is useless - auto const useless_sample = (sample_value_error <= 0) | bad; + const auto useless_sample = (sample_value_error <= 0) | bad; useless_sample_values[tx] = useless_sample; sample_values[tx] = sample_value; sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error; @@ -1021,85 +920,73 @@ void kernel_time_computation_init(uint16_t const* digis_eb, // DEBUG #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG if (ch == 0) { - printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n", - sample, sample_value, sample_value_error, - useless_sample ? 
'1' : '0'); + printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n", + sample, + sample_value, + sample_value_error, + useless_sample ? '1' : '0'); } #endif // store into the shared mem - shrSampleValues[threadIdx.x] = sample_value_error > 0 - ? sample_value - : std::numeric_limits::min(); + shrSampleValues[threadIdx.x] = sample_value_error > 0 ? sample_value : std::numeric_limits::min(); shrSampleValueErrors[threadIdx.x] = sample_value_error; __syncthreads(); // perform the reduction with min if (sample < 5) { - // note, if equal -> we keep the value with lower sample as for cpu - shrSampleValueErrors[threadIdx.x] = - shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+5] - ? shrSampleValueErrors[threadIdx.x+5] - : shrSampleValueErrors[threadIdx.x]; - shrSampleValues[threadIdx.x] = - std::max(shrSampleValues[threadIdx.x], - shrSampleValues[threadIdx.x+5]); + // note, if equal -> we keep the value with lower sample as for cpu + shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 5] + ? shrSampleValueErrors[threadIdx.x + 5] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 5]); } __syncthreads(); // a bit of an overkill, but easier than to compare across 3 values - if (sample<3) { - shrSampleValueErrors[threadIdx.x] = - shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+3] - ? shrSampleValueErrors[threadIdx.x+3] - : shrSampleValueErrors[threadIdx.x]; - shrSampleValues[threadIdx.x] = - std::max(shrSampleValues[threadIdx.x], - shrSampleValues[threadIdx.x+3]); + if (sample < 3) { + shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 3] + ? 
shrSampleValueErrors[threadIdx.x + 3] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 3]); } __syncthreads(); if (sample < 2) { - shrSampleValueErrors[threadIdx.x] = - shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+2] - ? shrSampleValueErrors[threadIdx.x+2] - : shrSampleValueErrors[threadIdx.x]; - shrSampleValues[threadIdx.x] = - std::max(shrSampleValues[threadIdx.x], - shrSampleValues[threadIdx.x+2]); + shrSampleValueErrors[threadIdx.x] = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 2] + ? shrSampleValueErrors[threadIdx.x + 2] + : shrSampleValueErrors[threadIdx.x]; + shrSampleValues[threadIdx.x] = std::max(shrSampleValues[threadIdx.x], shrSampleValues[threadIdx.x + 2]); } __syncthreads(); - + if (sample == 0) { - // we only needd the max error - auto const maxSampleValueError = - shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x+1] - ? shrSampleValueErrors[threadIdx.x+1] - : shrSampleValueErrors[threadIdx.x]; - - // # pedestal samples used - pedestal_nums[ch] = num; - // this is used downstream - ampMaxError[ch] = maxSampleValueError; - - // DEBUG + // we only needd the max error + const auto maxSampleValueError = shrSampleValues[threadIdx.x] < shrSampleValues[threadIdx.x + 1] + ? 
shrSampleValueErrors[threadIdx.x + 1] + : shrSampleValueErrors[threadIdx.x]; + + // # pedestal samples used + pedestal_nums[ch] = num; + // this is used downstream + ampMaxError[ch] = maxSampleValueError; + + // DEBUG #ifdef ECAL_RECO_CUDA_TC_INIT_DEBUG - if (ch == 0) { - printf("pedestal_nums = %d ampMaxError = %f\n", - num, maxSampleValueError); - } + if (ch == 0) { + printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError); + } #endif } + } } -} - -/// -/// launch context parameters: 1 thread per channel -/// -//#define DEBUG_TIME_CORRECTION -__global__ -void kernel_time_correction_and_finalize( -// SampleVector::Scalar const* g_amplitude, + + /// + /// launch context parameters: 1 thread per channel + /// + //#define DEBUG_TIME_CORRECTION + __global__ void kernel_time_correction_and_finalize( + // SampleVector::Scalar const* g_amplitude, ::ecal::reco::StorageScalarType const* g_amplitude, uint16_t const* digis_eb, uint32_t const* dids_eb, @@ -1113,11 +1000,11 @@ void kernel_time_correction_and_finalize( SampleVector::Scalar const* g_timeError, float const* g_rms_x12, float const* timeCalibConstant, - float *g_jitter, - float *g_jitterError, - uint32_t *flags, - int const amplitudeBinsSizeEB, - int const amplitudeBinsSizeEE, + float* g_jitter, + float* g_jitterError, + uint32_t* flags, + const int amplitudeBinsSizeEB, + const int amplitudeBinsSizeEE, ConfigurationParameters::type const timeConstantTermEB, ConfigurationParameters::type const timeConstantTermEE, float const offsetTimeValueEB, @@ -1136,136 +1023,108 @@ void kernel_time_correction_and_finalize( ConfigurationParameters::type const outOfTimeThreshG61mEE, uint32_t const offsetForHashes, uint32_t const offsetForInputs, - int const nchannels) { - using ScalarType = SampleVector::Scalar; - - // constants - constexpr int nsamples = EcalDataFrame::MAXSAMPLES; - - // indices - int const gtx = threadIdx.x + blockIdx.x * blockDim.x; - int const inputGtx = gtx >= offsetForInputs - ? 
gtx - offsetForInputs - : gtx; - auto const* dids = gtx >= offsetForInputs - ? dids_ee - : dids_eb; - auto const& digis = gtx >= offsetForInputs - ? digis_ee - : digis_eb; - - // filter out outside of range threads - if (gtx >= nchannels) return; - - auto const did = DetId{dids[inputGtx]}; - auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel - ? hashedIndexEB(did.rawId()) - : offsetForHashes + hashedIndexEE(did.rawId()); - auto const* amplitudeBins = isBarrel - ? amplitudeBinsEB - : amplitudeBinsEE; - auto const* shiftBins = isBarrel - ? shiftBinsEB - : shiftBinsEE; - auto const amplitudeBinsSize = isBarrel - ? amplitudeBinsSizeEB - : amplitudeBinsSizeEE; - auto const timeConstantTerm = isBarrel - ? timeConstantTermEB - : timeConstantTermEE; - auto const timeNconst = isBarrel - ? timeNconstEB - : timeNconstEE; - auto const offsetTimeValue = isBarrel - ? offsetTimeValueEB - : offsetTimeValueEE; - auto const amplitudeThreshold = isBarrel - ? amplitudeThresholdEB - : amplitudeThresholdEE; - auto const outOfTimeThreshG12p = isBarrel - ? outOfTimeThreshG12pEB - : outOfTimeThreshG12pEE; - auto const outOfTimeThreshG12m = isBarrel - ? outOfTimeThreshG12mEB - : outOfTimeThreshG12mEE; - auto const outOfTimeThreshG61p = isBarrel - ? outOfTimeThreshG61pEB - : outOfTimeThreshG61pEE; - auto const outOfTimeThreshG61m = isBarrel - ? outOfTimeThreshG61mEB - : outOfTimeThreshG61mEE; - - // load some - auto const amplitude = g_amplitude[gtx]; - auto const rms_x12 = g_rms_x12[hashedId]; - auto const timeCalibConst = timeCalibConstant[hashedId]; - - int myBin = -1; - for (int bin=0; bin amplitudeBins[bin]) - myBin = bin; - else - break; - } - - ScalarType correction = 0; - if (myBin == -1) { + const int nchannels) { + using ScalarType = SampleVector::Scalar; + + // constants + constexpr int nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + const int gtx = threadIdx.x + blockIdx.x * blockDim.x; + const int inputGtx = gtx >= offsetForInputs ? 
gtx - offsetForInputs : gtx; + const auto* dids = gtx >= offsetForInputs ? dids_ee : dids_eb; + const auto& digis = gtx >= offsetForInputs ? digis_ee : digis_eb; + + // filter out outside of range threads + if (gtx >= nchannels) + return; + + const auto did = DetId{dids[inputGtx]}; + const auto isBarrel = did.subdetId() == EcalBarrel; + const auto hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + const auto* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE; + const auto* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE; + const auto amplitudeBinsSize = isBarrel ? amplitudeBinsSizeEB : amplitudeBinsSizeEE; + const auto timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE; + const auto timeNconst = isBarrel ? timeNconstEB : timeNconstEE; + const auto offsetTimeValue = isBarrel ? offsetTimeValueEB : offsetTimeValueEE; + const auto amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE; + const auto outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE; + const auto outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE; + const auto outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE; + const auto outOfTimeThreshG61m = isBarrel ? 
outOfTimeThreshG61mEB : outOfTimeThreshG61mEE; + + // load some + const auto amplitude = g_amplitude[gtx]; + const auto rms_x12 = g_rms_x12[hashedId]; + const auto timeCalibConst = timeCalibConstant[hashedId]; + + int myBin = -1; + for (int bin = 0; bin < amplitudeBinsSize; bin++) { + if (amplitude > amplitudeBins[bin]) + myBin = bin; + else + break; + } + + ScalarType correction = 0; + if (myBin == -1) { correction = shiftBins[0]; - } else if (myBin == amplitudeBinsSize-1) { + } else if (myBin == amplitudeBinsSize - 1) { correction = shiftBins[myBin]; - } else { - correction = shiftBins[myBin+1] - shiftBins[myBin]; - correction *= (amplitude - amplitudeBins[myBin]) / - (amplitudeBins[myBin+1] - amplitudeBins[myBin]); + } else { + correction = shiftBins[myBin + 1] - shiftBins[myBin]; + correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]); correction += shiftBins[myBin]; - } + } - // correction * 1./25. - correction = correction * 0.04; - auto const timeMax = g_timeMax[gtx]; - auto const timeError = g_timeError[gtx]; - auto const jitter = timeMax - 5 + correction; - auto const jitterError = std::sqrt(timeError*timeError + - timeConstantTerm*timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25. + // correction * 1./25. + correction = correction * 0.04; + const auto timeMax = g_timeMax[gtx]; + const auto timeError = g_timeError[gtx]; + const auto jitter = timeMax - 5 + correction; + const auto jitterError = + std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25. 
#ifdef DEBUG_TIME_CORRECTION -// if (gtx == 0) { - printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n", - gtx, timeMax, timeError, jitter, correction); + // if (gtx == 0) { + printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n", + gtx, + timeMax, + timeError, + jitter, + correction); // } #endif - // store back to global - g_jitter[gtx] = jitter; - g_jitterError[gtx] = jitterError; + // store back to global + g_jitter[gtx] = jitter; + g_jitterError[gtx] = jitterError; - // set the flag - // TODO: replace with something more efficient (if required), - // for now just to make it work - if (amplitude > amplitudeThreshold * rms_x12) { + // set the flag + // TODO: replace with something more efficient (if required), + // for now just to make it work + if (amplitude > amplitudeThreshold * rms_x12) { auto threshP = outOfTimeThreshG12p; auto threshM = outOfTimeThreshG12m; if (amplitude > 3000.) { - for (int isample=0; isample sigmat*threshP || - correctedTime < -sigmat*threshM) - flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime; + const auto correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue; + const auto nterm = timeNconst * rms_x12 / amplitude; + const auto sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm); + if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM) + flags[gtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime; + } } -} -}} + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu index f657981b95fa0..327d9b20445fa 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/inplace_fnnls.cu @@ -1,120 +1,120 @@ #include "inplace_fnnls.h" -namespace ecal { namespace multifit { - -using matrix_t = SampleMatrix; -using vector_t = SampleVector; - -__device__ -bool inplace_fnnls(matrix_t& AtA, - vector_t& Atb, - 
vector_t& x, - int& npassive, - BXVectorType& activeBXs, - PulseMatrixType& pulse_matrix, - const double eps, - const unsigned int max_iterations) { - vector_t s; - vector_t w; - -// main loop - Eigen::Index w_max_idx_prev = 0; - matrix_t::Scalar w_max_prev = 0; - double eps_to_use = eps; - - int iter = 0; - while (true) { - if (iter>0 || npassive==0) { - const auto nActive = vector_t::RowsAtCompileTime - npassive; - if(!nActive) - break; - - w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive); - - // get the index of w that gives the maximum gain - Eigen::Index w_max_idx; - const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx); - - // check for convergence - if (max_w < eps_to_use || (w_max_idx==w_max_idx_prev && max_w==w_max_prev)) - break; - - // worst case - if (iter >= 500) +namespace ecal { + namespace multifit { + + using matrix_t = SampleMatrix; + using vector_t = SampleVector; + + __device__ bool inplace_fnnls(matrix_t& AtA, + vector_t& Atb, + vector_t& x, + int& npassive, + BXVectorType& activeBXs, + PulseMatrixType& pulse_matrix, + const double eps, + const unsigned int max_iterations) { + vector_t s; + vector_t w; + + // main loop + Eigen::Index w_max_idx_prev = 0; + matrix_t::Scalar w_max_prev = 0; + double eps_to_use = eps; + + int iter = 0; + while (true) { + if (iter > 0 || npassive == 0) { + const auto nActive = vector_t::RowsAtCompileTime - npassive; + if (!nActive) break; - w_max_prev = max_w; - w_max_idx_prev = w_max_idx; + w.tail(nActive) = Atb.tail(nActive) - (AtA * x).tail(nActive); - // need to translate the index into the right part of the vector - w_max_idx += npassive; + // get the index of w that gives the maximum gain + Eigen::Index w_max_idx; + const auto max_w = w.tail(nActive).maxCoeff(&w_max_idx); - // swap AtA to avoid copy - AtA.col(npassive).swap(AtA.col(w_max_idx)); - AtA.row(npassive).swap(AtA.row(w_max_idx)); - // swap Atb to match with AtA - Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx)); - 
Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx)); - Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx)); - pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx)); + // check for convergence + if (max_w < eps_to_use || (w_max_idx == w_max_idx_prev && max_w == w_max_prev)) + break; - ++npassive; - } + // worst case + if (iter >= 500) + break; -// inner loop - while (true) { - if (npassive == 0) break; + w_max_prev = max_w; + w_max_idx_prev = w_max_idx; - s.head(npassive) = - AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive)); + // need to translate the index into the right part of the vector + w_max_idx += npassive; - // if all coefficients are positive, done for this iteration - if (s.head(npassive).minCoeff() > 0.) { - x.head(npassive) = s.head(npassive); - break; - } + // swap AtA to avoid copy + AtA.col(npassive).swap(AtA.col(w_max_idx)); + AtA.row(npassive).swap(AtA.row(w_max_idx)); + // swap Atb to match with AtA + Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(w_max_idx)); + Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(w_max_idx)); + Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(w_max_idx)); + pulse_matrix.col(npassive).swap(pulse_matrix.col(w_max_idx)); - auto alpha = std::numeric_limits::max(); - Eigen::Index alpha_idx = 0; + ++npassive; + } + + // inner loop + while (true) { + if (npassive == 0) + break; + + s.head(npassive) = AtA.topLeftCorner(npassive, npassive).llt().solve(Atb.head(npassive)); + + // if all coefficients are positive, done for this iteration + if (s.head(npassive).minCoeff() > 0.) { + x.head(npassive) = s.head(npassive); + break; + } + + auto alpha = std::numeric_limits::max(); + Eigen::Index alpha_idx = 0; #pragma unroll - for (auto i = 0; i < npassive; ++i) { - if (s[i] <= 0.) 
{ - auto const ratio = x[i] / (x[i] - s[i]); - if (ratio < alpha) { - alpha = ratio; - alpha_idx = i; + for (auto i = 0; i < npassive; ++i) { + if (s[i] <= 0.) { + auto const ratio = x[i] / (x[i] - s[i]); + if (ratio < alpha) { + alpha = ratio; + alpha_idx = i; + } + } } - } - } - /* + /* if (std::numeric_limits::max() == alpha) { x.head(npassive) = s.head(npassive); break; }*/ - x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive)); - x[alpha_idx] = 0; - --npassive; - - AtA.col(npassive).swap(AtA.col(alpha_idx)); - AtA.row(npassive).swap(AtA.row(alpha_idx)); - // swap Atb to match with AtA - Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx)); - Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx)); - Eigen::numext::swap(activeBXs.coeffRef(npassive), - activeBXs.coeffRef(alpha_idx)); - pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx)); - } + x.head(npassive) += alpha * (s.head(npassive) - x.head(npassive)); + x[alpha_idx] = 0; + --npassive; + + AtA.col(npassive).swap(AtA.col(alpha_idx)); + AtA.row(npassive).swap(AtA.row(alpha_idx)); + // swap Atb to match with AtA + Eigen::numext::swap(Atb.coeffRef(npassive), Atb.coeffRef(alpha_idx)); + Eigen::numext::swap(x.coeffRef(npassive), x.coeffRef(alpha_idx)); + Eigen::numext::swap(activeBXs.coeffRef(npassive), activeBXs.coeffRef(alpha_idx)); + pulse_matrix.col(npassive).swap(pulse_matrix.col(alpha_idx)); + } - // TODO as in cpu NNLS version - iter++; - if (iter % 16 == 0) - eps_to_use *= 2; - } - - return true; -} + // TODO as in cpu NNLS version + iter++; + if (iter % 16 == 0) + eps_to_use *= 2; + } + + return true; + } -}} + } // namespace multifit +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc index 9661f98139f7b..95ccee87c726a 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc +++ 
b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPUUncalibRecHitProducer.cc @@ -10,7 +10,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" // algorithm specific @@ -18,129 +18,106 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" -class EcalCPUUncalibRecHitProducer - : public edm::stream::EDProducer -{ +class EcalCPUUncalibRecHitProducer : public edm::stream::EDProducer { public: - explicit EcalCPUUncalibRecHitProducer(edm::ParameterSet const& ps); - ~EcalCPUUncalibRecHitProducer() override; - static void fillDescriptions(edm::ConfigurationDescriptions&); + explicit EcalCPUUncalibRecHitProducer(edm::ParameterSet const& ps); + ~EcalCPUUncalibRecHitProducer() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); private: - void acquire(edm::Event const&, - edm::EventSetup const&, - edm::WaitingTaskWithArenaHolder) override; - void produce(edm::Event&, edm::EventSetup const&) override; + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; private: - edm::EDGetTokenT>> - recHitsInEBToken_, recHitsInEEToken_; - edm::EDPutTokenT> - recHitsOutEBToken_, recHitsOutEEToken_; - - ecal::UncalibratedRecHit - recHitsEB_, recHitsEE_; - bool containsTimingInformation_; + edm::EDGetTokenT>> recHitsInEBToken_, recHitsInEEToken_; + edm::EDPutTokenT> recHitsOutEBToken_, recHitsOutEEToken_; + + ecal::UncalibratedRecHit recHitsEB_, recHitsEE_; + bool containsTimingInformation_; }; -void EcalCPUUncalibRecHitProducer::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; +void EcalCPUUncalibRecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + 
edm::ParameterSetDescription desc; - desc.add("recHitsInLabelEB", - edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"}); - desc.add("recHitsInLabelEE", - edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"}); - desc.add("recHitsOutLabelEB", "EcalUncalibRecHitsEB"); - desc.add("recHitsOutLabelEE", "EcalUncalibRecHitsEE"); - desc.add("containsTimingInformation", false); + desc.add("recHitsInLabelEB", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB"}); + desc.add("recHitsInLabelEE", edm::InputTag{"ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE"}); + desc.add("recHitsOutLabelEB", "EcalUncalibRecHitsEB"); + desc.add("recHitsOutLabelEE", "EcalUncalibRecHitsEE"); + desc.add("containsTimingInformation", false); - std::string label = "ecalCPUUncalibRecHitProducer"; - confDesc.add(label, desc); + std::string label = "ecalCPUUncalibRecHitProducer"; + confDesc.add(label, desc); } -EcalCPUUncalibRecHitProducer::EcalCPUUncalibRecHitProducer( - const edm::ParameterSet& ps) +EcalCPUUncalibRecHitProducer::EcalCPUUncalibRecHitProducer(const edm::ParameterSet& ps) : recHitsInEBToken_{consumes>>( - ps.getParameter("recHitsInLabelEB"))} - , recHitsInEEToken_{consumes>>( - ps.getParameter("recHitsInLabelEE"))} - , recHitsOutEBToken_{produces>( - ps.getParameter("recHitsOutLabelEB"))} - , recHitsOutEEToken_{produces>( - ps.getParameter("recHitsOutLabelEE"))} - , containsTimingInformation_{ps.getParameter("containsTimingInformation")} -{} + ps.getParameter("recHitsInLabelEB"))}, + recHitsInEEToken_{consumes>>( + ps.getParameter("recHitsInLabelEE"))}, + recHitsOutEBToken_{ + produces>(ps.getParameter("recHitsOutLabelEB"))}, + recHitsOutEEToken_{ + produces>(ps.getParameter("recHitsOutLabelEE"))}, + containsTimingInformation_{ps.getParameter("containsTimingInformation")} {} EcalCPUUncalibRecHitProducer::~EcalCPUUncalibRecHitProducer() {} -void EcalCPUUncalibRecHitProducer::acquire( - edm::Event const& event, - edm::EventSetup 
const& setup, - edm::WaitingTaskWithArenaHolder taskHolder) -{ - // retrieve data/ctx - auto const& ebRecHitsProduct = event.get(recHitsInEBToken_); - auto const& eeRecHitsProduct = event.get(recHitsInEEToken_); - cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)}; - auto const& ebRecHits = ctx.get(ebRecHitsProduct); - auto const& eeRecHits = ctx.get(eeRecHitsProduct); - - // resize the output buffers - recHitsEB_.resize(ebRecHits.size); - recHitsEE_.resize(eeRecHits.size); - - auto lambdaToTransfer = [&ctx](auto& dest, auto* src) { - using vector_type = typename std::remove_reference::type; - using type = typename vector_type::value_type; - cudaCheck(cudaMemcpyAsync(dest.data(), - src, - dest.size() * sizeof(type), - cudaMemcpyDeviceToHost, - ctx.stream())); - }; - - // enqeue transfers - lambdaToTransfer(recHitsEB_.did, ebRecHits.did); - lambdaToTransfer(recHitsEE_.did, eeRecHits.did); - - lambdaToTransfer(recHitsEB_.amplitudesAll, ebRecHits.amplitudesAll); - lambdaToTransfer(recHitsEE_.amplitudesAll, eeRecHits.amplitudesAll); - - lambdaToTransfer(recHitsEB_.amplitude, ebRecHits.amplitude); - lambdaToTransfer(recHitsEE_.amplitude, eeRecHits.amplitude); - - lambdaToTransfer(recHitsEB_.chi2, ebRecHits.chi2); - lambdaToTransfer(recHitsEE_.chi2, eeRecHits.chi2); - - lambdaToTransfer(recHitsEB_.pedestal, ebRecHits.pedestal); - lambdaToTransfer(recHitsEE_.pedestal, eeRecHits.pedestal); - - lambdaToTransfer(recHitsEB_.flags, ebRecHits.flags); - lambdaToTransfer(recHitsEE_.flags, eeRecHits.flags); - - if (containsTimingInformation_) { - lambdaToTransfer(recHitsEB_.jitter, ebRecHits.jitter); - lambdaToTransfer(recHitsEE_.jitter, eeRecHits.jitter); - - lambdaToTransfer(recHitsEB_.jitterError, ebRecHits.jitterError); - lambdaToTransfer(recHitsEE_.jitterError, eeRecHits.jitterError); - } +void EcalCPUUncalibRecHitProducer::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder taskHolder) { + // retrieve 
data/ctx + auto const& ebRecHitsProduct = event.get(recHitsInEBToken_); + auto const& eeRecHitsProduct = event.get(recHitsInEEToken_); + cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)}; + auto const& ebRecHits = ctx.get(ebRecHitsProduct); + auto const& eeRecHits = ctx.get(eeRecHitsProduct); + + // resize the output buffers + recHitsEB_.resize(ebRecHits.size); + recHitsEE_.resize(eeRecHits.size); + + auto lambdaToTransfer = [&ctx](auto& dest, auto* src) { + using vector_type = typename std::remove_reference::type; + using type = typename vector_type::value_type; + cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream())); + }; + + // enqeue transfers + lambdaToTransfer(recHitsEB_.did, ebRecHits.did); + lambdaToTransfer(recHitsEE_.did, eeRecHits.did); + + lambdaToTransfer(recHitsEB_.amplitudesAll, ebRecHits.amplitudesAll); + lambdaToTransfer(recHitsEE_.amplitudesAll, eeRecHits.amplitudesAll); + + lambdaToTransfer(recHitsEB_.amplitude, ebRecHits.amplitude); + lambdaToTransfer(recHitsEE_.amplitude, eeRecHits.amplitude); + + lambdaToTransfer(recHitsEB_.chi2, ebRecHits.chi2); + lambdaToTransfer(recHitsEE_.chi2, eeRecHits.chi2); + + lambdaToTransfer(recHitsEB_.pedestal, ebRecHits.pedestal); + lambdaToTransfer(recHitsEE_.pedestal, eeRecHits.pedestal); + + lambdaToTransfer(recHitsEB_.flags, ebRecHits.flags); + lambdaToTransfer(recHitsEE_.flags, eeRecHits.flags); + + if (containsTimingInformation_) { + lambdaToTransfer(recHitsEB_.jitter, ebRecHits.jitter); + lambdaToTransfer(recHitsEE_.jitter, eeRecHits.jitter); + + lambdaToTransfer(recHitsEB_.jitterError, ebRecHits.jitterError); + lambdaToTransfer(recHitsEE_.jitterError, eeRecHits.jitterError); + } } -void EcalCPUUncalibRecHitProducer::produce( - edm::Event& event, - edm::EventSetup const& setup) -{ - // tmp vectors - auto recHitsOutEB = std::make_unique>( - std::move(recHitsEB_)); - auto recHitsOutEE = std::make_unique>( - 
std::move(recHitsEE_)); - - // put into event - event.put(recHitsOutEBToken_, std::move(recHitsOutEB)); - event.put(recHitsOutEEToken_, std::move(recHitsOutEE)); +void EcalCPUUncalibRecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) { + // tmp vectors + auto recHitsOutEB = std::make_unique>(std::move(recHitsEB_)); + auto recHitsOutEE = std::make_unique>(std::move(recHitsEE_)); + + // put into event + event.put(recHitsOutEBToken_, std::move(recHitsOutEB)); + event.put(recHitsOutEEToken_, std::move(recHitsOutEE)); } DEFINE_FWK_MODULE(EcalCPUUncalibRecHitProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc index 24b782b7b434d..c851bf24c0e40 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc @@ -18,32 +18,19 @@ #include -using EcalPedestalsGPUESProducer = EcalESProducerGPU; -using EcalGainRatiosGPUESProducer = EcalESProducerGPU; -using EcalPulseShapesGPUESProducer = EcalESProducerGPU; -using EcalPulseCovariancesGPUESProducer = EcalESProducerGPU; -using EcalSamplesCorrelationGPUESProducer = EcalESProducerGPU< - EcalSamplesCorrelationGPU, - EcalSamplesCorrelation, - EcalSamplesCorrelationRcd>; +using EcalPedestalsGPUESProducer = EcalESProducerGPU; +using EcalGainRatiosGPUESProducer = EcalESProducerGPU; +using EcalPulseShapesGPUESProducer = EcalESProducerGPU; +using EcalPulseCovariancesGPUESProducer = + EcalESProducerGPU; +using EcalSamplesCorrelationGPUESProducer = + EcalESProducerGPU; -using EcalTimeBiasCorrectionsGPUESProducer = EcalESProducerGPU< - EcalTimeBiasCorrectionsGPU, - EcalTimeBiasCorrections, - EcalTimeBiasCorrectionsRcd>; +using EcalTimeBiasCorrectionsGPUESProducer = + EcalESProducerGPU; -using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU< - EcalTimeCalibConstantsGPU, - EcalTimeCalibConstants, - EcalTimeCalibConstantsRcd>; 
+using EcalTimeCalibConstantsGPUESProducer = + EcalESProducerGPU; DEFINE_FWK_EVENTSETUP_MODULE(EcalPedestalsGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalGainRatiosGPUESProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc index 916230516f070..20f51ea5245df 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitConvertGPU2CPUFormat.cc @@ -3,7 +3,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" // algorithm specific #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" @@ -14,103 +14,87 @@ #include -class EcalUncalibRecHitConvertGPU2CPUFormat - : public edm::stream::EDProducer<> -{ +class EcalUncalibRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> { public: - explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps); - ~EcalUncalibRecHitConvertGPU2CPUFormat() override; - static void fillDescriptions(edm::ConfigurationDescriptions&); + explicit EcalUncalibRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps); + ~EcalUncalibRecHitConvertGPU2CPUFormat() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); private: - using GPURecHitType = ecal::UncalibratedRecHit; - void produce(edm::Event&, edm::EventSetup const&) override; + using GPURecHitType = ecal::UncalibratedRecHit; + void produce(edm::Event&, edm::EventSetup const&) override; private: - const edm::EDGetTokenT recHitsGPUEB_; - const edm::EDGetTokenT recHitsGPUEE_; + const edm::EDGetTokenT recHitsGPUEB_; + const edm::EDGetTokenT recHitsGPUEE_; - const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_; + 
const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_; }; -void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; +void EcalUncalibRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; - desc.add("recHitsLabelGPUEB", - edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB")); - desc.add("recHitsLabelGPUEE", - edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE")); + desc.add("recHitsLabelGPUEB", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB")); + desc.add("recHitsLabelGPUEE", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE")); - desc.add("recHitsLabelCPUEB", "EcalUncalibRecHitsEB"); - desc.add("recHitsLabelCPUEE", "EcalUncalibRecHitsEE"); + desc.add("recHitsLabelCPUEB", "EcalUncalibRecHitsEB"); + desc.add("recHitsLabelCPUEE", "EcalUncalibRecHitsEE"); - std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat"; - confDesc.add(label, desc); + std::string label = "ecalUncalibRecHitConvertGPU2CPUFormat"; + confDesc.add(label, desc); } -EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat( - const edm::ParameterSet& ps) +EcalUncalibRecHitConvertGPU2CPUFormat::EcalUncalibRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps) : recHitsGPUEB_{consumes( - ps.getParameter("recHitsLabelGPUEB"))} - , recHitsGPUEE_{consumes( - ps.getParameter("recHitsLabelGPUEE"))} - , recHitsLabelCPUEB_{ps.getParameter("recHitsLabelCPUEB")} - , recHitsLabelCPUEE_{ps.getParameter("recHitsLabelCPUEE")} -{ - produces(recHitsLabelCPUEB_); - produces(recHitsLabelCPUEE_); + ps.getParameter("recHitsLabelGPUEB"))}, + recHitsGPUEE_{ + consumes(ps.getParameter("recHitsLabelGPUEE"))}, + recHitsLabelCPUEB_{ps.getParameter("recHitsLabelCPUEB")}, + recHitsLabelCPUEE_{ps.getParameter("recHitsLabelCPUEE")} { + produces(recHitsLabelCPUEB_); + 
produces(recHitsLabelCPUEE_); } EcalUncalibRecHitConvertGPU2CPUFormat::~EcalUncalibRecHitConvertGPU2CPUFormat() {} -void EcalUncalibRecHitConvertGPU2CPUFormat::produce( - edm::Event& event, - edm::EventSetup const& setup) -{ - edm::Handle hRecHitsGPUEB, hRecHitsGPUEE; - event.getByToken(recHitsGPUEB_, hRecHitsGPUEB); - event.getByToken(recHitsGPUEE_, hRecHitsGPUEE); - - auto recHitsCPUEB = std::make_unique(); - auto recHitsCPUEE = std::make_unique(); - recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size()); - recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size()); - - for (uint32_t i=0; iamplitude.size(); ++i) { - recHitsCPUEB->emplace_back( - DetId{hRecHitsGPUEB->did[i]}, - hRecHitsGPUEB->amplitude[i], - hRecHitsGPUEB->pedestal[i], - hRecHitsGPUEB->jitter[i], - hRecHitsGPUEB->chi2[i], - hRecHitsGPUEB->flags[i] - ); - (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]); - auto const offset = i * EcalDataFrame::MAXSAMPLES; - for (uint32_t sample=0; sampleamplitudesAll[offset + sample]); - } - - for (uint32_t i=0; iamplitude.size(); ++i) { - recHitsCPUEE->emplace_back( - DetId{hRecHitsGPUEE->did[i]}, - hRecHitsGPUEE->amplitude[i], - hRecHitsGPUEE->pedestal[i], - hRecHitsGPUEE->jitter[i], - hRecHitsGPUEE->chi2[i], - hRecHitsGPUEE->flags[i] - ); - (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]); - auto const offset = i * EcalDataFrame::MAXSAMPLES; - for (uint32_t sample=0; sampleamplitudesAll[offset + sample]); - } - - event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_); - event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_); +void EcalUncalibRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) { + edm::Handle hRecHitsGPUEB, hRecHitsGPUEE; + event.getByToken(recHitsGPUEB_, hRecHitsGPUEB); + event.getByToken(recHitsGPUEE_, hRecHitsGPUEE); + + auto recHitsCPUEB = std::make_unique(); + auto recHitsCPUEE = std::make_unique(); + recHitsCPUEB->reserve(hRecHitsGPUEB->amplitude.size()); + 
recHitsCPUEE->reserve(hRecHitsGPUEE->amplitude.size()); + + for (uint32_t i = 0; i < hRecHitsGPUEB->amplitude.size(); ++i) { + recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]}, + hRecHitsGPUEB->amplitude[i], + hRecHitsGPUEB->pedestal[i], + hRecHitsGPUEB->jitter[i], + hRecHitsGPUEB->chi2[i], + hRecHitsGPUEB->flags[i]); + (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->jitterError[i]); + auto const offset = i * EcalDataFrame::MAXSAMPLES; + for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample) + (*recHitsCPUEB)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEB->amplitudesAll[offset + sample]); + } + + for (uint32_t i = 0; i < hRecHitsGPUEE->amplitude.size(); ++i) { + recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]}, + hRecHitsGPUEE->amplitude[i], + hRecHitsGPUEE->pedestal[i], + hRecHitsGPUEE->jitter[i], + hRecHitsGPUEE->chi2[i], + hRecHitsGPUEE->flags[i]); + (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->jitterError[i]); + auto const offset = i * EcalDataFrame::MAXSAMPLES; + for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample) + (*recHitsCPUEE)[i].setOutOfTimeAmplitude(sample, hRecHitsGPUEE->amplitudesAll[offset + sample]); + } + + event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_); + event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_); } DEFINE_FWK_MODULE(EcalUncalibRecHitConvertGPU2CPUFormat); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc index a96b729223d01..d043d0f8e6e50 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitProducerGPU.cc @@ -8,7 +8,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" 
// algorithm specific #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" @@ -40,355 +40,323 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalUncalibRecHitMultiFitAlgo_gpu_new.h" -class EcalUncalibRecHitProducerGPU - : public edm::stream::EDProducer -{ +class EcalUncalibRecHitProducerGPU : public edm::stream::EDProducer { public: - explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps); - ~EcalUncalibRecHitProducerGPU() override; - static void fillDescriptions(edm::ConfigurationDescriptions&); + explicit EcalUncalibRecHitProducerGPU(edm::ParameterSet const& ps); + ~EcalUncalibRecHitProducerGPU() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); private: - using RecHitType = ecal::UncalibratedRecHit; - void acquire(edm::Event const&, - edm::EventSetup const&, - edm::WaitingTaskWithArenaHolder) override; - void produce(edm::Event&, edm::EventSetup const&) override; + using RecHitType = ecal::UncalibratedRecHit; + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; private: - edm::EDGetTokenT> digisTokenEB_, digisTokenEE_; - edm::EDPutTokenT>> - recHitsTokenEB_, recHitsTokenEE_; - - // conditions handles - edm::ESHandle pedestalsHandle_; - edm::ESHandle gainRatiosHandle_; - edm::ESHandle pulseShapesHandle_; - edm::ESHandle pulseCovariancesHandle_; - edm::ESHandle samplesCorrelationHandle_; - edm::ESHandle timeBiasCorrectionsHandle_; - edm::ESHandle timeCalibConstantsHandle_; - edm::ESHandle sampleMaskHandle_; - edm::ESHandle timeOffsetConstantHandle_; - - // configuration parameters - ecal::multifit::ConfigurationParameters configParameters_; - - // event data - ecal::multifit::EventOutputDataGPU eventOutputDataGPU_; - ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_; - bool shouldTransferToHost_{true}; - - 
cms::cuda::ContextState cudaState_; - - uint32_t maxNumberHits_; - uint32_t neb_, nee_; + edm::EDGetTokenT> digisTokenEB_, digisTokenEE_; + edm::EDPutTokenT>> recHitsTokenEB_, recHitsTokenEE_; + + // conditions handles + edm::ESHandle pedestalsHandle_; + edm::ESHandle gainRatiosHandle_; + edm::ESHandle pulseShapesHandle_; + edm::ESHandle pulseCovariancesHandle_; + edm::ESHandle samplesCorrelationHandle_; + edm::ESHandle timeBiasCorrectionsHandle_; + edm::ESHandle timeCalibConstantsHandle_; + edm::ESHandle sampleMaskHandle_; + edm::ESHandle timeOffsetConstantHandle_; + + // configuration parameters + ecal::multifit::ConfigurationParameters configParameters_; + + // event data + ecal::multifit::EventOutputDataGPU eventOutputDataGPU_; + ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU_; + bool shouldTransferToHost_{true}; + + cms::cuda::ContextState cudaState_; + + uint32_t maxNumberHits_; + uint32_t neb_, nee_; }; -void EcalUncalibRecHitProducerGPU::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; - - desc.add("digisLabelEB", - edm::InputTag("ecalRawToDigiGPU", "ebDigisGPU")); - desc.add("digisLabelEE", - edm::InputTag("ecalRawToDigiGPU", "eeDigisGPU")); - - desc.add("recHitsLabelEB", "EcalUncalibRecHitsEB"); - desc.add("recHitsLabelEE", "EcalUncalibRecHitsEE"); - - desc.add>("EBtimeFitParameters", - {-2.015452e+00, 3.130702e+00, -1.234730e+01, 4.188921e+01, -8.283944e+01, - 9.101147e+01, -5.035761e+01, 1.105621e+01}); - desc.add>("EEtimeFitParameters", - {-2.390548e+00, 3.553628e+00, -1.762341e+01, 6.767538e+01, -1.332130e+02, - 1.407432e+02, -7.541106e+01, 1.620277e+01}); - desc.add>("EBamplitudeFitParameters", {1.138,1.652}); - desc.add>("EEamplitudeFitParameters", {1.890,1.400}); - desc.add("EBtimeFitLimits_Lower", 0.2); - desc.add("EBtimeFitLimits_Upper", 1.4); - desc.add("EEtimeFitLimits_Lower", 0.2); - desc.add("EEtimeFitLimits_Upper", 1.4); - desc.add("EBtimeConstantTerm", .6); - 
desc.add("EEtimeConstantTerm", 1.0); - desc.add("EBtimeNconst", 28.5); - desc.add("EEtimeNconst", 31.8); - desc.add("outOfTimeThresholdGain12pEB", 5); - desc.add("outOfTimeThresholdGain12mEB", 5); - desc.add("outOfTimeThresholdGain61pEB", 5); - desc.add("outOfTimeThresholdGain61mEB", 5); - desc.add("outOfTimeThresholdGain12pEE", 1000); - desc.add("outOfTimeThresholdGain12mEE", 1000); - desc.add("outOfTimeThresholdGain61pEE", 1000); - desc.add("outOfTimeThresholdGain61mEE", 1000); - desc.add("amplitudeThresholdEB", 10); - desc.add("amplitudeThresholdEE", 10); - desc.add("maxNumberHits", 20000); //---- AM TEST - desc.add("shouldTransferToHost", true); - desc.add>("kernelMinimizeThreads", {32, 1, 1}); - // ---- default false or true? It was set to true, but at HLT it is false - desc.add("shouldRunTimingComputation", false); - std::string label = "ecalUncalibRecHitProducerGPU"; - confDesc.add(label, desc); +void EcalUncalibRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("digisLabelEB", edm::InputTag("ecalRawToDigiGPU", "ebDigisGPU")); + desc.add("digisLabelEE", edm::InputTag("ecalRawToDigiGPU", "eeDigisGPU")); + + desc.add("recHitsLabelEB", "EcalUncalibRecHitsEB"); + desc.add("recHitsLabelEE", "EcalUncalibRecHitsEE"); + + desc.add>("EBtimeFitParameters", + {-2.015452e+00, + 3.130702e+00, + -1.234730e+01, + 4.188921e+01, + -8.283944e+01, + 9.101147e+01, + -5.035761e+01, + 1.105621e+01}); + desc.add>("EEtimeFitParameters", + {-2.390548e+00, + 3.553628e+00, + -1.762341e+01, + 6.767538e+01, + -1.332130e+02, + 1.407432e+02, + -7.541106e+01, + 1.620277e+01}); + desc.add>("EBamplitudeFitParameters", {1.138, 1.652}); + desc.add>("EEamplitudeFitParameters", {1.890, 1.400}); + desc.add("EBtimeFitLimits_Lower", 0.2); + desc.add("EBtimeFitLimits_Upper", 1.4); + desc.add("EEtimeFitLimits_Lower", 0.2); + desc.add("EEtimeFitLimits_Upper", 1.4); + desc.add("EBtimeConstantTerm", .6); + 
desc.add("EEtimeConstantTerm", 1.0); + desc.add("EBtimeNconst", 28.5); + desc.add("EEtimeNconst", 31.8); + desc.add("outOfTimeThresholdGain12pEB", 5); + desc.add("outOfTimeThresholdGain12mEB", 5); + desc.add("outOfTimeThresholdGain61pEB", 5); + desc.add("outOfTimeThresholdGain61mEB", 5); + desc.add("outOfTimeThresholdGain12pEE", 1000); + desc.add("outOfTimeThresholdGain12mEE", 1000); + desc.add("outOfTimeThresholdGain61pEE", 1000); + desc.add("outOfTimeThresholdGain61mEE", 1000); + desc.add("amplitudeThresholdEB", 10); + desc.add("amplitudeThresholdEE", 10); + desc.add("maxNumberHits", 20000); //---- AM TEST + desc.add("shouldTransferToHost", true); + desc.add>("kernelMinimizeThreads", {32, 1, 1}); + // ---- default false or true? It was set to true, but at HLT it is false + desc.add("shouldRunTimingComputation", false); + std::string label = "ecalUncalibRecHitProducerGPU"; + confDesc.add(label, desc); } -EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU( - const edm::ParameterSet& ps) +EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(const edm::ParameterSet& ps) : digisTokenEB_{consumes>( - ps.getParameter("digisLabelEB"))} - , digisTokenEE_{consumes>( - ps.getParameter("digisLabelEE"))} - , recHitsTokenEB_{produces>>( - ps.getParameter("recHitsLabelEB"))} - , recHitsTokenEE_{produces>>( - ps.getParameter("recHitsLabelEE"))} -{ - auto EBamplitudeFitParameters = ps.getParameter>( - "EBamplitudeFitParameters"); - auto EEamplitudeFitParameters = ps.getParameter>( - "EEamplitudeFitParameters"); - auto EBtimeFitParameters = ps.getParameter>( - "EBtimeFitParameters"); - auto EEtimeFitParameters = ps.getParameter>( - "EEtimeFitParameters"); - std::pair EBtimeFitLimits, EEtimeFitLimits; - EBtimeFitLimits.first = ps.getParameter("EBtimeFitLimits_Lower"); - EBtimeFitLimits.second = ps.getParameter("EBtimeFitLimits_Upper"); - EEtimeFitLimits.first = ps.getParameter("EEtimeFitLimits_Lower"); - EEtimeFitLimits.second = 
ps.getParameter("EEtimeFitLimits_Upper"); - - auto EBtimeConstantTerm = ps.getParameter("EBtimeConstantTerm"); - auto EEtimeConstantTerm = ps.getParameter("EEtimeConstantTerm"); - auto EBtimeNconst = ps.getParameter("EBtimeNconst"); - auto EEtimeNconst = ps.getParameter("EEtimeNconst"); - - auto outOfTimeThreshG12pEB = ps.getParameter( - "outOfTimeThresholdGain12pEB"); - auto outOfTimeThreshG12mEB = ps.getParameter( - "outOfTimeThresholdGain12mEB"); - auto outOfTimeThreshG61pEB = ps.getParameter( - "outOfTimeThresholdGain61pEB"); - auto outOfTimeThreshG61mEB = ps.getParameter( - "outOfTimeThresholdGain61mEB"); - auto outOfTimeThreshG12pEE = ps.getParameter( - "outOfTimeThresholdGain12pEE"); - auto outOfTimeThreshG12mEE = ps.getParameter( - "outOfTimeThresholdGain12mEE"); - auto outOfTimeThreshG61pEE = ps.getParameter( - "outOfTimeThresholdGain61pEE"); - auto outOfTimeThreshG61mEE = ps.getParameter( - "outOfTimeThresholdGain61mEE"); - auto amplitudeThreshEB = ps.getParameter("amplitudeThresholdEB"); - auto amplitudeThreshEE = ps.getParameter("amplitudeThresholdEE"); - - // max number of digis to allocate for - maxNumberHits_ = ps.getParameter("maxNumberHits"); - - // transfer to host switch - shouldTransferToHost_ = ps.getParameter("shouldTransferToHost"); - - // switch to run timing computation kernels - configParameters_.shouldRunTimingComputation = - ps.getParameter("shouldRunTimingComputation"); - - // minimize kernel launch conf - auto threadsMinimize = ps.getParameter>("kernelMinimizeThreads"); - configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0]; - configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1]; - configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2]; - - // - // configuration and physics parameters: done once - // assume there is a single device - // use sync copying - // - - // amplitude fit parameters copying - cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB, - 
sizeof(ecal::multifit::ConfigurationParameters::type) - * EBamplitudeFitParameters.size()) ); - cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEB, - EBamplitudeFitParameters.data(), - EBamplitudeFitParameters.size() * - sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice) ); - cudaCheck( cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE, - sizeof(ecal::multifit::ConfigurationParameters::type) * - EEamplitudeFitParameters.size()) ); - cudaCheck( cudaMemcpy(configParameters_.amplitudeFitParametersEE, - EEamplitudeFitParameters.data(), - EEamplitudeFitParameters.size() * - sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice) ); - - // time fit parameters and limits - configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size(); - configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size(); - configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first; - configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second; - configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first; - configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second; - cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEB, - sizeof(ecal::multifit::ConfigurationParameters::type) - * EBtimeFitParameters.size()) ); - cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEB, - EBtimeFitParameters.data(), - EBtimeFitParameters.size() * - sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice) ); - cudaCheck( cudaMalloc((void**)&configParameters_.timeFitParametersEE, - sizeof(ecal::multifit::ConfigurationParameters::type) - * EEtimeFitParameters.size()) ); - cudaCheck( cudaMemcpy(configParameters_.timeFitParametersEE, - EEtimeFitParameters.data(), - EEtimeFitParameters.size() - * sizeof(ecal::multifit::ConfigurationParameters::type), - cudaMemcpyHostToDevice) ); - - // time constant terms - configParameters_.timeConstantTermEB = 
EBtimeConstantTerm; - configParameters_.timeConstantTermEE = EEtimeConstantTerm; - - // time N const - configParameters_.timeNconstEB = EBtimeNconst; - configParameters_.timeNconstEE = EEtimeNconst; - - // amplitude threshold for time flags - configParameters_.amplitudeThreshEB = amplitudeThreshEB; - configParameters_.amplitudeThreshEE = amplitudeThreshEE; - - // out of time thresholds gain-dependent - configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB; - configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE; - configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB; - configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE; - configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB; - configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE; - configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB; - configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE; - - // allocate event output data - eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_); - - // allocate scratch data for gpu - eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_); + ps.getParameter("digisLabelEB"))}, + digisTokenEE_{ + consumes>(ps.getParameter("digisLabelEE"))}, + recHitsTokenEB_{produces>>( + ps.getParameter("recHitsLabelEB"))}, + recHitsTokenEE_{produces>>( + ps.getParameter("recHitsLabelEE"))} { + auto EBamplitudeFitParameters = ps.getParameter>("EBamplitudeFitParameters"); + auto EEamplitudeFitParameters = ps.getParameter>("EEamplitudeFitParameters"); + auto EBtimeFitParameters = ps.getParameter>("EBtimeFitParameters"); + auto EEtimeFitParameters = ps.getParameter>("EEtimeFitParameters"); + std::pair EBtimeFitLimits, EEtimeFitLimits; + EBtimeFitLimits.first = ps.getParameter("EBtimeFitLimits_Lower"); + EBtimeFitLimits.second = ps.getParameter("EBtimeFitLimits_Upper"); + EEtimeFitLimits.first = ps.getParameter("EEtimeFitLimits_Lower"); + EEtimeFitLimits.second = 
ps.getParameter("EEtimeFitLimits_Upper"); + + auto EBtimeConstantTerm = ps.getParameter("EBtimeConstantTerm"); + auto EEtimeConstantTerm = ps.getParameter("EEtimeConstantTerm"); + auto EBtimeNconst = ps.getParameter("EBtimeNconst"); + auto EEtimeNconst = ps.getParameter("EEtimeNconst"); + + auto outOfTimeThreshG12pEB = ps.getParameter("outOfTimeThresholdGain12pEB"); + auto outOfTimeThreshG12mEB = ps.getParameter("outOfTimeThresholdGain12mEB"); + auto outOfTimeThreshG61pEB = ps.getParameter("outOfTimeThresholdGain61pEB"); + auto outOfTimeThreshG61mEB = ps.getParameter("outOfTimeThresholdGain61mEB"); + auto outOfTimeThreshG12pEE = ps.getParameter("outOfTimeThresholdGain12pEE"); + auto outOfTimeThreshG12mEE = ps.getParameter("outOfTimeThresholdGain12mEE"); + auto outOfTimeThreshG61pEE = ps.getParameter("outOfTimeThresholdGain61pEE"); + auto outOfTimeThreshG61mEE = ps.getParameter("outOfTimeThresholdGain61mEE"); + auto amplitudeThreshEB = ps.getParameter("amplitudeThresholdEB"); + auto amplitudeThreshEE = ps.getParameter("amplitudeThresholdEE"); + + // max number of digis to allocate for + maxNumberHits_ = ps.getParameter("maxNumberHits"); + + // transfer to host switch + shouldTransferToHost_ = ps.getParameter("shouldTransferToHost"); + + // switch to run timing computation kernels + configParameters_.shouldRunTimingComputation = ps.getParameter("shouldRunTimingComputation"); + + // minimize kernel launch conf + auto threadsMinimize = ps.getParameter>("kernelMinimizeThreads"); + configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0]; + configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1]; + configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2]; + + // + // configuration and physics parameters: done once + // assume there is a single device + // use sync copying + // + + // amplitude fit parameters copying + cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEB, + sizeof(ecal::multifit::ConfigurationParameters::type) * 
EBamplitudeFitParameters.size())); + cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEB, + EBamplitudeFitParameters.data(), + EBamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice)); + cudaCheck(cudaMalloc((void**)&configParameters_.amplitudeFitParametersEE, + sizeof(ecal::multifit::ConfigurationParameters::type) * EEamplitudeFitParameters.size())); + cudaCheck(cudaMemcpy(configParameters_.amplitudeFitParametersEE, + EEamplitudeFitParameters.data(), + EEamplitudeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice)); + + // time fit parameters and limits + configParameters_.timeFitParametersSizeEB = EBtimeFitParameters.size(); + configParameters_.timeFitParametersSizeEE = EEtimeFitParameters.size(); + configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first; + configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second; + configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first; + configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second; + cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEB, + sizeof(ecal::multifit::ConfigurationParameters::type) * EBtimeFitParameters.size())); + cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEB, + EBtimeFitParameters.data(), + EBtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice)); + cudaCheck(cudaMalloc((void**)&configParameters_.timeFitParametersEE, + sizeof(ecal::multifit::ConfigurationParameters::type) * EEtimeFitParameters.size())); + cudaCheck(cudaMemcpy(configParameters_.timeFitParametersEE, + EEtimeFitParameters.data(), + EEtimeFitParameters.size() * sizeof(ecal::multifit::ConfigurationParameters::type), + cudaMemcpyHostToDevice)); + + // time constant terms + configParameters_.timeConstantTermEB = EBtimeConstantTerm; + configParameters_.timeConstantTermEE = EEtimeConstantTerm; + + // time N const 
+ configParameters_.timeNconstEB = EBtimeNconst; + configParameters_.timeNconstEE = EEtimeNconst; + + // amplitude threshold for time flags + configParameters_.amplitudeThreshEB = amplitudeThreshEB; + configParameters_.amplitudeThreshEE = amplitudeThreshEE; + + // out of time thresholds gain-dependent + configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB; + configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE; + configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB; + configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE; + configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB; + configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE; + configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB; + configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE; + + // allocate event output data + eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_); + + // allocate scratch data for gpu + eventDataForScratchGPU_.allocate(configParameters_, maxNumberHits_); } EcalUncalibRecHitProducerGPU::~EcalUncalibRecHitProducerGPU() { - // - // assume single device for now - // - - if (configParameters_.amplitudeFitParametersEB) { - // configuration parameters - cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEB) ); - cudaCheck( cudaFree(configParameters_.amplitudeFitParametersEE) ); - cudaCheck( cudaFree(configParameters_.timeFitParametersEB) ); - cudaCheck( cudaFree(configParameters_.timeFitParametersEE) ); - - // free event ouput data - eventOutputDataGPU_.deallocate(configParameters_); - - // free event scratch data - eventDataForScratchGPU_.deallocate(configParameters_); - } + // + // assume single device for now + // + + if (configParameters_.amplitudeFitParametersEB) { + // configuration parameters + cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEB)); + cudaCheck(cudaFree(configParameters_.amplitudeFitParametersEE)); + 
cudaCheck(cudaFree(configParameters_.timeFitParametersEB)); + cudaCheck(cudaFree(configParameters_.timeFitParametersEE)); + + // free event ouput data + eventOutputDataGPU_.deallocate(configParameters_); + + // free event scratch data + eventDataForScratchGPU_.deallocate(configParameters_); + } } -void EcalUncalibRecHitProducerGPU::acquire( - edm::Event const& event, - edm::EventSetup const& setup, - edm::WaitingTaskWithArenaHolder holder) -{ - // cuda products - auto const& ebDigisProduct = event.get(digisTokenEB_); - auto const& eeDigisProduct = event.get(digisTokenEE_); - - // raii - cms::cuda::ScopedContextAcquire ctx{ebDigisProduct, std::move(holder), cudaState_}; - - // get actual obj - auto const& ebDigis = ctx.get(ebDigisProduct); - auto const& eeDigis = ctx.get(eeDigisProduct); - ecal::multifit::EventInputDataGPU inputDataGPU{ebDigis, eeDigis}; - neb_ = ebDigis.ndigis; - nee_ = eeDigis.ndigis; - - // conditions - setup.get().get(pedestalsHandle_); - setup.get().get(gainRatiosHandle_); - setup.get().get(pulseShapesHandle_); - setup.get().get(pulseCovariancesHandle_); - setup.get().get(samplesCorrelationHandle_); - setup.get().get(timeBiasCorrectionsHandle_); - setup.get().get(timeCalibConstantsHandle_); - setup.get().get(sampleMaskHandle_); - setup.get().get(timeOffsetConstantHandle_); - - auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream()); - auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream()); - auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream()); - auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream()); - auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream()); - auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream()); - auto const& timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream()); - - // bundle up conditions - ecal::multifit::ConditionsProducts conditions { 
- pedProduct, gainsProduct, pulseShapesProduct, - pulseCovariancesProduct, - samplesCorrelationProduct, - timeBiasCorrectionsProduct, - timeCalibConstantsProduct, - *sampleMaskHandle_, - *timeOffsetConstantHandle_, - timeCalibConstantsHandle_->getOffset() - }; - - // - // schedule algorithms - // - ecal::multifit::entryPoint( - inputDataGPU, - eventOutputDataGPU_, - eventDataForScratchGPU_, - conditions, - configParameters_, - ctx.stream() - ); +void EcalUncalibRecHitProducerGPU::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) { + // cuda products + auto const& ebDigisProduct = event.get(digisTokenEB_); + auto const& eeDigisProduct = event.get(digisTokenEE_); + + // raii + cms::cuda::ScopedContextAcquire ctx{ebDigisProduct, std::move(holder), cudaState_}; + + // get actual obj + auto const& ebDigis = ctx.get(ebDigisProduct); + auto const& eeDigis = ctx.get(eeDigisProduct); + ecal::multifit::EventInputDataGPU inputDataGPU{ebDigis, eeDigis}; + neb_ = ebDigis.ndigis; + nee_ = eeDigis.ndigis; + + // conditions + setup.get().get(pedestalsHandle_); + setup.get().get(gainRatiosHandle_); + setup.get().get(pulseShapesHandle_); + setup.get().get(pulseCovariancesHandle_); + setup.get().get(samplesCorrelationHandle_); + setup.get().get(timeBiasCorrectionsHandle_); + setup.get().get(timeCalibConstantsHandle_); + setup.get().get(sampleMaskHandle_); + setup.get().get(timeOffsetConstantHandle_); + + auto const& pedProduct = pedestalsHandle_->getProduct(ctx.stream()); + auto const& gainsProduct = gainRatiosHandle_->getProduct(ctx.stream()); + auto const& pulseShapesProduct = pulseShapesHandle_->getProduct(ctx.stream()); + auto const& pulseCovariancesProduct = pulseCovariancesHandle_->getProduct(ctx.stream()); + auto const& samplesCorrelationProduct = samplesCorrelationHandle_->getProduct(ctx.stream()); + auto const& timeBiasCorrectionsProduct = timeBiasCorrectionsHandle_->getProduct(ctx.stream()); + auto const& 
timeCalibConstantsProduct = timeCalibConstantsHandle_->getProduct(ctx.stream()); + + // bundle up conditions + ecal::multifit::ConditionsProducts conditions{pedProduct, + gainsProduct, + pulseShapesProduct, + pulseCovariancesProduct, + samplesCorrelationProduct, + timeBiasCorrectionsProduct, + timeCalibConstantsProduct, + *sampleMaskHandle_, + *timeOffsetConstantHandle_, + timeCalibConstantsHandle_->getOffset()}; + + // + // schedule algorithms + // + ecal::multifit::entryPoint( + inputDataGPU, eventOutputDataGPU_, eventDataForScratchGPU_, conditions, configParameters_, ctx.stream()); } -void EcalUncalibRecHitProducerGPU::produce( - edm::Event& event, - edm::EventSetup const& setup) -{ - //DurationMeasurer timer{std::string{"produce duration"}}; - cms::cuda::ScopedContextProduce ctx{cudaState_}; - - // copy construct output collections - // note, output collections do not own device memory! - ecal::UncalibratedRecHit - ebRecHits{eventOutputDataGPU_}, - eeRecHits{eventOutputDataGPU_}; - - // set the size of eb and ee - ebRecHits.size = neb_; - eeRecHits.size = nee_; - - // shift ptrs for ee - eeRecHits.amplitudesAll += neb_ * EcalDataFrame::MAXSAMPLES; - eeRecHits.amplitude += neb_; - eeRecHits.chi2 += neb_; - eeRecHits.pedestal += neb_; - eeRecHits.did += neb_; - eeRecHits.flags += neb_; - if (configParameters_.shouldRunTimingComputation) { - eeRecHits.jitter += neb_; - eeRecHits.jitterError += neb_; - } - - // put into the event - ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits)); - ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits)); +void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { + //DurationMeasurer timer{std::string{"produce duration"}}; + cms::cuda::ScopedContextProduce ctx{cudaState_}; + + // copy construct output collections + // note, output collections do not own device memory! 
+ ecal::UncalibratedRecHit ebRecHits{eventOutputDataGPU_}, eeRecHits{eventOutputDataGPU_}; + + // set the size of eb and ee + ebRecHits.size = neb_; + eeRecHits.size = nee_; + + // shift ptrs for ee + eeRecHits.amplitudesAll += neb_ * EcalDataFrame::MAXSAMPLES; + eeRecHits.amplitude += neb_; + eeRecHits.chi2 += neb_; + eeRecHits.pedestal += neb_; + eeRecHits.did += neb_; + eeRecHits.flags += neb_; + if (configParameters_.shouldRunTimingComputation) { + eeRecHits.jitter += neb_; + eeRecHits.jitterError += neb_; + } + + // put into the event + ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits)); + ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits)); } DEFINE_FWK_MODULE(EcalUncalibRecHitProducerGPU); From d5f111724fab3e72bc8be43b205690ac1a85d84b Mon Sep 17 00:00:00 2001 From: amassiro Date: Mon, 6 Apr 2020 08:56:45 +0200 Subject: [PATCH 09/30] first commit --- .../EcalRecHitSoA/interface/EcalRecHit_soa.h | 52 ++ .../interface/EcalUncalibratedRecHit_soa.h | 16 + CUDADataFormats/EcalRecHitSoA/src/classes.h | 1 + .../EcalRecHitSoA/src/classes_def.xml | 2 + .../EcalRecAlgos/interface/DeclsForKernels.h | 133 ++++ .../interface/EcalADCToGeVConstantGPU.h | 43 + .../interface/EcalChannelStatusGPU.h | 43 + .../interface/EcalIntercalibConstantsGPU.h | 44 ++ .../interface/EcalLaserAPDPNRatiosGPU.h | 58 ++ .../interface/EcalLaserAPDPNRatiosRefGPU.h | 44 ++ .../interface/EcalLaserAlphasGPU.h | 44 ++ .../interface/EcalLinearCorrectionsGPU.h | 57 ++ .../src/AmplitudeComputationCommonKernels.cu | 4 +- .../src/EcalADCToGeVConstantGPU.cc | 37 + .../EcalRecAlgos/src/EcalChannelStatusGPU.cc | 47 ++ .../src/EcalIntercalibConstantsGPU.cc | 44 ++ .../src/EcalLaserAPDPNRatiosGPU.cc | 109 +++ .../src/EcalLaserAPDPNRatiosRefGPU.cc | 44 ++ .../EcalRecAlgos/src/EcalLaserAlphasGPU.cc | 44 ++ .../src/EcalLinearCorrectionsGPU.cc | 102 +++ .../src/EcalRecHitBuilderKernels.cu | 734 ++++++++++++++++++ .../src/EcalRecHitBuilderKernels.h | 97 +++ .../EcalRecAlgos/src/KernelHelpers.cu | 308 
+++++++- .../EcalRecAlgos/src/KernelHelpers.h | 21 +- .../src/TimeComputationKernels.cu | 4 +- .../plugins/EcalCPURecHitProducer.cc | 190 +++++ .../plugins/EcalRecHitConvertGPU2CPUFormat.cc | 137 ++++ .../python/ecalRecHitGPU_cfi.py | 132 ++++ .../test/sourceFromRawCmggpu_cff.py | 151 ++++ .../test/testEcalUncalibRechitProducer_cfg.py | 231 ++++++ 30 files changed, 2960 insertions(+), 13 deletions(-) create mode 100644 CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc create mode 100644 RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py 
create mode 100644 RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py create mode 100644 RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h new file mode 100644 index 0000000000000..20d342d1b7073 --- /dev/null +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h @@ -0,0 +1,52 @@ +#ifndef CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_soa_h +#define CUDADataFormats_EcalRecHitSoA_interface_EcalRecHit_soa_h + +#include +#include + +#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" + +// needed for "soa" definition +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" + +namespace ecal { + + template + struct RecHit : public Detail::Base { + + RecHit() = default; + RecHit(const RecHit&) = default; + RecHit& operator=(const RecHit&) = default; + + RecHit(RecHit&&) = default; + RecHit& operator=(RecHit&&) = default; + + typename type_wrapper::type energy; + typename type_wrapper::type time; + typename type_wrapper::type chi2; // should we remove this, since already included in "extra" ? 
+ typename type_wrapper::type extra; // packed uint32_t for timeError, chi2, energyError + typename type_wrapper::type flagBits; // store rechit condition (see Flags enum) in a bit-wise way + + typename type_wrapper::type did; + + + template + typename std::enable_if::value, void>::type + resize(size_t size) { + energy.resize(size); + time.resize(size); + chi2.resize(size); + extra.resize(size); + flagBits.resize(size); + did.resize(size); + } + }; + + using SoARecHitCollection = RecHit; + +} + +#endif +// RecoLocalCalo_EcalRecAlgos_interface_EcalRecHit_soa_h + diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h index e11c13ebdf4c2..fe11fc64dae8f 100644 --- a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h @@ -18,6 +18,22 @@ namespace ecal { } // namespace Tag + + namespace Detail { + + // empty base + template + struct Base {}; + + // add number of values for ptr case + template<> + struct Base<::ecal::Tag::ptr> { + uint32_t size; + }; + + } + + template struct type_wrapper { //#ifndef ECAL_MULTIFIT_DONOT_USE_PINNED_MEM diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes.h b/CUDADataFormats/EcalRecHitSoA/src/classes.h index 8ad6b8d684b9a..5c47ccc6c10e9 100644 --- a/CUDADataFormats/EcalRecHitSoA/src/classes.h +++ b/CUDADataFormats/EcalRecHitSoA/src/classes.h @@ -1,2 +1,3 @@ #include "DataFormats/Common/interface/Wrapper.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" \ No newline at end of file diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml index 461460835a723..7217782abac05 100644 --- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml +++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml @@ -2,4 
+2,6 @@ + + diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h index b997906006a22..1a117a63288ef 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h @@ -252,4 +252,137 @@ namespace ecal { } // namespace multifit } // namespace ecal + + + + + + + + +// +// ECAL Rechit producer +// + +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" + +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h" + + +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" + + + + +namespace ecal { + namespace rechit { + + // parameters that are read in the configuration file for rechit producer + struct ConfigurationParameters { + // device ptrs + int *ChannelStatusToBeExcluded=nullptr; + uint32_t ChannelStatusToBeExcludedSize; + + bool killDeadChannels; + + bool recoverEBIsolatedChannels ; + bool recoverEEIsolatedChannels ; + bool recoverEBVFE ; + bool recoverEEVFE ; + bool recoverEBFE ; + bool recoverEEFE ; + + float EBLaserMIN; + float EELaserMIN; + float EBLaserMAX; + float EELaserMAX; + + // std::vector > v_DB_reco_flags; + int* expanded_v_DB_reco_flags; + uint32_t* expanded_Sizes_v_DB_reco_flags; + uint32_t* expanded_flagbit_v_DB_reco_flags; + uint32_t expanded_v_DB_reco_flagsSize; + + uint32_t flagmask; + + + // + // bool shouldRunTimingComputation; + }; + + + + + + + struct EventOutputDataGPU final : public ::ecal::RecHit<::ecal::Tag::ptr> { + + void allocate(ConfigurationParameters const& configParameters, 
uint32_t size) { + // void allocate(uint32_t size) { + //---- configParameters -> needed only to decide if to save the timing information or not + + cudaCheck( cudaMalloc((void**)&energy, + size * sizeof(::ecal::reco::StorageScalarType)) ); + cudaCheck( cudaMalloc((void**)&time, + size * sizeof(::ecal::reco::StorageScalarType)) ); + cudaCheck( cudaMalloc((void**)&chi2, + size * sizeof(::ecal::reco::StorageScalarType)) ); + cudaCheck( cudaMalloc((void**)&flagBits, + size * sizeof(uint32_t)) ); + cudaCheck( cudaMalloc((void**)&extra, + size * sizeof(uint32_t)) ); + cudaCheck( cudaMalloc((void**)&did, + size * sizeof(uint32_t)) ); + } + + + void deallocate(ConfigurationParameters const& configParameters) { + // void deallocate() { + //---- configParameters -> needed only to decide if to save the timing information or not + + cudaCheck( cudaFree(energy) ); + cudaCheck( cudaFree(time) ); + cudaCheck( cudaFree(chi2) ); + cudaCheck( cudaFree(flagBits) ); + cudaCheck( cudaFree(extra) ); + cudaCheck( cudaFree(did) ); + } + }; + + + + struct EventInputDataGPU { + ecal::UncalibratedRecHit const& ebUncalibRecHits; + ecal::UncalibratedRecHit const& eeUncalibRecHits; + }; + + // const refs products to conditions + struct ConditionsProducts { + EcalADCToGeVConstantGPU::Product const& ADCToGeV; + EcalIntercalibConstantsGPU::Product const& Intercalib; + EcalChannelStatusGPU::Product const& ChannelStatus; + // + EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios ; + EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef; + EcalLaserAlphasGPU::Product const& LaserAlphas ; + EcalLinearCorrectionsGPU::Product const& LinearCorrections ; + // + // + uint32_t offsetForHashes; + }; + + + + } +} + + + + + #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h new file mode 100644 index 0000000000000..4f6cb43eddee0 --- /dev/null +++ 
b/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h @@ -0,0 +1,43 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h + +#include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalADCToGeVConstantGPU { +public: + struct Product { + ~Product(); + float *adc2gev = nullptr; + }; + + #ifndef __CUDACC__ + + // + EcalADCToGeVConstantGPU(EcalADCToGeVConstant const&); + + // will call dealloation for Product thru ~Product + ~EcalADCToGeVConstantGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // + static std::string name() { return std::string{"ecalADCToGeVConstantGPU"}; } + +private: + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> adc2gev_; + + cms::cuda::ESProduct product_; + + #endif +}; + + +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h new file mode 100644 index 0000000000000..0932e7f0641d9 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h @@ -0,0 +1,43 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h + +#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalChannelStatusGPU { +public: + struct Product { + ~Product(); + uint16_t *status = nullptr; + }; + + #ifndef __CUDACC__ + + // + EcalChannelStatusGPU(EcalChannelStatus const&); + + // will call dealloation for Product 
thru ~Product + ~EcalChannelStatusGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // + static std::string name() { return std::string{"ecalChannelStatusGPU"}; } + +private: + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> status_; + + cms::cuda::ESProduct product_; + + #endif +}; + + +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h new file mode 100644 index 0000000000000..ae36aa78c9e45 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h @@ -0,0 +1,44 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalIntercalibConstantsGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalIntercalibConstantsGPU_h + +#include "CondFormats/EcalObjects/interface/EcalIntercalibConstants.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalIntercalibConstantsGPU { +public: + struct Product { + ~Product(); + float *values = nullptr; + }; + + #ifndef __CUDACC__ + // + EcalIntercalibConstantsGPU(EcalIntercalibConstants const&); + + // will call dealloation for Product thru ~Product + ~EcalIntercalibConstantsGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // TODO: do this centrally + // get offset for hashes. 
equals number of barrel items + uint32_t getOffset() const { return valuesEB_.size(); } + + // + static std::string name() { return std::string{"ecalIntercalibConstantsGPU"}; } + +private: + std::vector const& valuesEB_; + std::vector const& valuesEE_; + + cms::cuda::ESProduct product_; + #endif +}; + + +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h new file mode 100644 index 0000000000000..53c8ea6ba67b7 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h @@ -0,0 +1,58 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosGPU_h + +#include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatios.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalLaserAPDPNRatiosGPU { +public: + struct Product { + ~Product(); + float *p1=nullptr; + float *p2=nullptr; + float *p3=nullptr; + edm::TimeValue_t *t1=nullptr; + edm::TimeValue_t *t2=nullptr; + edm::TimeValue_t *t3=nullptr; + }; + + #ifndef __CUDACC__ + + // + EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const&); + + // will call dealloation for Product thru ~Product + ~EcalLaserAPDPNRatiosGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // + static std::string name() { return std::string{"ecalLaserAPDPNRatiosGPU"}; } + +private: + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector > p1_; + std::vector > p2_; + std::vector > p3_; + + std::vector > t1_; + std::vector > t2_; + std::vector > t3_; + + cms::cuda::ESProduct product_; + + #endif +}; + + +#endif + + + + diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h 
b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h new file mode 100644 index 0000000000000..191c78a7c4617 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h @@ -0,0 +1,44 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosRefGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalLaserAPDPNRatiosRefGPU_h + +#include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatiosRef.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalLaserAPDPNRatiosRefGPU { +public: + struct Product { + ~Product(); + float *values = nullptr; + }; + + #ifndef __CUDACC__ + // + EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const&); + + // will call dealloation for Product thru ~Product + ~EcalLaserAPDPNRatiosRefGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // TODO: do this centrally + // get offset for hashes. 
equals number of barrel items + uint32_t getOffset() const { return valuesEB_.size(); } + + // + static std::string name() { return std::string{"ecalLaserAPDPNRatiosRefGPU"}; } + +private: + std::vector const& valuesEB_; + std::vector const& valuesEE_; + + cms::cuda::ESProduct product_; + #endif +}; + + +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h new file mode 100644 index 0000000000000..ac97e6c514bac --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h @@ -0,0 +1,44 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLaserAlphasGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalLaserAlphasGPU_h + +#include "CondFormats/EcalObjects/interface/EcalLaserAlphas.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalLaserAlphasGPU { +public: + struct Product { + ~Product(); + float *values = nullptr; + }; + + #ifndef __CUDACC__ + // + EcalLaserAlphasGPU(EcalLaserAlphas const&); + + // will call dealloation for Product thru ~Product + ~EcalLaserAlphasGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // TODO: do this centrally + // get offset for hashes. 
equals number of barrel items + uint32_t getOffset() const { return valuesEB_.size(); } + + // + static std::string name() { return std::string{"ecalLaserAlphasGPU"}; } + +private: + std::vector const& valuesEB_; + std::vector const& valuesEE_; + + cms::cuda::ESProduct product_; + #endif +}; + + +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h new file mode 100644 index 0000000000000..41469bcf16c82 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h @@ -0,0 +1,57 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalLinearCorrectionsGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalLinearCorrectionsGPU_h + +#include "CondFormats/EcalObjects/interface/EcalLinearCorrections.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalLinearCorrectionsGPU { +public: + struct Product { + ~Product(); + float *p1=nullptr; + float *p2=nullptr; + float *p3=nullptr; + edm::TimeValue_t *t1=nullptr; + edm::TimeValue_t *t2=nullptr; + edm::TimeValue_t *t3=nullptr; + }; + + #ifndef __CUDACC__ + + // + EcalLinearCorrectionsGPU(EcalLinearCorrections const&); + + // will call dealloation for Product thru ~Product + ~EcalLinearCorrectionsGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // + static std::string name() { return std::string{"ecalLinearCorrectionsGPU"}; } + +private: + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> p1_; + std::vector> p2_; + std::vector> p3_; + + std::vector> t1_; + std::vector> t2_; + std::vector> t3_; + + cms::cuda::ESProduct product_; + + #endif +}; + + +#endif + + + diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu 
b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu index bc2b1300123dd..cf59775811486 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu @@ -77,7 +77,7 @@ namespace ecal { auto const did = DetId{dids[ch]}; auto const isBarrel = did.subdetId() == EcalBarrel; // TODO offset for ee, 0 for eb - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); // // pulse shape template @@ -334,7 +334,7 @@ namespace ecal { bool tmp1 = hasSwitchToGain1[ch]; auto const did = DetId{dids[ch]}; auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE; auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE; auto const* G1SamplesCorrelation = isBarrel ? 
G1SamplesCorrelationEB : G1SamplesCorrelationEE; diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc new file mode 100644 index 0000000000000..25ec93faad1e7 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc @@ -0,0 +1,37 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalADCToGeVConstantGPU::EcalADCToGeVConstantGPU(EcalADCToGeVConstant const& values) +: adc2gev_(2) // size is 2, one form EB and one for EE +{ + adc2gev_[0] = values.getEBValue(); + adc2gev_[1] = values.getEEValue(); +} + +EcalADCToGeVConstantGPU::Product::~Product() { + // deallocation + cudaCheck( cudaFree(adc2gev) ); +} + +EcalADCToGeVConstantGPU::Product const& EcalADCToGeVConstantGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, + [this](EcalADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.adc2gev, + this->adc2gev_.size() * sizeof(float)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.adc2gev, + this->adc2gev_.data(), + this->adc2gev_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalADCToGeVConstantGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc new file mode 100644 index 0000000000000..c1cdc6631878b --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc @@ -0,0 +1,47 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + 
+EcalChannelStatusGPU::EcalChannelStatusGPU(EcalChannelStatus const& values) +: status_(values.size()) +{ + // fill in eb + auto const& barrelValues = values.barrelItems(); + for (unsigned int i=0; istatus_.size() * sizeof(uint16_t)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.status, + this->status_.data(), + this->status_.size() * sizeof(uint16_t), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalChannelStatusGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc new file mode 100644 index 0000000000000..844a28d27fd8e --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc @@ -0,0 +1,44 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalIntercalibConstantsGPU::EcalIntercalibConstantsGPU(EcalIntercalibConstants const& values) +: valuesEB_{values.barrelItems()} +, valuesEE_{values.endcapItems()} +{} + +EcalIntercalibConstantsGPU::Product::~Product() { + // deallocation + cudaCheck( cudaFree(values) ); +} + +EcalIntercalibConstantsGPU::Product const& EcalIntercalibConstantsGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalIntercalibConstantsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.values, + (this->valuesEB_.size() + this->valuesEE_.size()) * + sizeof(float)) ); + + // offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck( cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + 
this->valuesEE_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalIntercalibConstantsGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc new file mode 100644 index 0000000000000..f54f7bd47c022 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc @@ -0,0 +1,109 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalLaserAPDPNRatiosGPU::EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const& values) +: p1_(values.getLaserMap().size()) +, p2_(values.getLaserMap().size()) +, p3_(values.getLaserMap().size()) +, t1_(values.getTimeMap().size()) +, t2_(values.getTimeMap().size()) +, t3_(values.getTimeMap().size()) +{ + + // fill in eb + // auto const& barrelValues = values.barrelItems(); + for (unsigned int i=0; i EcalLaserTimeStampMap; + for (unsigned int i=0; ip1_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.p2, + this->p2_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.p3, + this->p3_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.t1, + this->t1_.size() * sizeof(edm::TimeValue_t)) ); + cudaCheck( cudaMalloc((void**)&product.t2, + this->t2_.size() * sizeof(edm::TimeValue_t)) ); + cudaCheck( cudaMalloc((void**)&product.t3, + this->t3_.size() * sizeof(edm::TimeValue_t)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.p1, + this->p1_.data(), + this->p1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.p2, + this->p2_.data(), + this->p2_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.p3, + this->p3_.data(), + this->p3_.size() * sizeof(float), + cudaMemcpyHostToDevice, + 
cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.t1, + this->t1_.data(), + this->t1_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.t2, + this->t2_.data(), + this->t2_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.t3, + this->t3_.data(), + this->t3_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; + } + + TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosGPU); + \ No newline at end of file diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc new file mode 100644 index 0000000000000..c4c07361a8535 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc @@ -0,0 +1,44 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalLaserAPDPNRatiosRefGPU::EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const& values) +: valuesEB_{values.barrelItems()} +, valuesEE_{values.endcapItems()} +{} + +EcalLaserAPDPNRatiosRefGPU::Product::~Product() { + // deallocation + cudaCheck( cudaFree(values) ); +} + +EcalLaserAPDPNRatiosRefGPU::Product const& EcalLaserAPDPNRatiosRefGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalLaserAPDPNRatiosRefGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.values, + (this->valuesEB_.size() + this->valuesEE_.size()) * + sizeof(float)) ); + + // offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck( cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(float), + cudaMemcpyHostToDevice, + 
cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosRefGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc new file mode 100644 index 0000000000000..24257fd8b547a --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc @@ -0,0 +1,44 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalLaserAlphasGPU::EcalLaserAlphasGPU(EcalLaserAlphas const& values) +: valuesEB_{values.barrelItems()} +, valuesEE_{values.endcapItems()} +{} + +EcalLaserAlphasGPU::Product::~Product() { + // deallocation + cudaCheck( cudaFree(values) ); +} + +EcalLaserAlphasGPU::Product const& EcalLaserAlphasGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, + [this](EcalLaserAlphasGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.values, + (this->valuesEB_.size() + this->valuesEE_.size()) * + sizeof(float)) ); + + // offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck( cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalLaserAlphasGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc new file mode 100644 
index 0000000000000..2dedb1074bee7 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc @@ -0,0 +1,102 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalLinearCorrectionsGPU::EcalLinearCorrectionsGPU(EcalLinearCorrections const& values) +: p1_(values.getValueMap().size()) +, p2_(values.getValueMap().size()) +, p3_(values.getValueMap().size()) +, t1_(values.getTimeMap().size()) +, t2_(values.getTimeMap().size()) +, t3_(values.getTimeMap().size()) +{ + + // fill in eb + for (unsigned int i=0; i EcalLaserTimeStampMap; + for (unsigned int i=0; ip1_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.p2, + this->p2_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.p3, + this->p3_.size() * sizeof(float)) ); + cudaCheck( cudaMalloc((void**)&product.t1, + this->t1_.size() * sizeof(edm::TimeValue_t)) ); + cudaCheck( cudaMalloc((void**)&product.t2, + this->t2_.size() * sizeof(edm::TimeValue_t)) ); + cudaCheck( cudaMalloc((void**)&product.t3, + this->t3_.size() * sizeof(edm::TimeValue_t)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.p1, + this->p1_.data(), + this->p1_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.p2, + this->p2_.data(), + this->p2_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.p3, + this->p3_.data(), + this->p3_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.t1, + this->t1_.data(), + this->t1_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.t2, + this->t2_.data(), + this->t2_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream) ); + cudaCheck( cudaMemcpyAsync(product.t3, + 
this->t3_.data(),
+                                 this->t3_.size() * sizeof(edm::TimeValue_t),
+                                 cudaMemcpyHostToDevice,
+                                 cudaStream) );
+    }
+    );
+
+    return product;
+  }
+
+  TYPELOOKUP_DATA_REG(EcalLinearCorrectionsGPU);
+
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
new file mode 100644
index 0000000000000..ab67ceb46fc0f
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu
@@ -0,0 +1,734 @@
+#include "cuda.h"
+
+#include "KernelHelpers.h"
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+
+//
+//
+#include "EcalRecHitBuilderKernels.h"
+
+
+#include "KernelHelpers.h"
+
+
+
+
+namespace ecal {
+  namespace rechit {
+
+
+    // Flags carried by the uncalibrated rechits (values written out explicitly)
+    enum UncalibRecHitFlags {
+      kGood = -1,                 // channel is good (mutually exclusive with other states)  setFlagBit(kGood) reset flags_ to zero
+      kPoorReco = 0,              // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.)
+      kSaturated = 1,             // saturated channel
+      kOutOfTime = 2,             // channel out of time
+      kLeadingEdgeRecovered = 3,  // saturated channel: energy estimated from the leading edge before saturation
+      kHasSwitchToGain6 = 4,      // at least one data frame is in G6
+      kHasSwitchToGain1 = 5       // at least one data frame is in G1
+    };
+
+
+    // Flags of the calibrated rechits (values written out explicitly)
+    enum RecHitFlags {
+      RecHitFlags_kGood = 0,                   // channel ok, the energy and time measurement are reliable
+      RecHitFlags_kPoorReco = 1,               // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2)
+      RecHitFlags_kOutOfTime = 2,              // the energy is available from the UncalibRecHit (sync reco), but the event is out of time
+      RecHitFlags_kFaultyHardware = 3,         // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. noisy)
+      RecHitFlags_kNoisy = 4,                  // the channel is very noisy
+      RecHitFlags_kPoorCalib = 5,              // the energy is available from the UncalibRecHit, but the calibration of the channel is poor
+      RecHitFlags_kSaturated = 6,              // saturated channel (recovery not tried)
+      RecHitFlags_kLeadingEdgeRecovered = 7,   // saturated channel: energy estimated from the leading edge before saturation
+      RecHitFlags_kNeighboursRecovered = 8,    // saturated/isolated dead: energy estimated from neighbours
+      RecHitFlags_kTowerRecovered = 9,         // channel in TT with no data link, info retrieved from Trigger Primitive
+      RecHitFlags_kDead = 10,                  // channel is dead and any recovery fails
+      RecHitFlags_kKilled = 11,                // MC only flag: the channel is killed in the real detector
+      RecHitFlags_kTPSaturated = 12,           // the channel is in a region with saturated TP
+      RecHitFlags_kL1SpikeFlag = 13,           // the channel is in a region with TP with sFGVB = 0
+      RecHitFlags_kWeird = 14,                 // the signal is believed to originate from an anomalous deposit (spike)
+      RecHitFlags_kDiWeird = 15,               // the signal is anomalous, and neighbors another anomalous signal
+      RecHitFlags_kHasSwitchToGain6 = 16,      // at least one data frame is in G6
+      RecHitFlags_kHasSwitchToGain1 = 17,      // at least one data frame is in G1
+      //
+      RecHitFlags_kUnknown = 18                // to ease the interface with functions returning flags.
+    };
+
+
+    // Channel status codes (values written out explicitly)
+    enum EcalChannelStatusCode_Code {
+      kOk = 0,
+      kDAC = 1,
+      kNoLaser = 2,
+      kNoisy = 3,
+      kNNoisy = 4,
+      kNNNoisy = 5,
+      kNNNNoisy = 6,
+      kNNNNNoisy = 7,
+      kFixedG6 = 8,
+      kFixedG1 = 9,
+      kFixedG0 = 10,
+      kNonRespondingIsolated = 11,
+      kDeadVFE = 12,
+      kDeadFE = 13,
+      kNoDataNoTP = 14
+    };
+
+
+
+
+
+    __global__
+    void kernel_create_ecal_rehit(
+      // configuration
+      int const* ChannelStatusToBeExcluded,
+      uint32_t ChannelStatusToBeExcludedSize,
+      bool const killDeadChannels,
+      bool const recoverEBIsolatedChannels,
+      bool const recoverEEIsolatedChannels,
+      bool const recoverEBVFE,
+      bool const recoverEEVFE,
+      bool const recoverEBFE,
+      bool const recoverEEFE,
+      float const EBLaserMIN,
+      float const EELaserMIN,
+      float const EBLaserMAX,
+      float const EELaserMAX,
+      // for flags setting
+      int const* expanded_v_DB_reco_flags,    // FIXME AM: to be checked
+      uint32_t const* expanded_Sizes_v_DB_reco_flags,
+      uint32_t const* expanded_flagbit_v_DB_reco_flags,
+      uint32_t expanded_v_DB_reco_flagsSize,
+      uint32_t flagmask,
+      // conditions
+      float const* adc2gev,
+      float const* intercalib,
+      uint16_t const* status,
+      float const* apdpnrefs,
+      float const* alphas,
+      // input for transparency corrections
+      float const* p1,
+      float const* p2,
+      float const* p3,
+      edm::TimeValue_t const* t1,
+      edm::TimeValue_t const* t2,
+      edm::TimeValue_t const* t3,
+      // input for linear corrections
+      float const* lp1,
+      float const* lp2,
+      float const* lp3,
+      edm::TimeValue_t const* lt1,
+      edm::TimeValue_t const* lt2,
+      edm::TimeValue_t const* lt3,
+      // time, used for time dependent corrections
+      edm::TimeValue_t const event_time,
+      // input
+      uint32_t const* did_eb,
+      uint32_t const* did_ee,
+      ::ecal::reco::StorageScalarType const* amplitude_eb,   // in adc counts
+      ::ecal::reco::StorageScalarType const* amplitude_ee,   // in adc counts
+      ::ecal::reco::StorageScalarType const* time_eb,
+      ::ecal::reco::StorageScalarType const* time_ee,
+      ::ecal::reco::StorageScalarType const* chi2_eb,
+      ::ecal::reco::StorageScalarType const* chi2_ee,
+
uint32_t const* flags_eb, + uint32_t const* flags_ee, + // output + uint32_t *did, + ::ecal::reco::StorageScalarType* energy, // in energy [GeV] + ::ecal::reco::StorageScalarType* time, + ::ecal::reco::StorageScalarType* chi2, + uint32_t* flagBits, + uint32_t* extra, + // other + int const nchannels, + uint32_t const offsetForInput, + uint32_t const offsetForHashes + ) { + + + // + // NB: energy "type_wrapper::type" most likely std::vector + // + + int ch = threadIdx.x + blockDim.x*blockIdx.x; + + if (ch < nchannels) { + + int const inputCh = ch >= offsetForInput + ? ch - offsetForInput + : ch; + + uint32_t const * didCh = ch >= offsetForInput + ? did_ee + : did_eb; + + // only two values, EB or EE + // AM : FIXME : why not using "isBarrel" ? isBarrel ? adc2gev[0] : adc2gev[1] + float adc2gev_to_use = ch >= offsetForInput + ? adc2gev[1] // ee + : adc2gev[0]; // eb + + + // first EB and then EE + + ::ecal::reco::StorageScalarType const* amplitude = ch >= offsetForInput + ? amplitude_ee + : amplitude_eb; + + ::ecal::reco::StorageScalarType const* time_in = ch >= offsetForInput + ? time_ee + : time_eb; + + ::ecal::reco::StorageScalarType const* chi2_in = ch >= offsetForInput + ? chi2_ee + : chi2_eb; + + uint32_t const* flags_in = ch >= offsetForInput + ? flags_ee + : flags_eb; + + // simple copy + did[ch] = didCh[inputCh]; + + auto const did_to_use = DetId{didCh[inputCh]}; + + auto const isBarrel = did_to_use.subdetId() == EcalBarrel; + auto const hashedId = isBarrel + ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId()); + + float const intercalib_to_use = intercalib[hashedId]; + + + // get laser coefficient + float lasercalib = 1.; + + // + // AM: ideas + // + // One possibility is to create the map of laser corrections once on CPU + // for all crystals and push them on GPU. 
+ // Then only if the LS is different, update the laser correction + // The variation within a LS is not worth pursuing (<< 0.1% !!) + // and below the precision we can claim on the laser corrections (right?). + // This will save quite some time (also for the CPU version?) + // + + int iLM = 1; + + if (isBarrel) { + iLM = ecal::reconstruction::laser_monitoring_region_EB (did_to_use.rawId()); + } + else { + iLM = ecal::reconstruction::laser_monitoring_region_EE (did_to_use.rawId()); + } + + + long long t_i = 0, t_f = 0; + float p_i = 0, p_f = 0; + long long lt_i = 0, lt_f = 0; + float lp_i = 0, lp_f = 0; + + // laser + if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) { + t_i = t1[iLM - 1]; + t_f = t2[iLM - 1]; + p_i = p1[hashedId]; + p_f = p2[hashedId]; + } else if (event_time >= t2[iLM - 1] && event_time <= t3[iLM - 1]) { + t_i = t2[iLM - 1]; + t_f = t3[iLM - 1]; + p_i = p2[hashedId]; + p_f = p3[hashedId]; + } else if (event_time < t1[iLM - 1]) { + t_i = t1[iLM - 1]; + t_f = t2[iLM - 1]; + p_i = p1[hashedId]; + p_f = p2[hashedId]; + + } else if (event_time > t3[iLM - 1]) { + t_i = t2[iLM - 1]; + t_f = t3[iLM - 1]; + p_i = p2[hashedId]; + p_f = p3[hashedId]; + } + + + // linear corrections + if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) { + lt_i = lt1[iLM - 1]; + lt_f = lt2[iLM - 1]; + lp_i = lp1[hashedId]; + lp_f = lp2[hashedId]; + } else if (event_time >= lt2[iLM - 1] && event_time <= lt3[iLM - 1]) { + lt_i = lt2[iLM - 1]; + lt_f = lt3[iLM - 1]; + lp_i = lp2[hashedId]; + lp_f = lp3[hashedId]; + } else if (event_time < lt1[iLM - 1]) { + lt_i = lt1[iLM - 1]; + lt_f = lt2[iLM - 1]; + lp_i = lp1[hashedId]; + lp_f = lp2[hashedId]; + + } else if (event_time > lt3[iLM - 1]) { + lt_i = lt2[iLM - 1]; + lt_f = lt3[iLM - 1]; + lp_i = lp2[hashedId]; + lp_f = lp3[hashedId]; + } + + + // apdpnref and alpha + float apdpnref = apdpnrefs[hashedId]; + float alpha = alphas[hashedId]; + + // now calculate transparency correction + if (apdpnref != 0 && (t_i - 
t_f) != 0 && (lt_i - lt_f) != 0) { + long long tt = event_time; // never subtract two unsigned! + float interpolatedLaserResponse = p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i)); + + float interpolatedLinearResponse = lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i)); // FIXED BY FC + + if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) { + interpolatedLinearResponse = 1.f; + } + if (interpolatedLaserResponse <= 0.) { + // AM : how the heck is it possible? + // interpolatedLaserResponse = 0.0001; + lasercalib = 1.; + + } + else { + + float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse; + + // ... and now this: + lasercalib = 1.f / ( std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse); + + } + } + + // + // Check for channels to be excluded from reconstruction + // + // + // Default energy? Not to be updated if "ChannelStatusToBeExcluded" + // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat" + // + energy[ch] = -1; //---- AM: default, un-physical, ok + + // + static const int chStatusMask = 0x1F; + // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same + int dbstatus = EcalChannelStatusCode_Code( (status[hashedId]) & chStatusMask ); + if (ChannelStatusToBeExcludedSize != 0) { + for (int ich_to_check = 0; ich_to_check recHit flagbits and return the apporpriate flagbit word + + // + // AM: get the smaller "flagbit_counter" with match + // + + uint32_t temporary_flagBits = 0; + + int iterator_flags = 0; + bool need_to_exit = false; + int flagbit_counter = 0; + while (!need_to_exit) { + iterator_flags = 0; + for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) { + // check the correct "flagbit" + if (expanded_flagbit_v_DB_reco_flags[i] == flagbit_counter) { + + for (unsigned int j = 0; j < expanded_Sizes_v_DB_reco_flags[i]; j++) { + + if ( 
expanded_v_DB_reco_flags[iterator_flags] == dbstatus ) { + temporary_flagBits = 0x1 << expanded_flagbit_v_DB_reco_flags[i]; + need_to_exit = true; + break; // also from the big loop!!! + + } + iterator_flags++; + } + } + else { + // if not, got to the next bunch directly + iterator_flags += expanded_Sizes_v_DB_reco_flags[i]; + } + + if (need_to_exit) { + break; + } + + } + flagbit_counter+=1; + } + + + if ( (flagmask & temporary_flagBits) && killDeadChannels ) { + return; + } + + + // + flagBits[ch] = temporary_flagBits; + + // + // multiply the adc counts with factors to get GeV + // + + // energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ; + energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib; + + // Time is not saved so far, FIXME + // time[ch] = time_in[inputCh]; + + + if (chi2_in[inputCh] > 64) chi2[ch] = 64; + else chi2[ch] = chi2_in[inputCh]; + + + // FIXME: calculate the "flagBits extra" --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ... + extra[ch] = 0; + + // + // extra packing ... + // + + uint32_t offset; + uint32_t width; + uint32_t value; + + float chi2_temp = chi2[ch]; + if (chi2_temp > 64) chi2_temp = 64; + // use 7 bits + uint32_t rawChi2 = lround(chi2_temp / 64. * ((1<<7)-1)); + + offset = 0; + width = 7; + value = 0; + + uint32_t mask = ((1 << width) - 1) << offset; + value &= ~mask; + value |= (rawChi2 & ((1U << width) - 1)) << offset; + + // extra[ch] = value; + // + + // rawEnergy is actually "error" !!! + uint32_t rawEnergy = 0; + + + // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA + // if you want to store this in "extra", we need first to add it to the uncalibrecHit results + // then it will be something like the following + // amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib + // + // + + float amplitudeError_ch = 0. 
; // amplitudeError[ch]; + + if (amplitudeError_ch > 0.001) { + // uint16_t exponent = getPower10(amplitudeError_ch); + + static constexpr float p10[] = {1.e-2f,1.e-1f,1.f,1.e1f,1.e2f,1.e3f,1.e4f,1.e5f,1.e6f}; + int b = amplitudeError_ch + // + // bool EcalUncalibratedRecHit::isSaturated() const { + // return EcalUncalibratedRecHit::checkFlag(kSaturated); + // } + // + // + + if ( flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kSaturated) ) ) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated)); + good = false; + } + + if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kOutOfTime) ) ) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime)); + good = false; + } + if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kPoorReco) ) ) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco)); + good = false; + } + if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain6) ) ) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6)); + } + if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain1) ) ) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1)); + } + + + if (good) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kGood)); + } + + if (isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); + + } + if (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX)) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); + } + + + + // recover, killing, and other stuff + + // + // Structure: + // EB + // EE + // + // + // - single MVA + // - democratic sharing + // - kill all the other cases + // + + bool is_Single = false; + bool is_FE = false; + bool is_VFE = false; + + bool is_recoverable = false; // DetIdToBeRecovered + + if ( dbstatus == 10 || dbstatus == 11 || dbstatus == 12 ) { + is_recoverable = true; + } + + + if (is_recoverable) { + if (dbstatus == 
EcalChannelStatusCode_Code::kDeadVFE) {
+          is_VFE = true;
+        }
+        else if (dbstatus == EcalChannelStatusCode_Code::kDeadFE) {  // was a second kDeadVFE test, which made this branch unreachable
+          is_FE = true;
+        }
+        else {
+          is_Single = true;
+        }
+        // NOTE(review): the is_recoverable test just before this only admits status codes 10, 11 and 12,
+        // so kDeadFE (13) still cannot reach this point - confirm the intended set of recoverable codes.
+        // EB
+        if (isBarrel) {
+          if (is_Single || is_FE || is_VFE) {
+            // single MVA
+            if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels) ) {
+
+
+            }
+            // democratic sharing
+            else if (is_FE && (recoverEBFE || !killDeadChannels) ) {
+
+
+            }
+            // kill all the other cases
+            else {
+              energy[ch] = 0.;  // Need to set also the flags ...
+            }
+          }
+        }
+        // EE
+        else {
+          if (is_Single || is_FE || is_VFE) {
+            // single MVA (endcap: steered by the EE switch, the EB one was used here before)
+            if (is_Single && (recoverEEIsolatedChannels || !killDeadChannels) ) {
+
+
+            }
+            // democratic sharing (endcap: steered by the EE switch, the EB one was used here before)
+            else if (is_FE && (recoverEEFE || !killDeadChannels) ) {
+
+              //
+              // Code is definitely too long ...
+              //
+
+            }
+            // kill all the other cases
+            else {
+              energy[ch] = 0.;  // Need to set also the flags ...
+            }
+          }
+        }
+
+      }
+
+
+    } // end channel
+
+  }
+
+    // Launches kernel_create_ecal_rehit with one thread per channel, forwarding the configuration,
+    // the conditions pointers and the EB/EE input and output buffers. The launch is queued on
+    // cudaStream and is asynchronous: this function does not wait for the kernel to finish.
+    // host version, to be called by the plugin
+    void create_ecal_rehit(
+      EventInputDataGPU const& eventInputGPU,
+      EventOutputDataGPU& eventOutputGPU,
+      // eventDataForScratchGPU_,
+      ConditionsProducts const& conditions,
+      ConfigurationParameters const& configParameters,
+      uint32_t const offsetForInput,
+      edm::TimeValue_t const event_time,
+      cudaStream_t cudaStream
+    ){
+
+//      int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size ;  //---- AM FIXME Once the PR by Viktor is integrated. The following is bad!
+      int nchannels = 100 ;
+      // NOTE(review): placeholder channel count, see the FIXME above; only the first 100 channels are processed
+      unsigned int nchannels_per_block = 32;
+      unsigned int threads_1d = nchannels_per_block;
+      unsigned int blocks_1d = (nchannels + threads_1d) / threads_1d; // TEST : to be optimized (AM)
+      // blocks_1d always covers nchannels and is never zero; a surplus block is harmless because the
+      // kernel only works on threads with ch < nchannels
+      //
+      // kernel create rechit, queued on the caller's stream (it used to go to the default stream)
+      //
+      kernel_create_ecal_rehit <<< blocks_1d, threads_1d, 0, cudaStream >>> (
+        // configuration
+        configParameters.ChannelStatusToBeExcluded,
+        configParameters.ChannelStatusToBeExcludedSize,
+        configParameters.killDeadChannels,
+        configParameters.recoverEBIsolatedChannels,
+        configParameters.recoverEEIsolatedChannels,
+        configParameters.recoverEBVFE,
+        configParameters.recoverEEVFE,
+        configParameters.recoverEBFE,
+        configParameters.recoverEEFE,
+        configParameters.EBLaserMIN,
+        configParameters.EELaserMIN,
+        configParameters.EBLaserMAX,
+        configParameters.EELaserMAX,
+        // for flags setting
+        configParameters.expanded_v_DB_reco_flags,
+        configParameters.expanded_Sizes_v_DB_reco_flags,
+        configParameters.expanded_flagbit_v_DB_reco_flags,
+        configParameters.expanded_v_DB_reco_flagsSize,
+        configParameters.flagmask,
+        // conditions
+        conditions.ADCToGeV.adc2gev,
+        conditions.Intercalib.values,
+        conditions.ChannelStatus.status,
+        conditions.LaserAPDPNRatiosRef.values,
+        conditions.LaserAlphas.values,
+        // input for transparency corrections
+        conditions.LaserAPDPNRatios.p1,
+        conditions.LaserAPDPNRatios.p2,
+        conditions.LaserAPDPNRatios.p3,
+        conditions.LaserAPDPNRatios.t1,
+        conditions.LaserAPDPNRatios.t2,
+        conditions.LaserAPDPNRatios.t3,
+        // input for linear corrections
+        conditions.LinearCorrections.p1,
+        conditions.LinearCorrections.p2,
+        conditions.LinearCorrections.p3,
+        conditions.LinearCorrections.t1,
+        conditions.LinearCorrections.t2,
+        conditions.LinearCorrections.t3,
+        // time, used for time dependent corrections
+        event_time,
+        // input
+        eventInputGPU.ebUncalibRecHits.did,
+        eventInputGPU.eeUncalibRecHits.did,
+        eventInputGPU.ebUncalibRecHits.amplitude,
+        eventInputGPU.eeUncalibRecHits.amplitude,
+        eventInputGPU.ebUncalibRecHits.jitter,
+        eventInputGPU.eeUncalibRecHits.jitter,
+        eventInputGPU.ebUncalibRecHits.chi2,
+        eventInputGPU.eeUncalibRecHits.chi2,
+        eventInputGPU.ebUncalibRecHits.flags,
+        eventInputGPU.eeUncalibRecHits.flags,
+        // output
+        eventOutputGPU.did,
+        eventOutputGPU.energy,
+        eventOutputGPU.time,
+        eventOutputGPU.chi2,
+        eventOutputGPU.flagBits,
+        eventOutputGPU.extra,
+        // other
+        nchannels,
+        offsetForInput,
+        conditions.offsetForHashes
+      );
+
+
+
+    }
+
+
+  }
+
+}
diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
new file mode 100644
index 0000000000000..587abe0575883
--- /dev/null
+++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h
@@ -0,0 +1,101 @@
+//
+// Builder of ECAL RecHits on GPU
+//
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h"
+
+#include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+
+#include "DataFormats/Provenance/interface/Timestamp.h"
+
+
+namespace ecal {
+  namespace rechit {
+
+
+    __global__
+    void kernel_create_ecal_rehit(
+      // configuration
+      int const* ChannelStatusToBeExcluded,
+      uint32_t ChannelStatusToBeExcludedSize,
+      bool killDeadChannels,
+      bool const recoverEBIsolatedChannels,
+      bool const recoverEEIsolatedChannels,
+      bool const recoverEBVFE,
+      bool const recoverEEVFE,
+      bool const recoverEBFE,
+      bool const recoverEEFE,
+      float const EBLaserMIN,   // the four laser limits are part of the kernel definition in the .cu file;
+      float const EELaserMIN,   // without them this declaration named a different, undefined overload
+      float const EBLaserMAX,
+      float const EELaserMAX,
+      // for flags setting
+      int const* expanded_v_DB_reco_flags,
+      uint32_t const* expanded_Sizes_v_DB_reco_flags,
+      uint32_t const* expanded_flagbit_v_DB_reco_flags,
+      uint32_t expanded_v_DB_reco_flagsSize,
+      uint32_t flagmask,
+      // conditions
+      float const* adc2gev,
+      float const* intercalib,
+      uint16_t const* status,
+      float const* apdpnrefs,
+      float const* alphas,
+      // input for transparency corrections
+      float const* p1,
+      float const* p2,
+      float const* p3,
+      edm::TimeValue_t const* t1,
+
edm::TimeValue_t const* t2, + edm::TimeValue_t const* t3, + // input for linear corrections + float const* lp1, + float const* lp2, + float const* lp3, + edm::TimeValue_t const* lt1, + edm::TimeValue_t const* lt2, + edm::TimeValue_t const* lt3, + // time, used for time dependent corrections + edm::TimeValue_t const event_time, + // input + uint32_t const* did_eb, + uint32_t const* did_ee, + ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts + ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts + ::ecal::reco::StorageScalarType const* time_eb, + ::ecal::reco::StorageScalarType const* time_ee, + ::ecal::reco::StorageScalarType const* chi2_eb, + ::ecal::reco::StorageScalarType const* chi2_ee, + uint32_t const* flags_eb, + uint32_t const* flags_ee, + // output + uint32_t *did, + ::ecal::reco::StorageScalarType* energy, // in energy [GeV] + ::ecal::reco::StorageScalarType* time, + ::ecal::reco::StorageScalarType* chi2, + uint32_t* flagBits, + uint32_t* extra, + int const nchannels, + uint32_t const offsetForInput, + uint32_t const offsetForHashes + ); + + + // host version, to be called by the plugin + + void create_ecal_rehit( + EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + // eventDataForScratchGPU_, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + uint32_t const offsetForInput, + edm::TimeValue_t const event_time, + cudaStream_t cudaStream + ); + + } + +} + diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu index b85f002464f65..b6aee22e7da6f 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu @@ -4,7 +4,7 @@ #include "DataFormats/EcalDetId/interface/EEDetId.h" namespace ecal { - namespace multifit { + namespace reconstruction { namespace internal { @@ -16,6 +16,137 @@ namespace ecal { __device__ __forceinline__ uint32_t iphi(uint32_t 
id) { return id & 0x1FF; }
+
+      // Supermodule number -> DCC number: numbers above 18 give 9 + (ism - 18), the others 27 + ism.
+      __device__ int dccFromSm(int ism) {
+        int iz = 1;
+        if (ism > 18)
+          iz = -1;
+        if (iz == -1)
+          ism -= 18;
+        int idcc = 9 + ism;
+        if (iz == +1)
+          idcc += 18;
+        return idcc;
+      }
+      // Supermodule number from (ieta, iphi): one per 20 units of iphi, plus 18 when ieta is negative.
+      __device__ int sm(int ieta, int iphi) {
+        int iz = 1;
+        if (ieta < 0)
+          iz = -1;
+        ieta *= iz;
+        int iphi_ = iphi;
+        if (iphi_ > 360)
+          iphi_ -= 360;
+        int ism = (iphi_ - 1) / 20 + 1;
+        if (iz == -1)
+          ism += 18;
+        return ism;
+      }
+
+      // DCC number of the supermodule that contains (ieta, iphi).
+      __device__ int dcc(int ieta, int iphi) {
+        int ism = sm(ieta, iphi);
+        return dccFromSm(ism);
+      }
+
+
+
+
+      //
+      // Table lookup on a 17 x 4 grid: iX selects the column (0..16), iY the row (0..3, row 0 is the
+      // last table line). Returns the table entry, or -1 when the computed index is outside the table.
+      //
+
+
+      __device__ int lm_channel (int iX, int iY) {
+
+        static const int idx_[] = {
+          // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+          1, 2, 2, 2, 2, 4, 4, 4, 4,
+          6, 6, 6, 6, 8, 8, 8, 8,  // 3
+          1, 2, 2, 2, 2, 4, 4, 4, 4,
+          6, 6, 6, 6, 8, 8, 8, 8,  // 2
+          1, 3, 3, 3, 3, 5, 5, 5, 5,
+          7, 7, 7, 7, 9, 9, 9, 9,  // 1
+          1, 3, 3, 3, 3, 5, 5, 5, 5,
+          7, 7, 7, 7, 9, 9, 9, 9   // 0
+          // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
+        };
+
+        int iym, ixm, il, ic, ii;
+        iym = 4;
+        ixm = 17;
+        int iX_ = iX + 1;
+        int iY_ = iY + 1;
+        il = iym - iY_;
+        ic = iX_ - 1;
+        ii = il * ixm + ic;
+        if (ii < 0 || ii >= (int)(sizeof(idx_) / sizeof(int))) {  // ">=": the last valid index is size - 1
+          return -1;
+        };
+        return idx_[ii];
+
+      }
+
+
+      // Local x coordinate inside a supermodule: |ieta| - 1 (iphi is not used).
+      __device__ int localCoord_x (int ieta, int iphi) {
+        int iz = 1;
+        if (ieta < 0) {
+          iz = -1;
+        }
+        ieta *= iz;
+        //       int iphi_ = iphi;
+        //       if (iphi_ > 360) {
+        //         iphi_ -= 360;
+        //       }
+        int ix = ieta - 1;
+        //       int iy = (iphi_ - 1) % 20;
+        //       if (iz == -1) {
+        //         iy = 19 - iy;
+        //       }
+
+        return ix;
+      }
+
+      // Local y coordinate inside a supermodule: (iphi - 1) % 20, mirrored to 19 - y when ieta is negative.
+      __device__ int localCoord_y (int ieta, int iphi) {
+        int iz = 1;
+        if (ieta < 0) {
+          iz = -1;
+        }
+        //       ieta *= iz;
+        int iphi_ = iphi;
+        if (iphi_ > 360) {
+          iphi_ -= 360;
+        }
+        //       int ix = ieta - 1;
+        int iy = (iphi_ - 1) % 20;
+        if (iz == -1) {
+          iy = 19 - iy;
+        }
+
+        return iy;
+      }
+
+
+      __device__ int lmmod (int ieta, int iphi) {
+
+        int ix = localCoord_x(ieta, iphi);
+        int
iy = localCoord_y(ieta, iphi);
+
+        return lm_channel(ix / 5, iy / 5);
+      }
+
+
+      // 1 when the lm_channel value for (ieta, iphi) is even, 0 otherwise.
+      __device__ int side (int ieta, int iphi) {
+        int ilmmod = lmmod(ieta, iphi);
+        return (ilmmod % 2 == 0) ? 1 : 0;
+      }
+
+
+
     }  // namespace barrel
   }  // namespace internal
@@ -25,6 +156,41 @@ namespace ecal {
       return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1;
     }
+
+
+    //
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc
+    // function: "lmr"
+
+    __device__
+    int laser_monitoring_region_EB(uint32_t id) {
+      using namespace internal::barrel;
+
+      int ieta;
+      if (positiveZ(id)) {
+        ieta = ietaAbs(id);
+      }
+      else {
+        ieta = - ietaAbs(id);
+      }
+
+      int idcc = dcc(ieta, (int) (iphi(id)) );
+      int ism = idcc - 9;
+
+      int iside = side(ieta, (int) (iphi(id)) );
+      //     int iside = positiveZ(id) ? 1 : 0;
+
+      return ( 1 + 2 * (ism - 1) + iside );
+      //     return ieta;
+      //     return (int) (iphi(id));
+      //     return idcc;
+      //     return iside;
+
+    }
+
+
+
     namespace internal {
       namespace endcap {
@@ -60,6 +226,96 @@ namespace ecal {
           6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, 7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314};
+        // Quadrant of a supercrystal: 1 = (X >= 11, Y >= 11), 2 = (X < 11, Y >= 11), 3 = (X < 11, Y < 11), 4 = (X >= 11, Y < 11).
+        __device__ int quadrant(int iX, int iY) {
+          bool near = iX >= 11;
+          bool far = !near;
+          bool top = iY >= 11;
+          bool bot = !top;
+
+          int iquad = 0;
+          if (near && top)
+            iquad = 1;
+          if (far && top)
+            iquad = 2;
+          if (far && bot)
+            iquad = 3;
+          if (near && bot)
+            iquad = 4;
+
+          return iquad;
+        }
+        // Table lookup on a 20 x 20 grid (iX, iY in 1..20): returns the entry, or -1 for an entry of 0 or an index outside the table.
+        __device__ int sector(int iX, int iY) {
+          //  Y (towards the surface)
+          //  T
+          //  |
+          //  |
+          //  |
+          //  o---------| X  (towards center of LHC)
+          //
+          static const int idx_[] = {
+            // 1  2  3  4  5  6  7  8  9 10    11 12 13 14 15 16 17 18 19 20
+            0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9,
+            9, 9, 0, 0, 0, 0, 0, 0, 0,  // 20
+            0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9,
+            9, 9, 9, 9, 9, 0, 0, 0, 0,  // 19
+            0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9,
+            9, 9, 9, 9, 9, 8, 0, 0, 0,  // 18
+            0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9,
+            9, 9, 9, 9, 8, 8, 8, 0, 0,  // 17
+            0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9,
+            9, 9, 9, 9, 8, 8, 8, 8, 0,  // 16
+            0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9,
+            9, 9, 9, 8, 8, 8, 8, 8, 0,  // 15
+            0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9,
+            9, 9, 8, 8, 8, 8, 8, 8, 0,  // 14
+            2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9,
+            9, 8, 8, 8, 8, 8, 8, 8, 8,  // 13
+            3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0,
+            8, 8, 8, 8, 8, 8, 8, 7, 7,  // 12
+            3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0,
+            0, 8, 7, 7, 7, 7, 7, 7, 7,  // 11
+            3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0,
+            0, 7, 7, 7, 7, 7, 7, 7, 7,  // 10
+            3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0,
+            6, 6, 7, 7, 7, 7, 7, 7, 7,  // 9
+            3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5,
+            6, 6, 6, 7, 7, 7, 7, 7, 7,  // 8
+            0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5,
+            6, 6, 6, 6, 6, 7, 7, 7, 0,  // 7
+            0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+            5, 6, 6, 6, 6, 6, 6, 7, 0,  // 6
+            0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+            5, 6, 6, 6, 6, 6, 6, 6, 0,  // 5
+            0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+            5, 6, 6, 6, 6, 6, 6, 0, 0,  // 4
+            0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5,
+            5, 5, 6, 6, 6, 6, 0, 0, 0,  // 3
+            0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5,
+            5, 5, 6, 6, 6, 0, 0, 0, 0,  // 2
+            0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5,
+            5, 5, 0, 0, 0, 0, 0, 0, 0   // 1
+            // 1  2  3  4  5  6  7  8  9 10    11 12 13 14 15 16 17 18 19 20
+          };
+
+          int iym, ixm, il, ic, ii;
+          iym = 20;
+          ixm = 20;
+          int iX_ = iX;
+          int iY_ = iY;
+          il = iym - iY_;
+          ic = iX_ - 1;
+          ii = il * ixm + ic;
+          // ">=": with ">" an index equal to the table size passed the range test and idx_[ii] was read one past the end
+          if (ii < 0 || ii >= (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) {
+            return -1;
+          };
+          return idx_[ii];
+        }
+
+
       }  // namespace endcap
     }  // namespace internal
@@ -72,5 +328,53 @@ namespace ecal {
       return ((positiveZ(id) ?
EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]);
     }
-  }  // namespace multifit
+
+
+
+    //
+    // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc
+    // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc
+    //
+
+    __device__
+    int laser_monitoring_region_EE(uint32_t id) {
+      using namespace internal::endcap;
+
+      // supercrystal coordinates: crystal indices 1..5 give 1, 6..10 give 2, and so on
+      uint32_t superX = (ix(id) - 1) / 5 + 1;
+      uint32_t superY = (iy(id) - 1) / 5 + 1;
+
+      // Correct convention
+      //   * @param iz iz/zside index: -1 for EE-, +1 for EE+
+      //   https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71
+      //   zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63
+      //
+      int zside = positiveZ(id) ? 1 : -1;
+
+      int quadrantId = quadrant(superX, superY);
+      int sectorId = sector(superX, superY);
+      if (sectorId < 0)
+        return -1;
+
+      // start from the sector number shifted by 6; non-positive results wrap up by 9
+      int region = sectorId - 6;
+      if (region <= 0)
+        region += 9;
+      if (region == 9)
+        region++;
+      if (region == 8 && quadrantId == 4)
+        region++;
+      // the +1 side is offset by 72, the other side by 82
+      region += (zside == +1) ? 72 : 82;
+
+      return region;
+
+    }
+
+
+
+
+
+
+  }  // namespace reconstruction
 }  // namespace ecal
diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
index b148ab91915d1..3a8125bbe8fb1 100644
--- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
+++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h
@@ -2,13 +2,20 @@
 #define RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h
 
 namespace ecal {
-  namespace multifit {
-
-    __device__ uint32_t hashedIndexEB(uint32_t id);
+  namespace reconstruction {
+
+    __device__ uint32_t hashedIndexEB(uint32_t id);
+
+    __device__ uint32_t hashedIndexEE(uint32_t id);
+
+
+    __device__ int laser_monitoring_region_EB(uint32_t id);
+
+    __device__ int laser_monitoring_region_EE(uint32_t id);
+
+  }  // namespace reconstruction
+}  // namespace ecal
 
-    __device__ uint32_t hashedIndexEE(uint32_t id);
+#endif  //
RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h - } // namespace multifit -} // namespace ecal -#endif // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h diff --git a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu index 3726ea43d95db..ce4426df03227 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/TimeComputationKernels.cu @@ -852,7 +852,7 @@ namespace ecal { auto const did = DetId{dids[ch]}; auto const isBarrel = did.subdetId() == EcalBarrel; auto const sample_mask = did.subdetId() == EcalBarrel ? sample_maskEB : sample_maskEE; - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); // set pedestal // TODO this branch is non-divergent for a group of 10 threads @@ -1022,7 +1022,7 @@ namespace ecal { auto const did = DetId{dids[gtx]}; auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel ? hashedIndexEB(did.rawId()) : offsetForHashes + hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); auto const* amplitudeBins = isBarrel ? amplitudeBinsEB : amplitudeBinsEE; auto const* shiftBins = isBarrel ? shiftBinsEB : shiftBinsEE; auto const amplitudeBinsSize = isBarrel ? 
amplitudeBinsSizeEB : amplitudeBinsSizeEE;
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
new file mode 100644
index 0000000000000..7216c6edb7e73
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc
@@ -0,0 +1,190 @@
+#include <iostream>
+// NOTE(review): every <...> argument list in this file was missing from the extracted patch text and has been reconstructed - confirm against the upstream commit
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h"
+//#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+// algorithm specific
+
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+// Stream module that copies the EB/EE rechit arrays from the device into host buffers (acquire) and puts those buffers into the event (produce).
+class EcalCPURecHitProducer
+: public edm::stream::EDProducer<edm::ExternalWork>
+{
+public:
+    explicit EcalCPURecHitProducer(edm::ParameterSet const& ps);
+    ~EcalCPURecHitProducer() override;
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+    void acquire(edm::Event const&,
+                 edm::EventSetup const&,
+                 edm::WaitingTaskWithArenaHolder) override;
+    void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+    edm::EDGetTokenT<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>> recHitsInEBToken_, recHitsInEEToken_;
+    edm::EDPutTokenT<ecal::RecHit<ecal::Tag::soa>> recHitsOutEBToken_, recHitsOutEEToken_;
+    // host-side buffers: resized and filled in acquire(), moved into the event in produce()
+    ecal::RecHit<ecal::Tag::soa> recHitsEB_, recHitsEE_;
+    bool containsTimingInformation_;
+};
+// Declares the configurable parameters with their defaults and registers them under the label "ecalCPURecHitProducer".
+void EcalCPURecHitProducer::fillDescriptions(
+    edm::ConfigurationDescriptions& confDesc) {
+    edm::ParameterSetDescription desc;
+
+    desc.add<edm::InputTag>("recHitsInLabelEB", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEB"});
+    desc.add<edm::InputTag>("recHitsInLabelEE", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEE"});
+    desc.add<std::string>("recHitsOutLabelEB", "EcalRecHitsEB");
+    desc.add<std::string>("recHitsOutLabelEE", "EcalRecHitsEE");
+    desc.add<bool>("containsTimingInformation", false);
+
+    std::string label = "ecalCPURecHitProducer";
+    confDesc.add(label, desc);
+}
+// Registers the consumed device products and the produced host products using the labels read from the configuration.
+EcalCPURecHitProducer::EcalCPURecHitProducer(
+    const edm::ParameterSet& ps)
+    : recHitsInEBToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("recHitsInLabelEB"))}
+    , recHitsInEEToken_{consumes<cms::cuda::Product<ecal::RecHit<ecal::Tag::ptr>>>(ps.getParameter<edm::InputTag>("recHitsInLabelEE"))}
+    , recHitsOutEBToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEB"))}
+    , recHitsOutEEToken_{produces<ecal::RecHit<ecal::Tag::soa>>(ps.getParameter<std::string>("recHitsOutLabelEE"))}
+    , containsTimingInformation_{ps.getParameter<bool>("containsTimingInformation")}
+{}
+
+EcalCPURecHitProducer::~EcalCPURecHitProducer() {}
+// Resizes the host buffers to the sizes of the device products and enqueues asynchronous device-to-host copies of did, energy, chi2, extra and flagBits on ctx.stream().
+void EcalCPURecHitProducer::acquire(
+    edm::Event const& event,
+    edm::EventSetup const& setup,
+    edm::WaitingTaskWithArenaHolder taskHolder)
+{
+    // retrieve data/ctx
+    auto const& ebRecHitsProduct = event.get(recHitsInEBToken_);
+    auto const& eeRecHitsProduct = event.get(recHitsInEEToken_);
+    cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)};
+    auto const& ebRecHits = ctx.get(ebRecHitsProduct);
+    auto const& eeRecHits = ctx.get(eeRecHitsProduct);
+
+    // resize the output buffers
+    recHitsEB_.resize(ebRecHits.size);
+    recHitsEE_.resize(eeRecHits.size);
+
+    // std::cout << " [EcalCPURecHitProducer::acquire] ebRecHits.size = " << ebRecHits.size << std::endl;
+    // std::cout << " [EcalCPURecHitProducer::acquire] eeRecHits.size = " << eeRecHits.size << std::endl;
+
+
+    // AM: FIXME : why all "uint32_t" and not "float" where needed?
+
+
+    // enqueue transfers
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(),
+                               ebRecHits.did,
+                               recHitsEB_.did.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(),
+                               eeRecHits.did,
+                               recHitsEE_.did.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    //
+    // ./CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h:using StorageScalarType = float;
+    //
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.energy.data(),
+                               ebRecHits.energy,
+                               recHitsEB_.energy.size() * sizeof(::ecal::reco::StorageScalarType), // AM: FIX
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.energy.data(),
+                               eeRecHits.energy,
+                               recHitsEE_.energy.size() * sizeof(::ecal::reco::StorageScalarType), // AM: FIX
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(),
+                               ebRecHits.chi2,
+                               recHitsEB_.chi2.size() * sizeof(::ecal::reco::StorageScalarType), // AM: FIX
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.chi2.data(),
+                               eeRecHits.chi2,
+                               recHitsEE_.chi2.size() * sizeof(::ecal::reco::StorageScalarType), // AM: FIX
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.extra.data(),
+                               ebRecHits.extra,
+                               recHitsEB_.extra.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.extra.data(),
+                               eeRecHits.extra,
+                               recHitsEE_.extra.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+
+    cudaCheck( cudaMemcpyAsync(recHitsEB_.flagBits.data(),
+                               ebRecHits.flagBits,
+                               recHitsEB_.flagBits.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    cudaCheck( cudaMemcpyAsync(recHitsEE_.flagBits.data(),
+                               eeRecHits.flagBits,
+                               recHitsEE_.flagBits.size() * sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost,
+                               ctx.stream()) );
+    // NOTE(review): no transfer above copies a time array and containsTimingInformation_ is never read in this file - confirm that timing is intentionally left out
+
+
+
+    // for (unsigned int ieb = 0; ieb < ebRecHits.size ; ieb++) {
+    //   if (recHitsEB_.extra[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb extra = " << recHitsEB_.extra[ieb] << std::endl;
+    // }
+
+    //
+    // for (unsigned int ieb = 0; ieb < ebRecHits.size ; ieb++) {
+    //   if (recHitsEB_.energy[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb energy = " << recHitsEB_.energy[ieb] << std::endl;
+    // }
+    //
+    // for (unsigned int iee = 0; iee < eeRecHits.size ; iee++) {
+    //   if (recHitsEE_.energy[iee] != 0 ) std::cout << " [ " << iee << " :: " << eeRecHits.size << " ] [ " << recHitsEE_.did[iee] << " ] ee energy = " << recHitsEE_.energy[iee] << std::endl;
+    // }
+    //
+
+
+
+
+}
+// Moves the host buffers filled in acquire() into the event under the configured output labels.
+void EcalCPURecHitProducer::produce(
+    edm::Event& event,
+    edm::EventSetup const& setup)
+{
+    // tmp vectors
+    auto recHitsOutEB = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEB_));
+    auto recHitsOutEE = std::make_unique<ecal::RecHit<ecal::Tag::soa>>(std::move(recHitsEE_));
+
+    // put into event
+    event.put(recHitsOutEBToken_, std::move(recHitsOutEB));
+    event.put(recHitsOutEEToken_, std::move(recHitsOutEE));
+}
+
+DEFINE_FWK_MODULE(EcalCPURecHitProducer);
+
+
+
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
new file mode 100644
index 0000000000000..54d772efa806b
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc
@@ -0,0 +1,137 @@
+// framework
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+
+// algorithm specific
+#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
+#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHit.h"
+#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h"
+#include "RecoLocalCalo/EcalRecAlgos/interface/Common.h"
+// NOTE(review): every non-empty <...> argument list in this file was missing from the extracted patch text and has been reconstructed - confirm against the upstream commit
+#include <iostream>
+// Stream module that converts the host-side SoA rechits into one EcalRecHit collection for EB and one for EE.
+class EcalRecHitConvertGPU2CPUFormat
+: public edm::stream::EDProducer<>
+{
+public:
+    explicit EcalRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps);
+    ~EcalRecHitConvertGPU2CPUFormat() override;
+    static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+    using GPURecHitType = ecal::RecHit<ecal::Tag::soa>;
+    void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+    const edm::EDGetTokenT<GPURecHitType> recHitsGPUEB_;
+    const edm::EDGetTokenT<GPURecHitType> recHitsGPUEE_;
+
+    const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_;
+};
+// Declares the input tags and output instance labels with their defaults and registers them under the label "ecalRecHitConvertGPU2CPUFormat".
+void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(
+    edm::ConfigurationDescriptions& confDesc) {
+    edm::ParameterSetDescription desc;
+
+    desc.add<edm::InputTag>("recHitsLabelGPUEB", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEB"));
+    desc.add<edm::InputTag>("recHitsLabelGPUEE", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEE"));
+
+    desc.add<std::string>("recHitsLabelCPUEB", "EcalRecHitsEB");
+    desc.add<std::string>("recHitsLabelCPUEE", "EcalRecHitsEE");
+
+    std::string label = "ecalRecHitConvertGPU2CPUFormat";
+    confDesc.add(label, desc);
+}
+// Registers the two consumed SoA products and the two produced collections using the labels read from the configuration.
+EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps)
+    : recHitsGPUEB_{consumes<GPURecHitType>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEB"))}
+    , recHitsGPUEE_{consumes<GPURecHitType>(ps.getParameter<edm::InputTag>("recHitsLabelGPUEE"))}
+    , recHitsLabelCPUEB_{ps.getParameter<std::string>("recHitsLabelCPUEB")}
+    , recHitsLabelCPUEE_{ps.getParameter<std::string>("recHitsLabelCPUEE")}
+{
+    produces<EBRecHitCollection>(recHitsLabelCPUEB_);
+    produces<EERecHitCollection>(recHitsLabelCPUEE_);
+}
+
+EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {}
+// Reads the EB and EE SoA rechits from the event and fills one output collection per subdetector.
+void EcalRecHitConvertGPU2CPUFormat::produce(
+    edm::Event& event,
+    edm::EventSetup const& setup)
+{
+    edm::Handle<GPURecHitType> hRecHitsGPUEB, hRecHitsGPUEE;
+    event.getByToken(recHitsGPUEB_, hRecHitsGPUEB);
+    event.getByToken(recHitsGPUEE_, hRecHitsGPUEE);
+
+    auto recHitsCPUEB = std::make_unique<EBRecHitCollection>();
+    auto recHitsCPUEE = std::make_unique<EERecHitCollection>();
+    recHitsCPUEB->reserve(hRecHitsGPUEB->energy.size());
+    recHitsCPUEE->reserve(hRecHitsGPUEE->energy.size());
+    // NOTE(review): the collection types above and both loop bounds below were reconstructed from garbled patch text - confirm against the upstream commit
+    // the emplace_back calls below pass their arguments in the order of this constructor:
+    // explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0):
+    //
+    // barrel: one output rechit per SoA entry that passes the energy check
+    for (uint32_t i=0; i<hRecHitsGPUEB->energy.size(); ++i) {
+
+        //
+        // Save only if energy is >= 0 !
+        // This is extremely important because the channels that were supposed
+        // to be excluded get "-1" as energy
+        //
+
+        if (hRecHitsGPUEB->energy[i] >=0) {
+            recHitsCPUEB->emplace_back(
+                DetId{hRecHitsGPUEB->did[i]},
+                hRecHitsGPUEB->energy[i],
+                hRecHitsGPUEB->time[i],
+                hRecHitsGPUEB->extra[i],
+                hRecHitsGPUEB->flagBits[i]
+            );
+        }
+
+        // std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB->energy.size() << "] = " << hRecHitsGPUEB->extra[i] << std::endl;
+
+        // (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->timeError[i]);
+        // auto const offset = i * EcalDataFrame::MAXSAMPLES;
+        // for (uint32_t sample=0; sampleenergysAll[offset + sample]);
+        // NOTE(review): the commented-out loop above lost the text between "sample" and "energysAll"
+        // when this patch was extracted - restore it from the upstream commit if it is ever re-enabled
+    }
+    // endcap: same conversion as for the barrel
+    for (uint32_t i=0; i<hRecHitsGPUEE->energy.size(); ++i) {
+        //
+        // Save only if energy is >= 0 !
+        // This is extremely important because the channels that were supposed
+        // to be excluded get "-1" as energy
+        //
+
+        if (hRecHitsGPUEE->energy[i] >=0) {
+            recHitsCPUEE->emplace_back(
+                DetId{hRecHitsGPUEE->did[i]},
+                hRecHitsGPUEE->energy[i],
+                hRecHitsGPUEE->time[i],
+                hRecHitsGPUEE->extra[i],
+                hRecHitsGPUEE->flagBits[i]
+            );
+        }
+
+        // std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE->energy.size() << "] = " << hRecHitsGPUEE->extra[i] << std::endl;
+
+        // (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->timeError[i]);
+        // auto const offset = i * EcalDataFrame::MAXSAMPLES;
+        // for (uint32_t sample=0; sampleenergysAll[offset + sample]);
+        // NOTE(review): the commented-out loop above lost the text between "sample" and "energysAll"
+        // when this patch was extracted - restore it from the upstream commit if it is ever re-enabled
+    }
+
+    event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_);
+    event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_);
+}
+
+DEFINE_FWK_MODULE(EcalRecHitConvertGPU2CPUFormat);
+
\ No newline at end of file
diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py b/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
new file mode 100644
index 0000000000000..76299519b51dc
--- /dev/null
+++ b/RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
@@ -0,0 +1,132 @@
+import FWCore.ParameterSet.Config as cms
+
+from RecoLocalCalo.EcalRecAlgos.ecalCleaningAlgo import cleaningAlgoConfig
+
+# rechit producer
+ecalRecHitGPU = cms.EDProducer("EcalRecHitProducerGPU",
+
+    uncalibrecHitsInLabelEB = cms.InputTag("ecalUncalibRecHitProducerGPU","EcalUncalibRecHitsEB"),
+    uncalibrecHitsInLabelEE = cms.InputTag("ecalUncalibRecHitProducerGPU","EcalUncalibRecHitsEE"),
+
+    #recHitsLabelEB = cms.string("EcalRecHitsGPUEB"),
+    #recHitsLabelEE = cms.string("EcalRecHitsGPUEE"),
+    recHitsLabelEB = cms.string("EcalRecHitsEB"),
+    recHitsLabelEE = cms.string("EcalRecHitsEE"),
+
+    maxNumberHits = cms.uint32(20000), # FIXME AM
+
+
+    #EErechitCollection = cms.string('EcalRecHitsEE'),
+    #EEuncalibRecHitCollection = cms.InputTag("ecalMultiFitUncalibRecHit","EcalUncalibRecHitsEE"),
+    #EBuncalibRecHitCollection = 
cms.InputTag("ecalMultiFitUncalibRecHit","EcalUncalibRecHitsEB"), + #EBrechitCollection = cms.string('EcalRecHitsEB'), + + ## db statuses to be excluded from reconstruction (some will be recovered) + ChannelStatusToBeExcluded = cms.vstring( 'kDAC', + 'kNoisy', + 'kNNoisy', + 'kFixedG6', + 'kFixedG1', + 'kFixedG0', + 'kNonRespondingIsolated', + 'kDeadVFE', + 'kDeadFE', + 'kNoDataNoTP', + # + # AM should I add them here????? + # next ones from "flagsMapDBReco" + # but not defined in "EcalChannelStatusCode.h" + # but they are defined in "EcalRecHit.h" + # + #'kKilled', + #'kTPSaturated', + #'kL1SpikeFlag', + ), + + ## avoid propagation of dead channels other than after recovery + killDeadChannels = cms.bool(True), + #algo = cms.string("EcalRecHitWorkerSimple"), + + ## define maximal and minimal values for the laser corrections + + EBLaserMIN = cms.double(0.01), # EBLaserMIN = cms.double(0.5), + EELaserMIN = cms.double(0.01), # EELaserMIN = cms.double(0.5), + + EBLaserMAX = cms.double(30.0), # EBLaserMAX = cms.double(3.0), + EELaserMAX = cms.double(30.0), # EELaserMAX = cms.double(8.0), + + + ## useful if time is not calculated, as at HLT + #skipTimeCalib = cms.bool(False), + + ## apply laser corrections + #laserCorrection = cms.bool(True), + + ## reco flags association to DB flag + flagsMapDBReco = cms.PSet( + kGood = cms.vstring('kOk','kDAC','kNoLaser','kNoisy'), + kNoisy = cms.vstring('kNNoisy','kFixedG6','kFixedG1'), + kNeighboursRecovered = cms.vstring('kFixedG0', + 'kNonRespondingIsolated', + 'kDeadVFE'), + kTowerRecovered = cms.vstring('kDeadFE'), + kDead = cms.vstring('kNoDataNoTP') + ), + +#// flagmask_ |= 0x1 << EcalRecHit::kNeighboursRecovered; +#// flagmask_ |= 0x1 << EcalRecHit::kTowerRecovered; +#// flagmask_ |= 0x1 << EcalRecHit::kDead; +#// flagmask_ |= 0x1 << EcalRecHit::kKilled; +#// flagmask_ |= 0x1 << EcalRecHit::kTPSaturated; +#// flagmask_ |= 0x1 << EcalRecHit::kL1SpikeFlag; + + + + ## for channel recovery + #algoRecover = 
cms.string("EcalRecHitWorkerRecover"), + recoverEBIsolatedChannels = cms.bool(False), + recoverEEIsolatedChannels = cms.bool(False), + recoverEBVFE = cms.bool(False), + recoverEEVFE = cms.bool(False), + recoverEBFE = cms.bool(True), + recoverEEFE = cms.bool(True), + + ##db statuses for which recovery in EE/EB should not be attempted + #dbStatusToBeExcludedEE = cms.vint32( + #14, # dead, no TP + #78, # dead, HV off + #142, # dead,LV off + #), + #dbStatusToBeExcludedEB = cms.vint32( + #14, # dead, no TP + #78, # dead, HV off + #142, # dead,LV off + #), + + ## --- logWarnings for saturated DeadFEs + ## if the logWarningThreshold is negative the Algo will not try recovery (in EE is not tested we may need negative threshold e.g. -1.e+9) + ## if you want to enable recovery but you don't wish to throw logWarnings put the logWarningThresholds very high e.g +1.e+9 + ## ~64 GeV is the TP saturation level + #logWarningEtThreshold_EB_FE = cms.double(50),# in EB logWarningThreshold is actually in E (GeV) + #logWarningEtThreshold_EE_FE = cms.double(50),# in EE the energy should correspond to Et (GeV) but the recovered values of energies are not tested if make sense + #ebDetIdToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:ebDetId"), + #eeDetIdToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:eeDetId"), + #ebFEToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:ebFE"), + #eeFEToBeRecovered = cms.InputTag("ecalDetIdToBeRecovered:eeFE"), + #singleChannelRecoveryMethod = cms.string("NeuralNetworks"), + #singleChannelRecoveryThreshold = cms.double(8), + #triggerPrimitiveDigiCollection = cms.InputTag("ecalDigis:EcalTriggerPrimitives"), + #cleaningConfig=cleaningAlgoConfig, + + ) + + + +#from Configuration.Eras.Modifier_fastSim_cff import fastSim +## no flags for bad channels in FastSim +#fastSim.toModify(ecalRecHit, + #killDeadChannels = False, + #recoverEBFE = False, + #recoverEEFE = False) + + diff --git a/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py 
b/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py new file mode 100644 index 0000000000000..e993a7573b689 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/test/sourceFromRawCmggpu_cff.py @@ -0,0 +1,151 @@ +import FWCore.ParameterSet.Config as cms + +# input +FastMonitoringService = cms.Service( "FastMonitoringService", + filePerFwkStream = cms.untracked.bool( False ), + fastMonIntervals = cms.untracked.uint32( 2 ), + sleepTime = cms.untracked.int32( 1 ) +) + +EvFDaqDirector = cms.Service( "EvFDaqDirector", + runNumber = cms.untracked.uint32( 321177 ), + + baseDir = cms.untracked.string( "tmp" ), + buBaseDir = cms.untracked.string( "tmp" ), + + useFileBroker = cms.untracked.bool( False ), + fileBrokerKeepAlive = cms.untracked.bool( True ), + fileBrokerPort = cms.untracked.string( "8080" ), + fileBrokerUseLocalLock = cms.untracked.bool( True ), + fuLockPollInterval = cms.untracked.uint32( 2000 ), + + requireTransfersPSet = cms.untracked.bool( False ), + selectedTransferMode = cms.untracked.string( "" ), + mergingPset = cms.untracked.string( "" ), + + outputAdler32Recheck = cms.untracked.bool( False ), +) + +source = cms.Source( "FedRawDataInputSource", + runNumber = cms.untracked.uint32( 321177 ), + getLSFromFilename = cms.untracked.bool(True), + testModeNoBuilderUnit = cms.untracked.bool(False), + verifyAdler32 = cms.untracked.bool( True ), + verifyChecksum = cms.untracked.bool( True ), + useL1EventID = cms.untracked.bool( False ), # True + alwaysStartFromfirstLS = cms.untracked.uint32( 0 ), + + eventChunkBlock = cms.untracked.uint32( 240 ), # 32 + eventChunkSize = cms.untracked.uint32( 240), # 32 + maxBufferedFiles = cms.untracked.uint32( 8 ), # 2 + numBuffers = cms.untracked.uint32( 8 ), # 2 + + fileListMode = cms.untracked.bool( True ), # False + fileNames = cms.untracked.vstring( + #'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000000.raw', + 
'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0142_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0143_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0144_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0145_index000004.raw', + 
'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0146_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0147_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0148_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000003.raw', + 
'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0149_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0150_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0151_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0152_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000002.raw', + 
'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0153_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0154_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0155_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0156_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000001.raw', + 
'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0157_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0158_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0159_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0160_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000000.raw', + 
'/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0161_index000004.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000000.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000001.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000002.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000003.raw', + '/data/patatrack/store/raw/Run2018D/JetHT/RAW/v1/000/321/177/00000/run321177_ls0162_index000004.raw', + ), +) \ No newline at end of file diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py new file mode 100644 index 0000000000000..7fdf723b67bdd --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py @@ -0,0 +1,231 @@ + +import FWCore.ParameterSet.Config as cms + +from Configuration.StandardSequences.Eras import eras +#from Configuration.ProcessModifiers.gpu_cff import gpu + +process = cms.Process('RECO', eras.Run2_2018) + +# import of standard configurations +process.load('Configuration.StandardSequences.Services_cff') +#process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi') +process.load('FWCore.MessageService.MessageLogger_cfi') +process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi') +#process.load('Configuration.EventContent.EventContent_cff') +process.load('Configuration.StandardSequences.GeometryRecoDB_cff') +process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff') 
+#process.load('Configuration.StandardSequences.RawToDigi_Data_cff') +#process.load('Configuration.StandardSequences.Reconstruction_Data_cff') +#process.load('DQMOffline.Configuration.DQMOffline_cff') +process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff') + + + + + + +# Other statements +from Configuration.AlCa.GlobalTag import GlobalTag +process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '') + + +process.maxEvents = cms.untracked.PSet( + #input = cms.untracked.int32(100) + input = cms.untracked.int32(1000) +) + +# load data using the DAQ source +import sys, os, inspect +sys.path.append(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))) +process.load('sourceFromRawCmggpu_cff') + +#----------------------------------------- +# CMSSW/Hcal non-DQM Related Module import +#----------------------------------------- +process.load('Configuration.StandardSequences.GeometryRecoDB_cff') +process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff") +#process.load("RecoLocalCalo.Configuration.ecalLocalRecoSequence_cff") +process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi") +process.load("EventFilter.EcalRawToDigi.EcalUnpackerData_cfi") +process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi") + +# load both cpu and gpu plugins +# +# ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalUncalibRecHitProducerGPU_cfi.py +# +process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi") +# +process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi") + +# for validation of gpu multifit products +process.load("RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi") +# +# ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalCPUUncalibRecHitProducer_cfi.py +# + +process.load("EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi") +process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi") + 
+#process.ecalUncalibRecHitProducerGPU.kernelsVersion = 0 +#process.ecalUncalibRecHitProducerGPU.kernelMinimizeThreads = cms.vuint32(16, 1, 1) +# +# process.ecalUncalibRecHitProducerGPU.shouldRunTimingComputation = cms.bool(False) +# + + +process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi") + +#process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1) + + +## +## force HLT configuration for ecalMultiFitUncalibRecHit +## + +process.ecalMultiFitUncalibRecHit.algoPSet = cms.PSet( + ebSpikeThreshold = cms.double( 1.042 ), + EBtimeFitLimits_Upper = cms.double( 1.4 ), + EEtimeFitLimits_Lower = cms.double( 0.2 ), + timealgo = cms.string( "None" ), # ----> no timing computation for CPU version + EBtimeNconst = cms.double( 28.5 ), + prefitMaxChiSqEE = cms.double( 10.0 ), + outOfTimeThresholdGain12mEB = cms.double( 5.0 ), + outOfTimeThresholdGain12mEE = cms.double( 1000.0 ), + EEtimeFitParameters = cms.vdouble( -2.390548, 3.553628, -17.62341, 67.67538, -133.213, 140.7432, -75.41106, 16.20277 ), + prefitMaxChiSqEB = cms.double( 25.0 ), + simplifiedNoiseModelForGainSwitch = cms.bool( True ), + EBtimeFitParameters = cms.vdouble( -2.015452, 3.130702, -12.3473, 41.88921, -82.83944, 91.01147, -50.35761, 11.05621 ), + selectiveBadSampleCriteriaEB = cms.bool( False ), + dynamicPedestalsEB = cms.bool( False ), + useLumiInfoRunHeader = cms.bool( False ), + EBamplitudeFitParameters = cms.vdouble( 1.138, 1.652 ), + doPrefitEE = cms.bool( 
False ), + dynamicPedestalsEE = cms.bool( False ), + selectiveBadSampleCriteriaEE = cms.bool( False ), + outOfTimeThresholdGain61pEE = cms.double( 1000.0 ), + outOfTimeThresholdGain61pEB = cms.double( 5.0 ), + activeBXs = cms.vint32( -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 ), + EcalPulseShapeParameters = cms.PSet( + EEPulseShapeTemplate = cms.vdouble( 0.116442, 0.756246, 1.0, 0.897182, 0.686831, 0.491506, 0.344111, 0.245731, 0.174115, 0.123361, 0.0874288, 0.061957 ), + EEdigiCollection = cms.string( "" ), + EcalPreMixStage2 = cms.bool( False ), + EcalPreMixStage1 = cms.bool( False ), + EBPulseShapeCovariance = cms.vdouble( 3.001E-6, 1.233E-5, 0.0, -4.416E-6, -4.571E-6, -3.614E-6, -2.636E-6, -1.286E-6, -8.41E-7, -5.296E-7, 0.0, 0.0, 1.233E-5, 6.154E-5, 0.0, -2.2E-5, -2.309E-5, -1.838E-5, -1.373E-5, -7.334E-6, -5.088E-6, -3.745E-6, -2.428E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.416E-6, -2.2E-5, 0.0, 8.319E-6, 8.545E-6, 6.792E-6, 5.059E-6, 2.678E-6, 1.816E-6, 1.223E-6, 8.245E-7, 5.589E-7, -4.571E-6, -2.309E-5, 0.0, 8.545E-6, 9.182E-6, 7.219E-6, 5.388E-6, 2.853E-6, 1.944E-6, 1.324E-6, 9.083E-7, 6.335E-7, -3.614E-6, -1.838E-5, 0.0, 6.792E-6, 7.219E-6, 6.016E-6, 4.437E-6, 2.385E-6, 1.636E-6, 1.118E-6, 7.754E-7, 5.556E-7, -2.636E-6, -1.373E-5, 0.0, 5.059E-6, 5.388E-6, 4.437E-6, 3.602E-6, 1.917E-6, 1.322E-6, 9.079E-7, 6.529E-7, 4.752E-7, -1.286E-6, -7.334E-6, 0.0, 2.678E-6, 2.853E-6, 2.385E-6, 1.917E-6, 1.375E-6, 9.1E-7, 6.455E-7, 4.693E-7, 3.657E-7, -8.41E-7, -5.088E-6, 0.0, 1.816E-6, 1.944E-6, 1.636E-6, 1.322E-6, 9.1E-7, 9.115E-7, 6.062E-7, 4.436E-7, 3.422E-7, -5.296E-7, -3.745E-6, 0.0, 1.223E-6, 1.324E-6, 1.118E-6, 9.079E-7, 6.455E-7, 6.062E-7, 7.217E-7, 4.862E-7, 3.768E-7, 0.0, -2.428E-6, 0.0, 8.245E-7, 9.083E-7, 7.754E-7, 6.529E-7, 4.693E-7, 4.436E-7, 4.862E-7, 6.509E-7, 4.418E-7, 0.0, 0.0, 0.0, 5.589E-7, 6.335E-7, 5.556E-7, 4.752E-7, 3.657E-7, 3.422E-7, 3.768E-7, 4.418E-7, 6.142E-7 ), + ESdigiCollection = cms.string( "" ), + EBdigiCollection 
= cms.string( "" ), + EBCorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.73354, 0.64442, 0.58851, 0.55425, 0.53082, 0.51916, 0.51097, 0.50732, 0.50409 ), + EBCorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71073, 0.55721, 0.46089, 0.40449, 0.35931, 0.33924, 0.32439, 0.31581, 0.30481 ), + EBCorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.70946, 0.58021, 0.49846, 0.45006, 0.41366, 0.39699, 0.38478, 0.37847, 0.37055 ), + EEPulseShapeCovariance = cms.vdouble( 3.941E-5, 3.333E-5, 0.0, -1.449E-5, -1.661E-5, -1.424E-5, -1.183E-5, -6.842E-6, -4.915E-6, -3.411E-6, 0.0, 0.0, 3.333E-5, 2.862E-5, 0.0, -1.244E-5, -1.431E-5, -1.233E-5, -1.032E-5, -5.883E-6, -4.154E-6, -2.902E-6, -2.128E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.449E-5, -1.244E-5, 0.0, 5.84E-6, 6.649E-6, 5.72E-6, 4.812E-6, 2.708E-6, 1.869E-6, 1.33E-6, 9.186E-7, 6.446E-7, -1.661E-5, -1.431E-5, 0.0, 6.649E-6, 7.966E-6, 6.898E-6, 5.794E-6, 3.157E-6, 2.184E-6, 1.567E-6, 1.084E-6, 7.575E-7, -1.424E-5, -1.233E-5, 0.0, 5.72E-6, 6.898E-6, 6.341E-6, 5.347E-6, 2.859E-6, 1.991E-6, 1.431E-6, 9.839E-7, 6.886E-7, -1.183E-5, -1.032E-5, 0.0, 4.812E-6, 5.794E-6, 5.347E-6, 4.854E-6, 2.628E-6, 1.809E-6, 1.289E-6, 9.02E-7, 6.146E-7, -6.842E-6, -5.883E-6, 0.0, 2.708E-6, 3.157E-6, 2.859E-6, 2.628E-6, 1.863E-6, 1.296E-6, 8.882E-7, 6.108E-7, 4.283E-7, -4.915E-6, -4.154E-6, 0.0, 1.869E-6, 2.184E-6, 1.991E-6, 1.809E-6, 1.296E-6, 1.217E-6, 8.669E-7, 5.751E-7, 3.882E-7, -3.411E-6, -2.902E-6, 0.0, 1.33E-6, 1.567E-6, 1.431E-6, 1.289E-6, 8.882E-7, 8.669E-7, 9.522E-7, 6.717E-7, 4.293E-7, 0.0, -2.128E-6, 0.0, 9.186E-7, 1.084E-6, 9.839E-7, 9.02E-7, 6.108E-7, 5.751E-7, 6.717E-7, 7.911E-7, 5.493E-7, 0.0, 0.0, 0.0, 6.446E-7, 7.575E-7, 6.886E-7, 6.146E-7, 4.283E-7, 3.882E-7, 4.293E-7, 5.493E-7, 7.027E-7 ), + EBPulseShapeTemplate = cms.vdouble( 0.0113979, 0.758151, 1.0, 0.887744, 0.673548, 0.474332, 0.319561, 0.215144, 0.147464, 0.101087, 0.0693181, 0.0475044 ), + EECorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.72698, 0.62048, 0.55691, 0.51848, 
0.49147, 0.47813, 0.47007, 0.46621, 0.46265 ), + EECorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71373, 0.44825, 0.30152, 0.21609, 0.14786, 0.11772, 0.10165, 0.09465, 0.08098 ), + UseLCcorrection = cms.untracked.bool( True ), + EECorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.71217, 0.47464, 0.34056, 0.26282, 0.20287, 0.17734, 0.16256, 0.15618, 0.14443 ) + ), + doPrefitEB = cms.bool( False ), + addPedestalUncertaintyEE = cms.double( 0.0 ), + addPedestalUncertaintyEB = cms.double( 0.0 ), + gainSwitchUseMaxSampleEB = cms.bool( True ), + EEtimeNconst = cms.double( 31.8 ), + EEamplitudeFitParameters = cms.vdouble( 1.89, 1.4 ), + chi2ThreshEE_ = cms.double( 50.0 ), + eePulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ), + outOfTimeThresholdGain12pEB = cms.double( 5.0 ), + gainSwitchUseMaxSampleEE = cms.bool( False ), + mitigateBadSamplesEB = cms.bool( False ), + outOfTimeThresholdGain12pEE = cms.double( 1000.0 ), + ebPulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ), + ampErrorCalculation = cms.bool( False ), + mitigateBadSamplesEE = cms.bool( False ), + amplitudeThresholdEB = cms.double( 10.0 ), + kPoorRecoFlagEB = cms.bool( True ), + amplitudeThresholdEE = cms.double( 10.0 ), + EBtimeFitLimits_Lower = cms.double( 0.2 ), + kPoorRecoFlagEE = cms.bool( False ), + EEtimeFitLimits_Upper = cms.double( 1.4 ), + outOfTimeThresholdGain61mEE = cms.double( 1000.0 ), + EEtimeConstantTerm = cms.double( 1.0 ), + EBtimeConstantTerm = cms.double( 0.6 ), + chi2ThreshEB_ = cms.double( 65.0 ), + outOfTimeThresholdGain61mEB = cms.double( 5.0 ) +) + +## + + + +#process.load('Configuration.StandardSequences.Reconstruction_cff') +#process.ecalRecHit + + + +#process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi") +#process.ecalRecHitGPU + + + +#process.hcalDigis.silent = cms.untracked.bool(False) +#process.hcalDigis.InputLabel = rawTag +process.ecalDigis = 
process.ecalEBunpacker.clone() +process.ecalDigis.InputLabel = cms.InputTag('rawDataCollector') +#process.hbheprerecogpu.processQIE11 = cms.bool(True) + +process.out = cms.OutputModule( + "PoolOutputModule", + fileName = cms.untracked.string("test_uncalib.root") +) + +#process.out = cms.OutputModule("AsciiOutputModule", +# outputCommands = cms.untracked.vstring( +# 'keep *_ecalMultiFitUncalibRecHit_*_*', +# ), +# verbosity = cms.untracked.uint32(0) +#) +process.finalize = cms.EndPath(process.out) + +process.bunchSpacing = cms.Path( + process.bunchSpacingProducer +) + +process.digiPath = cms.Path( + #process.hcalDigis + process.ecalDigis + *process.ecalRawToDigiGPU +) + +process.recoPath = cms.Path( + #(process.ecalMultiFitUncalibRecHit+process.ecalDetIdToBeRecovered) + process.ecalMultiFitUncalibRecHit + #*process.ecalRecHit +# gpu + *process.ecalUncalibRecHitProducerGPU + *process.ecalCPUUncalibRecHitProducer + #*process.ecalRecHitGPU +) + +process.schedule = cms.Schedule( + process.bunchSpacing, + process.digiPath, + process.recoPath, +# process.ecalecalLocalRecoSequence + process.finalize +) + +process.options = cms.untracked.PSet( + numberOfThreads = cms.untracked.uint32(8), + numberOfStreams = cms.untracked.uint32(8), + SkipEvent = cms.untracked.vstring('ProductNotFound'), + wantSummary = cms.untracked.bool(True) +) + +# report CUDAService messages +process.MessageLogger.categories.append("CUDAService") + + From de950168d13a34f4fc5982ee033c10f29e7c8e85 Mon Sep 17 00:00:00 2001 From: amassiro Date: Fri, 10 Apr 2020 15:54:24 +0200 Subject: [PATCH 10/30] minor fix --- EventFilter/EcalRawToDigi/src/UnpackGPU.cu | 1 + .../EcalRecProducers/test/testEcalRechitProducer_cfg.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu index a2e5057bbbf6a..a4742f85ef6ca 100644 --- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu +++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu 
@@ -317,6 +317,7 @@ namespace ecal { scratchGPU.pChannelsCounter, conditions.eMappingProduct.eid2did, nbytesTotal); + cudaCheck(cudaGetLastError()); // transfer the counters for how many eb and ee channels we got diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py index 77d562242985b..a18d2c0ea7e4c 100644 --- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py +++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py @@ -154,7 +154,7 @@ process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi") -#process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi") From ee0c5ea653a03ebfd2d01a205f5719c8b315e892 Mon Sep 17 00:00:00 2001 From: amassiro Date: Tue, 21 Apr 2020 15:27:47 +0200 Subject: [PATCH 11/30] tests ongoing --- CUDADataFormats/EcalRecHitSoA/BuildFile.xml | 4 +--- .../EcalRecHitSoA/src/classes_def.xml | 1 + EventFilter/EcalRawToDigi/src/UnpackGPU.cu | 3 ++- .../EcalRecAlgos/interface/DeclsForKernels.h | 20 +++++++++---------- .../src/AmplitudeComputationKernels.cu | 2 +- .../EcalRecAlgos/src/EcalChannelStatusGPU.cc | 2 ++ .../src/EcalRecHitBuilderKernels.cu | 15 +++++++++----- .../src/EcalRecHitBuilderKernels.h | 1 + .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu | 16 +++++++-------- RecoLocalCalo/EcalRecProducers/BuildFile.xml | 2 ++ .../EcalRecProducers/plugins/BuildFile.xml | 1 + .../plugins/EcalRecHitProducerGPU.cc | 3 --- .../test/testEcalRechitProducer_cfg.py | 8 ++++---- 13 files changed, 43 insertions(+), 35 deletions(-) diff --git 
a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml index 3b6d026d40d11..aaaaf306dd7c7 100644 --- a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml +++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml @@ -1,9 +1,7 @@ - - - + diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml index 6721bfff3126c..266324f5fac31 100644 --- a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml +++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml @@ -2,6 +2,7 @@ + diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu index a4742f85ef6ca..d8ffbec039b7c 100644 --- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu +++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu @@ -307,7 +307,8 @@ namespace ecal { cudaCheck(cudaMemcpyAsync( inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); - kernel_unpack_test<32><<>>(inputGPU.data, +// kernel_unpack_test<32><<>>(inputGPU.data, + kernel_unpack_test<16><<>>(inputGPU.data, inputGPU.offsets, inputGPU.feds, outputGPU.samplesEB, diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h index 6bc816fca5295..419e50b3636c6 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h @@ -15,7 +15,6 @@ #include "CondFormats/EcalObjects/interface/EcalPedestals.h" #include "CondFormats/EcalObjects/interface/EcalGainRatios.h" #include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h" -#include "CondFormats/EcalObjects/interface/EcalWeightSet.h" #include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h" @@ -281,11 +280,13 @@ struct conf_data { #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" +#include 
"CondFormats/EcalObjects/interface/EcalChannelStatus.h" +#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h" + #include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h" - #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" @@ -293,7 +294,6 @@ struct conf_data { - namespace ecal { namespace rechit { @@ -378,14 +378,14 @@ namespace ecal { // const refs products to conditions struct ConditionsProducts { - EcalADCToGeVConstantGPU::Product const& ADCToGeV; - EcalIntercalibConstantsGPU::Product const& Intercalib; - EcalChannelStatusGPU::Product const& ChannelStatus; + EcalADCToGeVConstantGPU::Product const& ADCToGeV ; + EcalIntercalibConstantsGPU::Product const& Intercalib ; + EcalChannelStatusGPU::Product const& ChannelStatus ; // - EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios ; - EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef; - EcalLaserAlphasGPU::Product const& LaserAlphas ; - EcalLinearCorrectionsGPU::Product const& LinearCorrections ; + EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios ; + EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef ; + EcalLaserAlphasGPU::Product const& LaserAlphas ; + EcalLinearCorrectionsGPU::Product const& LinearCorrections ; // // uint32_t offsetForHashes; diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu index 23d9c12aa0582..c67677055c189 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu @@ -392,7 +392,7 @@ namespace ecal { 50, offsetForHashes, offsetForInputs); - 
cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); } } // namespace v1 diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc index c1cdc6631878b..91293902bb667 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc @@ -45,3 +45,5 @@ EcalChannelStatusGPU::Product const& EcalChannelStatusGPU::getProduct(cudaStream } TYPELOOKUP_DATA_REG(EcalChannelStatusGPU); + + diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index 84aacc0cf5b33..5c50bdaa58f7f 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -648,15 +648,20 @@ namespace ecal { int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size ; - unsigned int nchannels_per_block = 32; - unsigned int threads_1d = nchannels_per_block; - unsigned int blocks_1d = (nchannels + threads_1d) / threads_1d; // TEST : to be optimized (AM) - +// unsigned int nchannels_per_block = 32; + unsigned int nchannels_per_block = 16; + unsigned int threads_min = nchannels_per_block; + unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min; // TEST : to be optimized (AM) // // kernel create rechit // - kernel_create_ecal_rehit <<< blocks_1d, threads_1d >>> ( + +// auto const nbytesShared = 2 * threads_min * MapSymM::total * sizeof(DataType); + + kernel_create_ecal_rehit <<< blocks_min, threads_min, 0, cudaStream >>> ( +// kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> ( +// kernel_create_ecal_rehit <<< blocks_min, threads_min >>> ( // configuration configParameters.ChannelStatusToBeExcluded, configParameters.ChannelStatusToBeExcludedSize, diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h 
b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h index 587abe0575883..a1809dbded6bd 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h @@ -3,6 +3,7 @@ // #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" #include "RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h" #include "RecoLocalCalo/EcalRecAlgos/interface/Common.h" diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu index c8d2926b29afc..dbfe4833c7d3f 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu @@ -83,7 +83,7 @@ namespace ecal { gainSwitchUseMaxSampleEB, gainSwitchUseMaxSampleEE, totalChannels); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); // // 2d preparation kernel @@ -112,7 +112,7 @@ namespace ecal { scratch.isSaturated, offsetForHashes, offsetForInputs); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); // run minimization kernels v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream); @@ -148,7 +148,7 @@ namespace ecal { conditions.sampleMask.getEcalSampleMaskRecordEB(), conditions.sampleMask.getEcalSampleMaskRecordEE(), totalChannels); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); // // TODO: small kernel only for EB. 
It needs to be checked if @@ -170,7 +170,7 @@ namespace ecal { conditions.sampleMask.getEcalSampleMaskRecordEB(), totalChannels, offsetForInputs); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); // // @@ -186,7 +186,7 @@ namespace ecal { scratch.sum0sNullHypot, scratch.sumAAsNullHypot, totalChannels); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); unsigned int nchannels_per_block_makeratio = 10; unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio; @@ -220,7 +220,7 @@ namespace ecal { configParameters.timeFitLimitsSecondEE, totalChannels, offsetForInputs); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); // // @@ -252,7 +252,7 @@ namespace ecal { scratch.timeError, totalChannels, offsetForInputs); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); // // @@ -298,7 +298,7 @@ namespace ecal { offsetForHashes, offsetForInputs, totalChannels); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); } /* diff --git a/RecoLocalCalo/EcalRecProducers/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/BuildFile.xml index 836b1c5090955..59d0c5987d7fd 100644 --- a/RecoLocalCalo/EcalRecProducers/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/BuildFile.xml @@ -1,9 +1,11 @@ + + diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml index b4dfcc1cc3b0d..89e5e9d93c549 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml @@ -16,6 +16,7 @@ + diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index 69c3a95244ed8..a9d4bb9e670f4 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -401,9 +401,6 @@ void EcalRecHitProducerGPU::acquire( ctx.stream() ); - - - 
cudaCheck(cudaGetLastError()); diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py index a18d2c0ea7e4c..f1b68836b2101 100644 --- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py +++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py @@ -152,7 +152,7 @@ -process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi") +#process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi") @@ -161,8 +161,8 @@ process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi") -process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi") -process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone() +#process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi") +#process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone() # @@ -273,7 +273,7 @@ # gpu *process.ecalUncalibRecHitProducerGPU *process.ecalCPUUncalibRecHitProducer - *process.ecalRecHitProducerGPU + #*process.ecalRecHitProducerGPU #*process.ecalCPURecHitProducer ) From ba982009f43a9036425ea6f1b2ef55f1552a64d2 Mon Sep 17 00:00:00 2001 From: amassiro Date: Tue, 21 Apr 2020 15:28:00 +0200 Subject: [PATCH 12/30] last file missing --- .../test/ecalRawDecodingAndMultifit.py | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py diff --git a/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py new file mode 100644 index 0000000000000..4886238cc620f --- /dev/null +++ 
b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py @@ -0,0 +1,201 @@ +import FWCore.ParameterSet.Config as cms + +from Configuration.StandardSequences.Eras import eras +#from Configuration.ProcessModifiers.gpu_cff import gpu + +process = cms.Process('RECO', eras.Run2_2018) + +# import of standard configurations +process.load('Configuration.StandardSequences.Services_cff') +#process.load('SimGeneral.HepPDTESSource.pythiapdt_cfi') +process.load('FWCore.MessageService.MessageLogger_cfi') +process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi') +#process.load('Configuration.EventContent.EventContent_cff') +process.load('Configuration.StandardSequences.GeometryRecoDB_cff') +process.load('Configuration.StandardSequences.MagneticField_AutoFromDBCurrent_cff') +#process.load('Configuration.StandardSequences.RawToDigi_Data_cff') +#process.load('Configuration.StandardSequences.Reconstruction_Data_cff') +#process.load('DQMOffline.Configuration.DQMOffline_cff') +process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff') + +# Other statements +from Configuration.AlCa.GlobalTag import GlobalTag +process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '') + + +process.maxEvents = cms.untracked.PSet( + input = cms.untracked.int32(100) +) + +# load data using the DAQ source +import sys, os, inspect +sys.path.append(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))) +process.load('sourceFromRawCmggpu_cff') + +#----------------------------------------- +# CMSSW/Hcal non-DQM Related Module import +#----------------------------------------- +process.load('Configuration.StandardSequences.GeometryRecoDB_cff') +process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff") +#process.load("RecoLocalCalo.Configuration.ecalLocalRecoSequence_cff") +process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi") +process.load("EventFilter.EcalRawToDigi.EcalUnpackerData_cfi") 
+process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi") + +# load both cpu and gpu plugins +process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi") + +# for validation of gpu multifit products +process.load("RecoLocalCalo.EcalRecProducers.ecalCPUUncalibRecHitProducer_cfi") +process.load("EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi") + +process.load("EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi") +process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingGPUESProducer_cfi") + +#process.ecalUncalibRecHitProducerGPU.kernelsVersion = 0 +#process.ecalUncalibRecHitProducerGPU.kernelMinimizeThreads = cms.vuint32(16, 1, 1) + +process.load("RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalPulseShapesGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalPulseCovariancesGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalSamplesCorrelationGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalTimeBiasCorrectionsGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalTimeCalibConstantsGPUESProducer_cfi") + +#process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi") +#process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi") + + +#process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1) + + +## +## force HLT configuration for ecalMultiFitUncalibRecHit +## + +process.ecalMultiFitUncalibRecHit.algoPSet = cms.PSet( + ebSpikeThreshold = cms.double( 1.042 ), + EBtimeFitLimits_Upper = cms.double( 1.4 ), + EEtimeFitLimits_Lower = cms.double( 0.2 ), + timealgo = cms.string( "None" ), + EBtimeNconst = cms.double( 28.5 ), + prefitMaxChiSqEE = cms.double( 10.0 ), + 
outOfTimeThresholdGain12mEB = cms.double( 5.0 ), + outOfTimeThresholdGain12mEE = cms.double( 1000.0 ), + EEtimeFitParameters = cms.vdouble( -2.390548, 3.553628, -17.62341, 67.67538, -133.213, 140.7432, -75.41106, 16.20277 ), + prefitMaxChiSqEB = cms.double( 25.0 ), + simplifiedNoiseModelForGainSwitch = cms.bool( True ), + EBtimeFitParameters = cms.vdouble( -2.015452, 3.130702, -12.3473, 41.88921, -82.83944, 91.01147, -50.35761, 11.05621 ), + selectiveBadSampleCriteriaEB = cms.bool( False ), + dynamicPedestalsEB = cms.bool( False ), + useLumiInfoRunHeader = cms.bool( False ), + EBamplitudeFitParameters = cms.vdouble( 1.138, 1.652 ), + doPrefitEE = cms.bool( False ), + dynamicPedestalsEE = cms.bool( False ), + selectiveBadSampleCriteriaEE = cms.bool( False ), + outOfTimeThresholdGain61pEE = cms.double( 1000.0 ), + outOfTimeThresholdGain61pEB = cms.double( 5.0 ), + activeBXs = cms.vint32( -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 ), + EcalPulseShapeParameters = cms.PSet( + EEPulseShapeTemplate = cms.vdouble( 0.116442, 0.756246, 1.0, 0.897182, 0.686831, 0.491506, 0.344111, 0.245731, 0.174115, 0.123361, 0.0874288, 0.061957 ), + EEdigiCollection = cms.string( "" ), + EcalPreMixStage2 = cms.bool( False ), + EcalPreMixStage1 = cms.bool( False ), + EBPulseShapeCovariance = cms.vdouble( 3.001E-6, 1.233E-5, 0.0, -4.416E-6, -4.571E-6, -3.614E-6, -2.636E-6, -1.286E-6, -8.41E-7, -5.296E-7, 0.0, 0.0, 1.233E-5, 6.154E-5, 0.0, -2.2E-5, -2.309E-5, -1.838E-5, -1.373E-5, -7.334E-6, -5.088E-6, -3.745E-6, -2.428E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.416E-6, -2.2E-5, 0.0, 8.319E-6, 8.545E-6, 6.792E-6, 5.059E-6, 2.678E-6, 1.816E-6, 1.223E-6, 8.245E-7, 5.589E-7, -4.571E-6, -2.309E-5, 0.0, 8.545E-6, 9.182E-6, 7.219E-6, 5.388E-6, 2.853E-6, 1.944E-6, 1.324E-6, 9.083E-7, 6.335E-7, -3.614E-6, -1.838E-5, 0.0, 6.792E-6, 7.219E-6, 6.016E-6, 4.437E-6, 2.385E-6, 1.636E-6, 1.118E-6, 7.754E-7, 5.556E-7, -2.636E-6, -1.373E-5, 0.0, 5.059E-6, 5.388E-6, 4.437E-6, 3.602E-6, 
1.917E-6, 1.322E-6, 9.079E-7, 6.529E-7, 4.752E-7, -1.286E-6, -7.334E-6, 0.0, 2.678E-6, 2.853E-6, 2.385E-6, 1.917E-6, 1.375E-6, 9.1E-7, 6.455E-7, 4.693E-7, 3.657E-7, -8.41E-7, -5.088E-6, 0.0, 1.816E-6, 1.944E-6, 1.636E-6, 1.322E-6, 9.1E-7, 9.115E-7, 6.062E-7, 4.436E-7, 3.422E-7, -5.296E-7, -3.745E-6, 0.0, 1.223E-6, 1.324E-6, 1.118E-6, 9.079E-7, 6.455E-7, 6.062E-7, 7.217E-7, 4.862E-7, 3.768E-7, 0.0, -2.428E-6, 0.0, 8.245E-7, 9.083E-7, 7.754E-7, 6.529E-7, 4.693E-7, 4.436E-7, 4.862E-7, 6.509E-7, 4.418E-7, 0.0, 0.0, 0.0, 5.589E-7, 6.335E-7, 5.556E-7, 4.752E-7, 3.657E-7, 3.422E-7, 3.768E-7, 4.418E-7, 6.142E-7 ), + ESdigiCollection = cms.string( "" ), + EBdigiCollection = cms.string( "" ), + EBCorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.73354, 0.64442, 0.58851, 0.55425, 0.53082, 0.51916, 0.51097, 0.50732, 0.50409 ), + EBCorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71073, 0.55721, 0.46089, 0.40449, 0.35931, 0.33924, 0.32439, 0.31581, 0.30481 ), + EBCorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.70946, 0.58021, 0.49846, 0.45006, 0.41366, 0.39699, 0.38478, 0.37847, 0.37055 ), + EEPulseShapeCovariance = cms.vdouble( 3.941E-5, 3.333E-5, 0.0, -1.449E-5, -1.661E-5, -1.424E-5, -1.183E-5, -6.842E-6, -4.915E-6, -3.411E-6, 0.0, 0.0, 3.333E-5, 2.862E-5, 0.0, -1.244E-5, -1.431E-5, -1.233E-5, -1.032E-5, -5.883E-6, -4.154E-6, -2.902E-6, -2.128E-6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.449E-5, -1.244E-5, 0.0, 5.84E-6, 6.649E-6, 5.72E-6, 4.812E-6, 2.708E-6, 1.869E-6, 1.33E-6, 9.186E-7, 6.446E-7, -1.661E-5, -1.431E-5, 0.0, 6.649E-6, 7.966E-6, 6.898E-6, 5.794E-6, 3.157E-6, 2.184E-6, 1.567E-6, 1.084E-6, 7.575E-7, -1.424E-5, -1.233E-5, 0.0, 5.72E-6, 6.898E-6, 6.341E-6, 5.347E-6, 2.859E-6, 1.991E-6, 1.431E-6, 9.839E-7, 6.886E-7, -1.183E-5, -1.032E-5, 0.0, 4.812E-6, 5.794E-6, 5.347E-6, 4.854E-6, 2.628E-6, 1.809E-6, 1.289E-6, 9.02E-7, 6.146E-7, -6.842E-6, -5.883E-6, 0.0, 2.708E-6, 3.157E-6, 2.859E-6, 2.628E-6, 1.863E-6, 1.296E-6, 8.882E-7, 6.108E-7, 4.283E-7, -4.915E-6, 
-4.154E-6, 0.0, 1.869E-6, 2.184E-6, 1.991E-6, 1.809E-6, 1.296E-6, 1.217E-6, 8.669E-7, 5.751E-7, 3.882E-7, -3.411E-6, -2.902E-6, 0.0, 1.33E-6, 1.567E-6, 1.431E-6, 1.289E-6, 8.882E-7, 8.669E-7, 9.522E-7, 6.717E-7, 4.293E-7, 0.0, -2.128E-6, 0.0, 9.186E-7, 1.084E-6, 9.839E-7, 9.02E-7, 6.108E-7, 5.751E-7, 6.717E-7, 7.911E-7, 5.493E-7, 0.0, 0.0, 0.0, 6.446E-7, 7.575E-7, 6.886E-7, 6.146E-7, 4.283E-7, 3.882E-7, 4.293E-7, 5.493E-7, 7.027E-7 ), + EBPulseShapeTemplate = cms.vdouble( 0.0113979, 0.758151, 1.0, 0.887744, 0.673548, 0.474332, 0.319561, 0.215144, 0.147464, 0.101087, 0.0693181, 0.0475044 ), + EECorrNoiseMatrixG01 = cms.vdouble( 1.0, 0.72698, 0.62048, 0.55691, 0.51848, 0.49147, 0.47813, 0.47007, 0.46621, 0.46265 ), + EECorrNoiseMatrixG12 = cms.vdouble( 1.0, 0.71373, 0.44825, 0.30152, 0.21609, 0.14786, 0.11772, 0.10165, 0.09465, 0.08098 ), + UseLCcorrection = cms.untracked.bool( True ), + EECorrNoiseMatrixG06 = cms.vdouble( 1.0, 0.71217, 0.47464, 0.34056, 0.26282, 0.20287, 0.17734, 0.16256, 0.15618, 0.14443 ) + ), + doPrefitEB = cms.bool( False ), + addPedestalUncertaintyEE = cms.double( 0.0 ), + addPedestalUncertaintyEB = cms.double( 0.0 ), + gainSwitchUseMaxSampleEB = cms.bool( True ), + EEtimeNconst = cms.double( 31.8 ), + EEamplitudeFitParameters = cms.vdouble( 1.89, 1.4 ), + chi2ThreshEE_ = cms.double( 50.0 ), + eePulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ), + outOfTimeThresholdGain12pEB = cms.double( 5.0 ), + gainSwitchUseMaxSampleEE = cms.bool( False ), + mitigateBadSamplesEB = cms.bool( False ), + outOfTimeThresholdGain12pEE = cms.double( 1000.0 ), + ebPulseShape = cms.vdouble( 5.2E-5, -5.26E-5, 6.66E-5, 0.1168, 0.7575, 1.0, 0.8876, 0.6732, 0.4741, 0.3194 ), + ampErrorCalculation = cms.bool( False ), + mitigateBadSamplesEE = cms.bool( False ), + amplitudeThresholdEB = cms.double( 10.0 ), + kPoorRecoFlagEB = cms.bool( True ), + amplitudeThresholdEE = cms.double( 10.0 ), + EBtimeFitLimits_Lower = 
cms.double( 0.2 ), + kPoorRecoFlagEE = cms.bool( False ), + EEtimeFitLimits_Upper = cms.double( 1.4 ), + outOfTimeThresholdGain61mEE = cms.double( 1000.0 ), + EEtimeConstantTerm = cms.double( 1.0 ), + EBtimeConstantTerm = cms.double( 0.6 ), + chi2ThreshEB_ = cms.double( 65.0 ), + outOfTimeThresholdGain61mEB = cms.double( 5.0 ) + ) + + +#process.hcalDigis.silent = cms.untracked.bool(False) +#process.hcalDigis.InputLabel = rawTag +process.ecalDigis = process.ecalEBunpacker.clone() +process.ecalDigis.InputLabel = cms.InputTag('rawDataCollector') +#process.hbheprerecogpu.processQIE11 = cms.bool(True) + +process.out = cms.OutputModule( + "PoolOutputModule", + fileName = cms.untracked.string("test.root") +) + +#process.out = cms.OutputModule("AsciiOutputModule", +# outputCommands = cms.untracked.vstring( +# 'keep *_ecalMultiFitUncalibRecHit_*_*', +# ), +# verbosity = cms.untracked.uint32(0) +#) +process.finalize = cms.EndPath(process.out) + +process.bunchSpacing = cms.Path( + process.bunchSpacingProducer +) + +process.digiPath = cms.Path( + #process.hcalDigis + process.ecalDigis + *process.ecalRawToDigiGPU + *process.ecalCPUDigisProducer +) + +process.recoPath = cms.Path( + process.ecalMultiFitUncalibRecHit +# process.ecalMultiFitUncalibRecHitgpu + *process.ecalUncalibRecHitProducerGPU + *process.ecalCPUUncalibRecHitProducer +) + +process.schedule = cms.Schedule( + process.bunchSpacing, + process.digiPath, + process.recoPath, +# process.ecalecalLocalRecoSequence + process.finalize +) + +process.options = cms.untracked.PSet( + numberOfThreads = cms.untracked.uint32(4), + numberOfStreams = cms.untracked.uint32(4), + SkipEvent = cms.untracked.vstring('ProductNotFound'), + wantSummary = cms.untracked.bool(True) +) + +# report CUDAService messages +process.MessageLogger.categories.append("CUDAService") From e1268b1ce316e60565be45b3b35638c6cce05410 Mon Sep 17 00:00:00 2001 From: amassiro Date: Wed, 13 May 2020 16:35:01 +0200 Subject: [PATCH 13/30] update to make it work --- 
.../interface/EcalChannelStatusCode.h | 19 +- EventFilter/EcalRawToDigi/src/UnpackGPU.cu | 3 +- RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml | 10 + ...eEcalMultifitResultsGpuValidationPlots.cpp | 508 +++++++++++++----- .../EcalRecAlgos/interface/DeclsForKernels.h | 10 +- .../interface/EcalADCToGeVConstantGPU.h | 43 -- .../interface/EcalChannelStatusGPU.h | 43 -- .../src/AmplitudeComputationKernels.cu | 2 +- .../src/EcalADCToGeVConstantGPU.cc | 39 -- .../EcalRecAlgos/src/EcalChannelStatusGPU.cc | 49 -- .../EcalUncalibRecHitMultiFitAlgo_gpu_new.cu | 16 +- .../plugins/EcalESProducersGPUDefs.cc | 16 +- .../plugins/EcalRecHitProducerGPU.cc | 12 +- .../test/ecalRawDecodingAndMultifit.py | 12 + .../test/testEcalRechitProducer_cfg.py | 22 +- 15 files changed, 461 insertions(+), 343 deletions(-) delete mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h delete mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc delete mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc diff --git a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h index 09202950bfc68..a52868fe0d8df 100644 --- a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h +++ b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h @@ -5,6 +5,7 @@ * Created: 14 Nov 2006 **/ + #include "CondFormats/Serialization/interface/Serializable.h" #include @@ -16,7 +17,10 @@ */ class EcalChannelStatusCode { + + public: + enum Code { kOk = 0, kDAC, @@ -35,12 +39,22 @@ class EcalChannelStatusCode { kNoDataNoTP }; - enum Bits { kHV = 0, kLV, kDAQ, kTP, kTrigger, kTemperature, kNextToDead }; - + enum Bits { + kHV=0, + kLV, + kDAQ, + kTP, + kTrigger, + kTemperature, + kNextToDead + }; + public: + EcalChannelStatusCode() : status_(0) {} EcalChannelStatusCode(const uint16_t& encodedStatus) : status_(encodedStatus){}; + void 
print(std::ostream& s) const { s << "status is: " << status_; } /// return decoded status @@ -55,6 +69,7 @@ class EcalChannelStatusCode { static const int chStatusMask = 0x1F; private: + static const int kBitsOffset = 5; /* bits 1-5 store a status code: 0 channel ok diff --git a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu index d8ffbec039b7c..a4742f85ef6ca 100644 --- a/EventFilter/EcalRawToDigi/src/UnpackGPU.cu +++ b/EventFilter/EcalRawToDigi/src/UnpackGPU.cu @@ -307,8 +307,7 @@ namespace ecal { cudaCheck(cudaMemcpyAsync( inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); -// kernel_unpack_test<32><<>>(inputGPU.data, - kernel_unpack_test<16><<>>(inputGPU.data, + kernel_unpack_test<32><<>>(inputGPU.data, inputGPU.offsets, inputGPU.feds, outputGPU.samplesEB, diff --git a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml index bf61d052856ad..4c98171091b84 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml +++ b/RecoLocalCalo/EcalRecAlgos/bin/BuildFile.xml @@ -5,3 +5,13 @@ + + + + + + + + + + diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp index 4d50b758d39f3..04ba175eebb1e 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp @@ -15,181 +15,290 @@ #include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" +#include "TStyle.h" + + +void setAxis(TH2D* histo) { + histo->GetXaxis()->SetTitle("cpu"); + histo->GetYaxis()->SetTitle("gpu"); +} + + +void setAxisDelta(TH2D* histo) { + histo->GetXaxis()->SetTitle("cpu"); + histo->GetYaxis()->SetTitle("#Delta gpu-cpu"); +} + int main(int argc, char *argv[]) { - if 
(argc < 3) { + if (argc<3) { std::cout << "run with: ./validateGPU \n"; exit(0); } - - edm::Wrapper> *wgpuEB = nullptr; - edm::Wrapper> *wgpuEE = nullptr; + + gStyle->SetOptStat("ourme"); + + edm::Wrapper> *wgpuEB=nullptr; + edm::Wrapper> *wgpuEE=nullptr; edm::Wrapper *wcpuEB = nullptr; edm::Wrapper *wcpuEE = nullptr; - + std::string fileName = argv[1]; std::string outFileName = argv[2]; - + // output TFile rfout{outFileName.c_str(), "recreate"}; - + + int nbins_count = 200; + float last_count = 5000.; + int nbins_count_delta = 201; + int nbins = 300; float last = 3000.; - + + // int nbins_chi2 = 1000; + // float last_chi2 = 1000.; int nbins_chi2 = 1000; - float last_chi2 = 1000.; - + float last_chi2 = 200.; + + int nbins_flags = 100; + float last_flags = 100.; + float delta_flags = 20; + int nbins_delta = 201; // use an odd number to center around 0 float delta = 0.2; - + + + // RecHits plots for EB and EE on both GPU and CPU + auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins_count, 0, last_count); + auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins_count, 0, last_count); + auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins_count, 0, last_count); + auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. 
of Rechits", nbins_count, 0, last_count); + auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1); + auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1); + auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last); auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last); auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last); auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last); - auto hSOIAmplitudesEBGPUCPUratio = - new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - auto hSOIAmplitudesEEGPUCPUratio = - new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - + auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2); auto hChi2EEGPU = new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2); auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2); auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2); - - auto hSOIAmplitudesEBGPUvsCPU = - new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last); - auto hSOIAmplitudesEEGPUvsCPU = - new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); - auto hSOIAmplitudesEBdeltavsCPU = - new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hSOIAmplitudesEEdeltavsCPU = - new 
TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - - auto hChi2EBGPUvsCPU = - new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); - auto hChi2EEGPUvsCPU = - new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); - auto hChi2EBdeltavsCPU = - new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - auto hChi2EEdeltavsCPU = - new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - + auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + + auto hFlagsEBGPU = new TH1D("hFlagsEBGPU", "hFlagsEBGPU", nbins_flags, 0, last_flags); + auto hFlagsEEGPU = new TH1D("hFlagsEEGPU", "hFlagsEEGPU", nbins_flags, 0, last_flags); + auto hFlagsEBCPU = new TH1D("hFlagsEBCPU", "hFlagsEBCPU", nbins_flags, 0, last_flags); + auto hFlagsEECPU = new TH1D("hFlagsEECPU", "hFlagsEECPU", nbins_flags, 0, last_flags); + auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + + auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last); setAxis(hSOIAmplitudesEBGPUvsCPU ) ; + auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); setAxis(hSOIAmplitudesEEGPUvsCPU ) ; + auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); setAxisDelta(hSOIAmplitudesEBdeltavsCPU) ; + auto hSOIAmplitudesEEdeltavsCPU = new 
TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); setAxisDelta(hSOIAmplitudesEEdeltavsCPU) ; + + auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); setAxis(hChi2EBGPUvsCPU ) ; + auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); setAxis(hChi2EEGPUvsCPU ) ; + auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); setAxisDelta(hChi2EBdeltavsCPU) ; + auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); setAxisDelta(hChi2EEdeltavsCPU) ; + + auto hFlagsEBGPUvsCPU = new TH2D("hFlagsEBGPUvsCPU", "hFlagsEBGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags); setAxis(hFlagsEBGPUvsCPU ) ; + auto hFlagsEEGPUvsCPU = new TH2D("hFlagsEEGPUvsCPU", "hFlagsEEGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags); setAxis(hFlagsEEGPUvsCPU ) ; + auto hFlagsEBdeltavsCPU = new TH2D("hFlagsEBdeltavsCPU", "hFlagsEBdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags); setAxisDelta(hFlagsEBdeltavsCPU) ; + auto hFlagsEEdeltavsCPU = new TH2D("hFlagsEEdeltavsCPU", "hFlagsEEdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags); setAxisDelta(hFlagsEEdeltavsCPU) ; + + auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last_count, 0, last_count, last_count, 0, last_count); setAxis(hRechitsEBGPUvsCPU ) ; + auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last_count, 0, last_count, last_count, 0, last_count); setAxis(hRechitsEEGPUvsCPU ) ; + auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta); 
setAxisDelta(hRechitsEBdeltavsCPU) ; + auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta); setAxisDelta(hRechitsEEdeltavsCPU) ; + + // input std::cout << "validating file " << fileName << std::endl; TFile rf{fileName.c_str()}; - TTree *rt = (TTree *)rf.Get("Events"); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", - &wgpuEB); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", - &wgpuEE); + TTree *rt = (TTree*)rf.Get("Events"); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE); rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB); rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE); - + constexpr float eps_diff = 1e-3; - + // accumulate auto const nentries = rt->GetEntries(); std::cout << "#events to validate over: " << nentries << std::endl; - for (int ie = 0; ie < nentries; ++ie) { + for (int ie=0; ieGetEntry(ie); - - const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"}; + + const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" }; auto cpu_eb_size = wcpuEB->bareProduct().size(); auto cpu_ee_size = wcpuEE->bareProduct().size(); auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size(); auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size(); + + float eb_ratio = (float) gpu_eb_size/cpu_eb_size; + float ee_ratio = (float) gpu_ee_size/cpu_ee_size; + + // Filling up the histograms on events sizes for EB and EE on both GPU and CPU + 
hRechitsEBGPU->Fill(gpu_eb_size); + hRechitsEBCPU->Fill(cpu_eb_size); + hRechitsEEGPU->Fill(gpu_ee_size); + hRechitsEECPU->Fill(cpu_ee_size); + hRechitsEBGPUvsCPU->Fill(cpu_eb_size, gpu_eb_size); + hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size); + hRechitsEBGPUCPUratio->Fill(eb_ratio); + hRechitsEEGPUCPUratio->Fill(ee_ratio); + hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size-cpu_eb_size); + hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size-cpu_ee_size); + + if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { std::cerr << ie << ordinal[ie % 10] << " entry:\n" - << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size - << " (gpu)\n" - << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size - << " (gpu)" << std::endl; + << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n" + << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl; continue; } - + assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size()); assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size()); auto const neb = wcpuEB->bareProduct().size(); auto const nee = wcpuEE->bareProduct().size(); - - for (uint32_t i = 0; i < neb; ++i) { + + + for (uint32_t i=0; ibareProduct().did[i]; auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i]; auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); if (cpu_iter == wcpuEB->bareProduct().end()) { std::cerr << ie << ordinal[ie % 10] << " entry\n" - << " Did not find a DetId " << did_gpu << " in a CPU collection\n"; + << " Did not find a DetId " << did_gpu + << " in a CPU collection\n"; continue; } auto const soi_amp_cpu = cpu_iter->amplitude(); auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; auto const chi2_cpu = cpu_iter->chi2(); - + + auto const flags_gpu = wgpuEB->bareProduct().flags[i]; + 
auto const flags_cpu = cpu_iter->flags(); + hSOIAmplitudesEBGPU->Fill(soi_amp_gpu); hSOIAmplitudesEBCPU->Fill(soi_amp_cpu); hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); - hSOIAmplitudesEBGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu); + hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); + if (soi_amp_cpu>0) hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); + hChi2EBGPU->Fill(chi2_gpu); hChi2EBCPU->Fill(chi2_cpu); hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); - - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or - std::isnan(chi2_gpu)) { + hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + if (chi2_cpu>0) hChi2EBGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu); + + if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) { + std::cout << " ---- EB " << std::endl; + std::cout << " eventid = " << ie << " xtal = " << i << std::endl; + std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; + std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl; + std::cout << " flags_gpu = " << flags_gpu << " flags_cpu = " << flags_cpu << std::endl; + } + + hFlagsEBGPU->Fill(flags_gpu); + hFlagsEBCPU->Fill(flags_cpu); + hFlagsEBGPUvsCPU->Fill(flags_cpu, flags_gpu); + hFlagsEBdeltavsCPU->Fill(flags_cpu, flags_gpu-flags_cpu); + if (flags_cpu>0) hFlagsEBGPUCPUratio->Fill( (float) flags_gpu/flags_cpu); + + if (flags_cpu!=flags_gpu) { + std::cout << " >> No! 
Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu; + std::cout << std::endl; + } + + if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or + (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu) + or (flags_cpu!=flags_gpu) ) + { printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, - i, - soi_amp_gpu, - soi_amp_cpu, - chi2_gpu, - chi2_cpu); + ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); if (std::isnan(chi2_gpu)) printf("*** nan ***\n"); } } - - for (uint32_t i = 0; i < nee; ++i) { + + for (uint32_t i=0; ibareProduct().did[i]; auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i]; auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); if (cpu_iter == wcpuEE->bareProduct().end()) { std::cerr << ie << ordinal[ie % 10] << " entry\n" - << " did not find a DetId " << did_gpu << " in a CPU collection\n"; + << " did not find a DetId " << did_gpu + << " in a CPU collection\n"; continue; } auto const soi_amp_cpu = cpu_iter->amplitude(); auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; auto const chi2_cpu = cpu_iter->chi2(); - + + auto const flags_gpu = wgpuEE->bareProduct().flags[i]; + auto const flags_cpu = cpu_iter->flags(); + + hSOIAmplitudesEEGPU->Fill(soi_amp_gpu); hSOIAmplitudesEECPU->Fill(soi_amp_cpu); hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); - hSOIAmplitudesEEGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu); + hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); + if (soi_amp_cpu>0) hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); + hChi2EEGPU->Fill(chi2_gpu); hChi2EECPU->Fill(chi2_cpu); hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); - - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or - std::isnan(chi2_gpu)) { + 
hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + if (chi2_cpu>0) hChi2EEGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu); + + if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) { + std::cout << " ---- EE " << std::endl; + std::cout << " eventid = " << ie << " xtal = " << i << std::endl; + std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; + std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl; + std::cout << " flags_gpu = " << flags_gpu << " flags_cpu = " << flags_cpu << std::endl; + } + + hFlagsEEGPU->Fill(flags_gpu); + hFlagsEECPU->Fill(flags_cpu); + hFlagsEEGPUvsCPU->Fill(flags_cpu, flags_gpu); + hFlagsEEdeltavsCPU->Fill(flags_cpu, flags_gpu-flags_cpu); + if (flags_cpu>0) hFlagsEEGPUCPUratio->Fill( (float) flags_gpu/flags_cpu); + + if (flags_cpu!=flags_gpu) { + std::cout << " >> No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu; + std::cout << std::endl; + } + + if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or + (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu) + or (flags_cpu!=flags_gpu) ) + { printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, - static_cast(neb + i), - soi_amp_gpu, - soi_amp_cpu, - chi2_gpu, - chi2_cpu); + ie, static_cast(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); if (std::isnan(chi2_gpu)) printf("*** nan ***\n"); } } } - + { - TCanvas c("plots", "plots", 4200, 6200); - c.Divide(2, 4); - + + + // TCanvas c("plots", "plots", 4200, 6200); + TCanvas c("plots", "plots", 1750, 860); + // c.Divide(2, 3); + c.Divide(3, 2); + + // c.cd(1); c.cd(1); { gPad->SetLogy(); @@ -200,13 +309,14 @@ int main(int argc, char *argv[]) { hSOIAmplitudesEBGPU->SetLineWidth(1.); hSOIAmplitudesEBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats *)hSOIAmplitudesEBGPU->FindObject("stats"); + auto stats = 
(TPaveStats*)hSOIAmplitudesEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2 - y1)); + stats->SetY1NDC(y1 - (y2-y1)); } - c.cd(2); + // c.cd(2); + c.cd(4); { gPad->SetLogy(); hSOIAmplitudesEECPU->SetLineColor(kBlack); @@ -216,41 +326,35 @@ int main(int argc, char *argv[]) { hSOIAmplitudesEEGPU->SetLineWidth(1.); hSOIAmplitudesEEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats *)hSOIAmplitudesEEGPU->FindObject("stats"); + auto stats = (TPaveStats*)hSOIAmplitudesEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2 - y1)); + stats->SetY1NDC(y1 - (y2-y1)); } - c.cd(3); + // c.cd(3); + c.cd(2); + gPad->SetGrid(); hSOIAmplitudesEBGPUvsCPU->Draw("COLZ"); - c.cd(4); - hSOIAmplitudesEEGPUvsCPU->Draw("COLZ"); + // c.cd(4); c.cd(5); - hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); + gPad->SetGrid(); + hSOIAmplitudesEEGPUvsCPU->Draw("COLZ"); + // c.cd(5); + c.cd(3); + // hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); + hSOIAmplitudesEBGPUCPUratio->Draw(""); + // c.cd(6); c.cd(6); - hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); - c.cd(7); - { - gPad->SetLogy(); - hSOIAmplitudesEBGPUCPUratio->SetLineColor(kBlack); - hSOIAmplitudesEBGPUCPUratio->SetLineWidth(1.); - hSOIAmplitudesEBGPUCPUratio->Draw(""); - } - c.cd(8); - { - gPad->SetLogy(); - hSOIAmplitudesEEGPUCPUratio->SetLineColor(kBlack); - hSOIAmplitudesEEGPUCPUratio->SetLineWidth(1.); - hSOIAmplitudesEEGPUCPUratio->Draw(""); - } - - c.SaveAs("ecal-amplitudes.pdf"); - } - { - TCanvas c("plots", "plots", 4200, 6200); - c.Divide(2, 3); - + // hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); + hSOIAmplitudesEEGPUCPUratio->Draw(""); + + c.SaveAs("ecal-amplitudes.root"); + c.SaveAs("ecal-amplitudes.png"); + + // chi2 + + // c.cd(1); c.cd(1); { gPad->SetLogy(); @@ -261,13 +365,14 @@ int main(int argc, char *argv[]) { hChi2EBGPU->SetLineWidth(1.); hChi2EBGPU->Draw("sames"); 
gPad->Update(); - auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats"); + auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2 - y1)); + stats->SetY1NDC(y1 - (y2-y1)); } - c.cd(2); + // c.cd(2); + c.cd(4); { gPad->SetLogy(); hChi2EECPU->SetLineColor(kBlack); @@ -277,27 +382,170 @@ int main(int argc, char *argv[]) { hChi2EEGPU->SetLineWidth(1.); hChi2EEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats"); + auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2 - y1)); + stats->SetY1NDC(y1 - (y2-y1)); } - c.cd(3); + // c.cd(3); + c.cd(2); + gPad->SetGrid(); hChi2EBGPUvsCPU->Draw("COLZ"); - c.cd(4); + // c.cd(4); + c.cd(5); + gPad->SetGrid(); hChi2EEGPUvsCPU->Draw("COLZ"); + // c.cd(5); + c.cd(3); + // hChi2EBdeltavsCPU->Draw("COLZ"); + hChi2EBGPUCPUratio->Draw(""); + // c.cd(6); + c.cd(6); + // hChi2EEdeltavsCPU->Draw("COLZ"); + hChi2EEGPUCPUratio->Draw(""); + + c.SaveAs("ecal-chi2.root"); + c.SaveAs("ecal-chi2.png"); + + + + // flags + + // c.cd(1); + c.cd(1); + { + gPad->SetLogy(); + hFlagsEBCPU->SetLineColor(kBlack); + hFlagsEBCPU->SetLineWidth(1.); + hFlagsEBCPU->Draw(""); + hFlagsEBGPU->SetLineColor(kBlue); + hFlagsEBGPU->SetLineWidth(1.); + hFlagsEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + // c.cd(2); + c.cd(4); + { + gPad->SetLogy(); + hFlagsEECPU->SetLineColor(kBlack); + hFlagsEECPU->SetLineWidth(1.); + hFlagsEECPU->Draw(""); + hFlagsEEGPU->SetLineColor(kBlue); + hFlagsEEGPU->SetLineWidth(1.); + hFlagsEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats"); + auto y2 = 
stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + // c.cd(3); + c.cd(2); + gPad->SetGrid(); + hFlagsEBGPUvsCPU->Draw("COLZ"); + // c.cd(4); c.cd(5); - hChi2EBdeltavsCPU->Draw("COLZ"); + gPad->SetGrid(); + hFlagsEEGPUvsCPU->Draw("COLZ"); + // c.cd(5); + c.cd(3); + // hFlagsEBdeltavsCPU->Draw("COLZ"); + hFlagsEBGPUCPUratio->Draw(""); + + // c.cd(6); c.cd(6); - hChi2EEdeltavsCPU->Draw("COLZ"); - - c.SaveAs("ecal-chi2.pdf"); + // hFlagsEEdeltavsCPU->Draw("COLZ"); + hFlagsEEGPUCPUratio->Draw(""); + + + c.SaveAs("ecal-flags.root"); + c.SaveAs("ecal-flags.png"); + + + + + + + + + + + + + + + TCanvas cRechits("Rechits", "Rechits", 1750, 860); + cRechits.Divide(3, 2); + + // Plotting the sizes of GPU vs CPU for each event of EB + cRechits.cd(1); + { + gPad->SetLogy(); + hRechitsEBCPU->SetLineColor(kRed); + hRechitsEBCPU->SetLineWidth(2); + hRechitsEBCPU->Draw(""); + hRechitsEBGPU->SetLineColor(kBlue); + hRechitsEBGPU->SetLineWidth(2); + hRechitsEBGPU->Draw("sames"); + cRechits.Update(); + auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cRechits.cd(4); + { + gPad->SetLogy(); + hRechitsEECPU->SetLineColor(kRed); + hRechitsEECPU->SetLineWidth(2); + hRechitsEECPU->Draw(""); + hRechitsEEGPU->SetLineColor(kBlue); + hRechitsEEGPU->SetLineWidth(2); + hRechitsEEGPU->Draw("sames"); + cRechits.Update(); + auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cRechits.cd(2); { + hRechitsEBGPUvsCPU->Draw("COLZ"); + } + cRechits.cd(5); { + hRechitsEEGPUvsCPU->Draw("COLZ"); + } + cRechits.cd(3); { + gPad->SetLogy(); + //hRechitsEBdeltavsCPU->Draw("COLZ"); + hRechitsEBGPUCPUratio->Draw(""); + } + cRechits.cd(6); { + gPad->SetLogy(); + 
//hRechitsEEdeltavsCPU->Draw("COLZ"); + hRechitsEEGPUCPUratio->Draw(""); + } + cRechits.SaveAs("ecal-rechits.root"); + cRechits.SaveAs("ecal-rechits.png"); + + + + + + } - + rf.Close(); rfout.Write(); rfout.Close(); - + return 0; } diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h index 419e50b3636c6..a35ef1c57a381 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h @@ -283,9 +283,9 @@ struct conf_data { #include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" #include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" @@ -378,9 +378,9 @@ namespace ecal { // const refs products to conditions struct ConditionsProducts { - EcalADCToGeVConstantGPU::Product const& ADCToGeV ; - EcalIntercalibConstantsGPU::Product const& Intercalib ; - EcalChannelStatusGPU::Product const& ChannelStatus ; + EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV ; + EcalIntercalibConstantsGPU::Product const& Intercalib ; + EcalRechitChannelStatusGPU::Product const& ChannelStatus ; // EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios ; EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef ; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h deleted file mode 100644 index 
4f6cb43eddee0..0000000000000 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h -#define RecoLocalCalo_EcalRecProducers_src_EcalADCToGeVConstantGPU_h - -#include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h" - -#ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" -#endif - -class EcalADCToGeVConstantGPU { -public: - struct Product { - ~Product(); - float *adc2gev = nullptr; - }; - - #ifndef __CUDACC__ - - // - EcalADCToGeVConstantGPU(EcalADCToGeVConstant const&); - - // will call dealloation for Product thru ~Product - ~EcalADCToGeVConstantGPU() = default; - - // get device pointers - Product const& getProduct(cudaStream_t) const; - - // - static std::string name() { return std::string{"ecalADCToGeVConstantGPU"}; } - -private: - // in the future, we need to arrange so to avoid this copy on the host - // store eb first then ee - std::vector> adc2gev_; - - cms::cuda::ESProduct product_; - - #endif -}; - - -#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h deleted file mode 100644 index 0932e7f0641d9..0000000000000 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h -#define RecoLocalCalo_EcalRecProducers_src_EcalChannelStatusGPU_h - -#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" - -#ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" -#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" -#endif - -class EcalChannelStatusGPU { -public: - struct Product { - ~Product(); - uint16_t *status = nullptr; - }; - - #ifndef __CUDACC__ - - // - EcalChannelStatusGPU(EcalChannelStatus 
const&); - - // will call dealloation for Product thru ~Product - ~EcalChannelStatusGPU() = default; - - // get device pointers - Product const& getProduct(cudaStream_t) const; - - // - static std::string name() { return std::string{"ecalChannelStatusGPU"}; } - -private: - // in the future, we need to arrange so to avoid this copy on the host - // store eb first then ee - std::vector> status_; - - cms::cuda::ESProduct product_; - - #endif -}; - - -#endif diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu index c67677055c189..23d9c12aa0582 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu @@ -392,7 +392,7 @@ namespace ecal { 50, offsetForHashes, offsetForInputs); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); } } // namespace v1 diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc deleted file mode 100644 index acddf19fe01c2..0000000000000 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalADCToGeVConstantGPU.cc +++ /dev/null @@ -1,39 +0,0 @@ -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h" - -#include "FWCore/Utilities/interface/typelookup.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -EcalADCToGeVConstantGPU::EcalADCToGeVConstantGPU(EcalADCToGeVConstant const& values) -: adc2gev_(2) // size is 2, one form EB and one for EE -{ - adc2gev_[0] = values.getEBValue(); - adc2gev_[1] = values.getEEValue(); -} - -EcalADCToGeVConstantGPU::Product::~Product() { - // deallocation - cudaCheck( cudaFree(adc2gev) ); -} - -EcalADCToGeVConstantGPU::Product const& EcalADCToGeVConstantGPU::getProduct( - cudaStream_t cudaStream) const -{ - auto const& product = product_.dataForCurrentDeviceAsync( - cudaStream, - [this](EcalADCToGeVConstantGPU::Product& product, 
cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.adc2gev, - this->adc2gev_.size() * sizeof(float)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.adc2gev, - this->adc2gev_.data(), - this->adc2gev_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; -} - -TYPELOOKUP_DATA_REG(EcalADCToGeVConstantGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc deleted file mode 100644 index 91293902bb667..0000000000000 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalChannelStatusGPU.cc +++ /dev/null @@ -1,49 +0,0 @@ -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h" - -#include "FWCore/Utilities/interface/typelookup.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -EcalChannelStatusGPU::EcalChannelStatusGPU(EcalChannelStatus const& values) -: status_(values.size()) -{ - // fill in eb - auto const& barrelValues = values.barrelItems(); - for (unsigned int i=0; istatus_.size() * sizeof(uint16_t)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.status, - this->status_.data(), - this->status_.size() * sizeof(uint16_t), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; -} - -TYPELOOKUP_DATA_REG(EcalChannelStatusGPU); - - diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu index dbfe4833c7d3f..c8d2926b29afc 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalUncalibRecHitMultiFitAlgo_gpu_new.cu @@ -83,7 +83,7 @@ namespace ecal { gainSwitchUseMaxSampleEB, gainSwitchUseMaxSampleEE, totalChannels); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); // // 2d preparation kernel @@ -112,7 +112,7 @@ namespace ecal { scratch.isSaturated, offsetForHashes, offsetForInputs); -// 
cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); // run minimization kernels v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream); @@ -148,7 +148,7 @@ namespace ecal { conditions.sampleMask.getEcalSampleMaskRecordEB(), conditions.sampleMask.getEcalSampleMaskRecordEE(), totalChannels); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); // // TODO: small kernel only for EB. It needs to be checked if @@ -170,7 +170,7 @@ namespace ecal { conditions.sampleMask.getEcalSampleMaskRecordEB(), totalChannels, offsetForInputs); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); // // @@ -186,7 +186,7 @@ namespace ecal { scratch.sum0sNullHypot, scratch.sumAAsNullHypot, totalChannels); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); unsigned int nchannels_per_block_makeratio = 10; unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio; @@ -220,7 +220,7 @@ namespace ecal { configParameters.timeFitLimitsSecondEE, totalChannels, offsetForInputs); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); // // @@ -252,7 +252,7 @@ namespace ecal { scratch.timeError, totalChannels, offsetForInputs); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); // // @@ -298,7 +298,7 @@ namespace ecal { offsetForHashes, offsetForInputs, totalChannels); -// cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); } /* diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc index c9dba159719b3..c2a01e3d5c349 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc @@ -27,9 +27,9 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeBiasCorrectionsGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalTimeCalibConstantsGPU.h" // for rechit 
-#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" @@ -75,8 +75,8 @@ using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU< EcalTimeCalibConstantsRcd >; -using EcalADCToGeVConstantGPUESProducer = EcalESProducerGPU< - EcalADCToGeVConstantGPU, +using EcalRechitADCToGeVConstantGPUESProducer = EcalESProducerGPU< + EcalRechitADCToGeVConstantGPU, EcalADCToGeVConstant, EcalADCToGeVConstantRcd >; @@ -87,8 +87,8 @@ using EcalIntercalibConstantsGPUESProducer = EcalESProducerGPU< EcalIntercalibConstantsRcd >; -using EcalChannelStatusGPUESProducer = EcalESProducerGPU< - EcalChannelStatusGPU, +using EcalRechitChannelStatusGPUESProducer = EcalESProducerGPU< + EcalRechitChannelStatusGPU, EcalChannelStatus, EcalChannelStatusRcd >; @@ -129,9 +129,9 @@ DEFINE_FWK_EVENTSETUP_MODULE(EcalSamplesCorrelationGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalTimeBiasCorrectionsGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalTimeCalibConstantsGPUESProducer); -DEFINE_FWK_EVENTSETUP_MODULE(EcalADCToGeVConstantGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(EcalRechitADCToGeVConstantGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalIntercalibConstantsGPUESProducer); -DEFINE_FWK_EVENTSETUP_MODULE(EcalChannelStatusGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(EcalRechitChannelStatusGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosRefGPUESProducer); 
DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAlphasGPUESProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index a9d4bb9e670f4..bbe05aceda79b 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -36,9 +36,9 @@ // conditions gpu -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalADCToGeVConstantGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalChannelStatusGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" @@ -95,9 +95,9 @@ class EcalRecHitProducerGPU: public edm::stream::EDProducer { // conditions handles - edm::ESHandle ADCToGeVConstantHandle_; - edm::ESHandle IntercalibConstantsHandle_; - edm::ESHandle ChannelStatusHandle_; + edm::ESHandle ADCToGeVConstantHandle_; + edm::ESHandle IntercalibConstantsHandle_; + edm::ESHandle ChannelStatusHandle_; edm::ESHandle LaserAPDPNRatiosHandle_; edm::ESHandle LaserAPDPNRatiosRefHandle_; @@ -401,7 +401,7 @@ void EcalRecHitProducerGPU::acquire( ctx.stream() ); - cudaCheck(cudaGetLastError()); +// cudaCheck(cudaGetLastError()); } diff --git a/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py index 4886238cc620f..dbfa0ca20e5fe 100644 --- a/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py +++ b/RecoLocalCalo/EcalRecProducers/test/ecalRawDecodingAndMultifit.py @@ -68,6 +68,18 @@ #process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi") + 
+process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi") + +process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi") + + + #process.ecalMultiFitUncalibRecHitgpu.algoPSet.threads = cms.vint32(256, 1, 1) diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py index f1b68836b2101..02f84eebf21b3 100644 --- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py +++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py @@ -151,19 +151,23 @@ #process.ecalRecHit - +process.load("RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi") +process.load("RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi") #process.load("RecoLocalCalo.EcalRecProducers.ecalADCToGeVConstantGPUESProducer_cfi") +#process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi") -process.load("RecoLocalCalo.EcalRecProducers.ecalChannelStatusGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi") process.load("RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi") 
-#process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi") -#process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone() +process.load("RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi") +process.ecalRecHitProducerGPU = process.ecalRecHitGPU.clone() + +process.load("RecoLocalCalo.EcalRecProducers.ecalCPURecHitProducer_cfi") + # # AM : TEST to see if the number of rechits matches @@ -244,7 +248,7 @@ process.out = cms.OutputModule( "PoolOutputModule", - fileName = cms.untracked.string("test.root") + fileName = cms.untracked.string("testRechit.root") ) #process.out = cms.OutputModule("AsciiOutputModule", @@ -273,8 +277,8 @@ # gpu *process.ecalUncalibRecHitProducerGPU *process.ecalCPUUncalibRecHitProducer - #*process.ecalRecHitProducerGPU - #*process.ecalCPURecHitProducer + *process.ecalRecHitProducerGPU + *process.ecalCPURecHitProducer ) process.schedule = cms.Schedule( @@ -296,3 +300,7 @@ process.MessageLogger.categories.append("CUDAService") +# +process.DependencyGraph = cms.Service("DependencyGraph") + + From 32b56f616fa7336fe12412fbf792337f140b6476 Mon Sep 17 00:00:00 2001 From: amassiro Date: Wed, 13 May 2020 16:35:35 +0200 Subject: [PATCH 14/30] missing files --- .../bin/makeEcalRechitValidationPlots.cpp | 844 ++++++++++++++++++ .../interface/EcalRechitADCToGeVConstantGPU.h | 43 + .../interface/EcalRechitChannelStatusGPU.h | 43 + .../src/EcalRechitADCToGeVConstantGPU.cc | 39 + .../src/EcalRechitChannelStatusGPU.cc | 52 ++ 5 files changed, 1021 insertions(+) create mode 100644 RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc create mode 100644 RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp 
b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp new file mode 100644 index 0000000000000..4e7718791b603 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp @@ -0,0 +1,844 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/EcalRecHit/interface/EcalRecHit.h" +#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" + +int main(int argc, char *argv[]) { + if (argc<3) { + std::cout << "run with: ./makeEcalRechitValidationPlots \n"; + exit(0); + } + // Set the GPU and CPU pointers for both EB and EE + edm::Wrapper> *wgpuEB=nullptr; + edm::Wrapper> *wgpuEE=nullptr; + edm::Wrapper *wcpuEB = nullptr; + edm::Wrapper *wcpuEE = nullptr; + + std::string fileName = argv[1]; // The input file containing the data to be validated (i.e. result.root) + std::string outFileName = argv[2]; //The output file in which the validation results will be saved (i.e. output.root) + + //output + TFile rfout{outFileName.c_str(), "recreate"}; + + int nbins = 200; + int last = 5000.; + + int nbins_energy = 300; + float last_energy = 2.; + + int nbins_chi2 = 200; + float last_chi2 = 100.; + + int nbins_flag = 40; + // int nbins_flag = 1000; + int last_flag = 1500; + // int nbins_flag = 40; + // int last_flag = 10000; + + int nbins_extra = 200; + int last_extra = 200; + + int nbins_delta = 201; // use an odd number to center around 0 + float delta = 0.2; + + // RecHits plots for EB and EE on both GPU and CPU + auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits. No Filter GPU", nbins, 0, last); + auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits. No Filter GPU", nbins, 0, last); + auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits. 
No Filter GPU", nbins, 0, last); + auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits. No Filter GPU", nbins, 0, last); + auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last); + auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last); + auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05); + auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05); + auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta); + + // RecHits plots for EB and EE on both GPU and CPU + auto hSelectedRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last); + auto hSelectedRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last); + auto hSelectedRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last); + auto hSelectedRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. 
of Rechits", nbins, 0, last); + auto hSelectedRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hSelectedRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hSelectedRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hSelectedRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hSelectedRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hSelectedRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + + // RecHits plots for EB and EE on both GPU and CPU + auto hPositiveRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last); + auto hPositiveRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last); + auto hPositiveRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last); + auto hPositiveRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. 
of Rechits", nbins, 0, last); + auto hPositiveRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hPositiveRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hPositiveRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hPositiveRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hPositiveRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hPositiveRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + + // Energies plots for EB and EE on both GPU and CPU + auto hEnergiesEBGPU = new TH1D("EnergiesEBGPU", "EnergiesEBGPU; Energy [GeV]", nbins_energy, 0, last_energy); + auto hEnergiesEEGPU = new TH1D("EnergiesEEGPU", "EnergiesEEGPU; Energy [GeV]", nbins_energy, 0, last_energy); + auto hEnergiesEBCPU = new TH1D("EnergiesEBCPU", "EnergiesEBCPU; Energy [GeV]", nbins_energy, 0, last_energy); + auto hEnergiesEECPU = new TH1D("EnergiesEECPU", "EnergiesEECPU; Energy [GeV]", nbins_energy, 0, last_energy); + auto hEnergiesEBGPUvsCPU = new TH2D("EnergiesEBGPUvsCPU", "EnergiesEBGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy); + auto hEnergiesEEGPUvsCPU = new TH2D("EnergiesEEGPUvsCPU", "EnergiesEEGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy); + auto hEnergiesEBGPUCPUratio = new TH1D("EnergiesEBGPU/CPUratio", "EnergiesEBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); + auto hEnergiesEEGPUCPUratio = new TH1D("EnergiesEEGPU/CPUratio", "EnergiesEEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); + auto hEnergiesEBdeltavsCPU = new TH2D("EnergiesEBdeltavsCPU", "EnergiesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto 
hEnergiesEEdeltavsCPU = new TH2D("EnergiesEEdeltavsCPU", "EnergiesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + + // Chi2 plots for EB and EE on both GPU and CPU + auto hChi2EBGPU = new TH1D("Chi2EBGPU", "Chi2EBGPU; Ch^{2}", nbins_chi2, 0, last_chi2); + auto hChi2EEGPU = new TH1D("Chi2EEGPU", "Chi2EEGPU; Ch^{2}", nbins_chi2, 0, last_chi2); + auto hChi2EBCPU = new TH1D("Chi2EBCPU", "Chi2EBCPU; Ch^{2}", nbins_chi2, 0, last_chi2); + auto hChi2EECPU = new TH1D("Chi2EECPU", "Chi2EECPU; Ch^{2}", nbins_chi2, 0, last_chi2); + auto hChi2EBGPUvsCPU = new TH2D("Chi2EBGPUvsCPU", "Chi2EBGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100); + auto hChi2EEGPUvsCPU = new TH2D("Chi2EEGPUvsCPU", "Chi2EEGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100); + auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); + auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); + auto hChi2EBdeltavsCPU = new TH2D("Chi2EBdeltavsCPU", "Chi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + auto hChi2EEdeltavsCPU = new TH2D("Chi2EEdeltavsCPU", "Chi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + + // Flags plots for EB and EE on both GPU and CPU + auto hFlagsEBGPU = new TH1D("FlagsEBGPU", "FlagsEBGPU; Flags", nbins_flag, -10, last_flag); + auto hFlagsEBCPU = new TH1D("FlagsEBCPU", "FlagsEBCPU; Flags", nbins_flag, -10, last_flag); + auto hFlagsEEGPU = new TH1D("FlagsEEGPU", "FlagsEEGPU; Flags", nbins_flag, -10, last_flag); + auto hFlagsEECPU = new TH1D("FlagsEECPU", "FlagsEECPU; Flags", nbins_flag, -10, last_flag); + auto hFlagsEBGPUvsCPU = new TH2D("FlagsEBGPUvsCPU", "FlagsEBGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag); + auto hFlagsEEGPUvsCPU = new TH2D("FlagsEEGPUvsCPU", "FlagsEEGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag); + auto hFlagsEBGPUCPUratio = new 
TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 50, -5, 10); + auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 50, -5, 10); + auto hFlagsEBdeltavsCPU = new TH2D("FlagsEBdeltavsCPU", "FlagsEBdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta); + auto hFlagsEEdeltavsCPU = new TH2D("FlagsEEdeltavsCPU", "FlagsEEdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta); + + // Extras plots for EB and EE on both GPU and CPU + auto hExtrasEBGPU = new TH1D("ExtrasEBGPU", "ExtrasEBGPU; No. of Extras", nbins_extra, 0, last_extra); + auto hExtrasEBCPU = new TH1D("ExtrasEBCPU", "ExtrasEBCPU; No. of Extras", nbins_extra, 0, last_extra); + auto hExtrasEEGPU = new TH1D("ExtrasEEGPU", "ExtrasEEGPU; No. of Extras", nbins_extra, 0, last_extra); + auto hExtrasEECPU = new TH1D("ExtrasEECPU", "ExtrasEECPU; No. of Extras", nbins_extra, 0, last_extra); + auto hExtrasEBGPUvsCPU = new TH2D("ExtrasEBGPUvsCPU", "ExtrasEBGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra,nbins_extra, 0, last_extra); + auto hExtrasEEGPUvsCPU = new TH2D("ExtrasEEGPUvsCPU", "ExtrasEEGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra); + auto hExtrasEBGPUCPUratio = new TH1D("ExtrasEBGPU/CPUratio", "ExtrasEBGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0); + auto hExtrasEEGPUCPUratio = new TH1D("ExtrasEEGPU/CPUratio", "ExtrasEEGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0); + auto hExtrasEBdeltavsCPU = new TH2D("ExtrasEBdeltavsCPU", "ExtrasEBdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta); + auto hExtrasEEdeltavsCPU = new TH2D("ExtrasEEdeltavsCPU", "ExtrasEEdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta); + + // input file setup for tree + std::cout << "validating file " << fileName << std::endl; + TFile rf{fileName.c_str()}; + TTree *rt = (TTree*)rf.Get("Events"); + + // Allocating the appropriate data to their respective pointers + 
rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEB_RECO.", &wgpuEB); + rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEE_RECO.", &wgpuEE); + rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEB_RECO.", &wcpuEB); + rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEE_RECO.", &wcpuEE); + + constexpr float eps_diff = 1e-3; + + // accumulate sizes for events and sizes of each event on both GPU and CPU + // auto const nentries = rt->GetEntries(); + int nentries = rt->GetEntries(); + + //---- AM: tests + if (nentries > 1000) { + nentries = 1000; + } + // nentries = 1; + + std::cout << "#events to validate over: " << nentries << std::endl; + for (int ie=0; ieGetEntry(ie); + + // const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" }; + auto cpu_eb_size = wcpuEB->bareProduct().size(); + auto cpu_ee_size = wcpuEE->bareProduct().size(); + auto gpu_eb_size = wgpuEB->bareProduct().energy.size(); + auto gpu_ee_size = wgpuEE->bareProduct().energy.size(); + float eb_ratio = (float) gpu_eb_size/cpu_eb_size; + float ee_ratio = (float) gpu_ee_size/cpu_ee_size; + + // Filling up the histograms on events sizes for EB and EE on both GPU and CPU + hRechitsEBGPU->Fill(gpu_eb_size); + hRechitsEBCPU->Fill(cpu_eb_size); + hRechitsEEGPU->Fill(gpu_ee_size); + hRechitsEECPU->Fill(cpu_ee_size); + hRechitsEBGPUvsCPU->Fill(cpu_eb_size, gpu_eb_size); + hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size); + hRechitsEBGPUCPUratio->Fill(eb_ratio); + hRechitsEEGPUCPUratio->Fill(ee_ratio); + hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size-cpu_eb_size); + hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size-cpu_ee_size); + + /* + * // condition that sizes on GPU and CPU should be the same for EB or EE + * if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { + * std::cerr << ie << ordinal[ie % 10] << " entry:\n" + * << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << 
std::setw(4) << gpu_eb_size << " (gpu)\n" + * << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl; + * + * continue; + } + assert(wgpuEB->bareProduct().energy.size() == wcpuEB->bareProduct().size()); + assert(wgpuEE->bareProduct().energy.size() == wcpuEE->bareProduct().size()); + auto const neb = wcpuEB->bareProduct().size(); //like cpu_eb_size but set to constant + auto const nee = wcpuEE->bareProduct().size(); //like cpu_ee_size but set to constant + */ + + uint selected_gpu_eb_size = 0; + uint selected_gpu_ee_size = 0; + + uint positive_gpu_eb_size = 0; + uint positive_gpu_ee_size = 0; + + // EB: + for (uint32_t i=0; ibareProduct().did[i]; // set the did for the current RecHit + // Set the variables for GPU + auto const enr_gpu = wgpuEB->bareProduct().energy[i]; + auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; + auto const flag_gpu = wgpuEB->bareProduct().flagBits[i]; + auto const extra_gpu = wgpuEB->bareProduct().extra[i]; + + // you have "-1" if the crystal is not selected + if ( enr_gpu>=0 ) { + selected_gpu_eb_size++; + + if ( enr_gpu>0 ) { + positive_gpu_eb_size++; + } + + // find the Rechit on CPU reflecting the same did + auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); + if (cpu_iter == wcpuEB->bareProduct().end()) { + // std::cerr << ie << ordinal[ie % 10] << " entry\n" + // << " Did not find a DetId " << did_gpu_eb + // << " in a CPU collection\n"; + std::cerr << " Did not find a DetId " << did_gpu << " in a CPU collection\n"; + continue; + } + // Set the variables for CPU + auto const enr_cpu = cpu_iter->energy(); + auto const chi2_cpu = cpu_iter->chi2(); +// auto const flag_cpu = cpu_iter->flagBits(); + auto const flag_cpu = 1; +// auto const extra_cpu = cpu_iter->extra(); + auto const extra_cpu = 1; + // auto const flag_cpu = cpu_iter->flagBits() ? cpu_iter->flagBits():-1; + // auto const extra_cpu = cpu_iter->extra() ? 
cpu_iter->extra():-1; + + // AM: TEST + // if (extra_cpu != 10) continue; + + // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta + hEnergiesEBGPU->Fill(enr_gpu); + hEnergiesEBCPU->Fill(enr_cpu); + // std::cout<<"EB CPU Energy:\t"<Fill(enr_cpu, enr_gpu); + hEnergiesEBGPUCPUratio->Fill(enr_gpu/enr_cpu); + hEnergiesEBdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu); + + hChi2EBGPU->Fill(chi2_gpu); + hChi2EBCPU->Fill(chi2_cpu); + hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); + hChi2EBGPUCPUratio->Fill(chi2_gpu/chi2_cpu); + hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + + hFlagsEBGPU->Fill(flag_gpu); + hFlagsEBCPU->Fill(flag_cpu); + hFlagsEBGPUvsCPU->Fill(flag_cpu, flag_gpu); + hFlagsEBGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1); + hFlagsEBdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu); + + hExtrasEBGPU->Fill(extra_gpu); + hExtrasEBCPU->Fill(extra_cpu); + hExtrasEBGPUvsCPU->Fill(extra_cpu, extra_gpu); + hExtrasEBGPUCPUratio->Fill(extra_cpu ? extra_gpu/extra_cpu : -1); + hExtrasEBdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu); + + // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message + // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or + // (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)) + // { + // printf("EB eventid = %d chid = %d energy_gpu = %f energy_cpu %f chi2_gpu = %f chi2_cpu = %f\n", + // ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu); + // if (std::isnan(chi2_gpu)) + // printf("*** nan ***\n"); + // } + + } + } + + // EE: + for (uint32_t i=0; ibareProduct().did[i]; // set the did for the current RecHit + // Set the variables for GPU + auto const enr_gpu = wgpuEE->bareProduct().energy[i]; + auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; + auto const flag_gpu = wgpuEE->bareProduct().flagBits[i]; + auto const extra_gpu = wgpuEE->bareProduct().extra[i]; + + // you have "-1" if the crystal is not selected + if ( enr_gpu>=0 ) { + 
selected_gpu_ee_size++; + + if ( enr_gpu>0 ) { + positive_gpu_ee_size++; + } + + // find the Rechit on CPU reflecting the same did + auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); + if (cpu_iter == wcpuEE->bareProduct().end()) { + // std::cerr << ie << ordinal[ie % 10] << " entry\n" + // << " Did not find a DetId " << did_gpu + // << " in a CPU collection\n"; + std::cerr << " Did not find a DetId " << did_gpu << " in a CPU collection\n"; + continue; + } + // Set the variables for CPU + auto const enr_cpu = cpu_iter->energy(); + auto const chi2_cpu = cpu_iter->chi2(); +// auto const flag_cpu = cpu_iter->flagBits(); + auto const flag_cpu = 1; +// auto const extra_cpu = cpu_iter->extra(); + auto const extra_cpu = 1; + // auto const flag_cpu = cpu_iter->flagBits()?cpu_iter->flagBits():-1; + // auto const extra_cpu = cpu_iter->extra()?cpu_iter->extra():-1; + + + // AM: TEST + // if (extra_cpu != 10) continue; + + + // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta + hEnergiesEEGPU->Fill(enr_gpu); + hEnergiesEECPU->Fill(enr_cpu); + hEnergiesEEGPUvsCPU->Fill(enr_cpu, enr_gpu); + hEnergiesEEGPUCPUratio->Fill(enr_gpu/enr_cpu); + hEnergiesEEdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu); + + hChi2EEGPU->Fill(chi2_gpu); + hChi2EECPU->Fill(chi2_cpu); + hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); + hChi2EEGPUCPUratio->Fill(chi2_gpu/chi2_cpu); + hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); + + hFlagsEEGPU->Fill(flag_gpu); + hFlagsEECPU->Fill(flag_cpu); + hFlagsEEGPUvsCPU->Fill(flag_cpu, flag_gpu); + hFlagsEEGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1); + hFlagsEEdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu); + + hExtrasEEGPU->Fill(extra_gpu); + hExtrasEECPU->Fill(extra_cpu); + hExtrasEEGPUvsCPU->Fill(extra_cpu, extra_gpu); + hExtrasEEGPUCPUratio->Fill(extra_cpu ? 
extra_gpu/extra_cpu : -1); + hExtrasEEdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu); + + // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message + // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or + // (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)) + // { + // printf("EE eventid = %d chid = %d energy_gpu = %f energy_cpu %f chi2_gpu = %f chi2_cpu = %f\n", + // ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu); + // if (std::isnan(chi2_gpu)) + // printf("*** nan ***\n"); + // } + } + } + + + // + // now the rechit counting + // + float selected_eb_ratio = (float) selected_gpu_eb_size/cpu_eb_size; + float selected_ee_ratio = (float) selected_gpu_ee_size/cpu_ee_size; + + // Filling up the histograms on events sizes for EB and EE on both GPU and CPU + hSelectedRechitsEBGPU->Fill(selected_gpu_eb_size); + hSelectedRechitsEBCPU->Fill(cpu_eb_size); + hSelectedRechitsEEGPU->Fill(selected_gpu_ee_size); + hSelectedRechitsEECPU->Fill(cpu_ee_size); + hSelectedRechitsEBGPUvsCPU->Fill(cpu_eb_size, selected_gpu_eb_size); + hSelectedRechitsEEGPUvsCPU->Fill(cpu_ee_size, selected_gpu_ee_size); + hSelectedRechitsEBGPUCPUratio->Fill(selected_eb_ratio); + hSelectedRechitsEEGPUCPUratio->Fill(selected_ee_ratio); + hSelectedRechitsEBdeltavsCPU->Fill(cpu_eb_size, selected_gpu_eb_size-cpu_eb_size); + hSelectedRechitsEEdeltavsCPU->Fill(cpu_ee_size, selected_gpu_ee_size-cpu_ee_size); + + + // + // now the rechit counting + // + + + uint positive_cpu_eb_size = 0; + uint positive_cpu_ee_size = 0; + + // EB: + for (uint32_t i=0; ibareProduct()[i].energy(); + if (enr_cpu > 0) { + positive_cpu_eb_size++; + } + } + // EE: + for (uint32_t i=0; ibareProduct()[i].energy(); + if (enr_cpu > 0) { + positive_cpu_ee_size++; + } + } + + + float positive_eb_ratio = (float) positive_gpu_eb_size/positive_cpu_eb_size; + float positive_ee_ratio = (float) positive_gpu_ee_size/positive_cpu_ee_size; + + // Filling up the histograms on 
events sizes for EB and EE on both GPU and CPU + hPositiveRechitsEBGPU->Fill(positive_gpu_eb_size); + hPositiveRechitsEBCPU->Fill(positive_cpu_eb_size); + hPositiveRechitsEEGPU->Fill(positive_gpu_ee_size); + hPositiveRechitsEECPU->Fill(positive_cpu_ee_size); + hPositiveRechitsEBGPUvsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size); + hPositiveRechitsEEGPUvsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size); + hPositiveRechitsEBGPUCPUratio->Fill(positive_eb_ratio); + hPositiveRechitsEEGPUCPUratio->Fill(positive_ee_ratio); + hPositiveRechitsEBdeltavsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size-positive_cpu_eb_size); + hPositiveRechitsEEdeltavsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size-positive_cpu_ee_size); + + + + if (cpu_eb_size != selected_gpu_eb_size or cpu_ee_size != selected_gpu_ee_size) { + // std::cerr << ie << ordinal[ie % 10] << " entry:\n" + std::cerr << ie << " entry:\n" + << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << selected_gpu_eb_size << " (gpu)\n" + << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << selected_gpu_ee_size << " (gpu)" << std::endl; + } + + + + } + + + + + // Plotting the results: + { + // Canvases Setup: + TCanvas cAllRechits("AllRechits", "AllRechits", 1750, 860); + cAllRechits.Divide(3, 2); + TCanvas cRechits("Rechits", "Rechits", 1750, 860); + cRechits.Divide(3, 2); + TCanvas cRechitsPositive("RechitsPositive", "RechitsPositive", 1750, 860); + cRechitsPositive.Divide(3, 2); + TCanvas cEnergies("Energies", "Energies", 1750, 860); + cEnergies.Divide(3, 2); + TCanvas cChi2("Chi2", "Chi2", 1750, 860); + cChi2.Divide(3, 2); + TCanvas cFlags("Flags", "Flags", 1750, 860); + cFlags.Divide(3, 2); + TCanvas cExtras("Extras", "Extras", 1750, 860); + cExtras.Divide(3, 2); + + + + // Plotting the sizes of GPU vs CPU for each event of EB + cAllRechits.cd(1); + { + gPad->SetLogy(); + hRechitsEBCPU->SetLineColor(kRed); + hRechitsEBCPU->SetLineWidth(2); + 
hRechitsEBCPU->Draw(""); + hRechitsEBGPU->SetLineColor(kBlue); + hRechitsEBGPU->SetLineWidth(2); + hRechitsEBGPU->Draw("sames"); + cAllRechits.Update(); + auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cAllRechits.cd(4); + { + gPad->SetLogy(); + hRechitsEECPU->SetLineColor(kRed); + hRechitsEECPU->SetLineWidth(2); + hRechitsEECPU->Draw(""); + hRechitsEEGPU->SetLineColor(kBlue); + hRechitsEEGPU->SetLineWidth(2); + hRechitsEEGPU->Draw("sames"); + cAllRechits.Update(); + auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cAllRechits.cd(2); { + gStyle->SetPalette(55); + hRechitsEBGPUvsCPU->Draw("COLZ"); + } + cAllRechits.cd(5); { + gStyle->SetPalette(55); + hRechitsEEGPUvsCPU->Draw("COLZ"); + } + cAllRechits.cd(3); { + gPad->SetLogy(); + //hRechitsEBdeltavsCPU->Draw("COLZ"); + hRechitsEBGPUCPUratio->Draw(""); + } + cAllRechits.cd(6); { + gPad->SetLogy(); + //hRechitsEEdeltavsCPU->Draw("COLZ"); + hRechitsEEGPUCPUratio->Draw(""); + } + cAllRechits.SaveAs("ecal-allrechits.root"); + cAllRechits.SaveAs("ecal-allrechits.png"); + + + + // Plotting the sizes of GPU vs CPU for each event of EB + cRechits.cd(1); + { + gPad->SetLogy(); + hSelectedRechitsEBCPU->SetLineColor(kRed); + hSelectedRechitsEBCPU->SetLineWidth(2); + hSelectedRechitsEBCPU->Draw(""); + hSelectedRechitsEBGPU->SetLineColor(kBlue); + hSelectedRechitsEBGPU->SetLineWidth(2); + hSelectedRechitsEBGPU->Draw("sames"); + cRechits.Update(); + auto stats = (TPaveStats*)hSelectedRechitsEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cRechits.cd(4); + { + gPad->SetLogy(); + hSelectedRechitsEECPU->SetLineColor(kRed); + 
hSelectedRechitsEECPU->SetLineWidth(2); + hSelectedRechitsEECPU->Draw(""); + hSelectedRechitsEEGPU->SetLineColor(kBlue); + hSelectedRechitsEEGPU->SetLineWidth(2); + hSelectedRechitsEEGPU->Draw("sames"); + cRechits.Update(); + auto stats = (TPaveStats*)hSelectedRechitsEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cRechits.cd(2); { + gStyle->SetPalette(55); + hSelectedRechitsEBGPUvsCPU->Draw("COLZ"); + } + cRechits.cd(5); { + gStyle->SetPalette(55); + hSelectedRechitsEEGPUvsCPU->Draw("COLZ"); + } + cRechits.cd(3); { + gPad->SetLogy(); + //hSelectedRechitsEBdeltavsCPU->Draw("COLZ"); + hSelectedRechitsEBGPUCPUratio->Draw(""); + } + cRechits.cd(6); { + gPad->SetLogy(); + //hSelectedRechitsEEdeltavsCPU->Draw("COLZ"); + hSelectedRechitsEEGPUCPUratio->Draw(""); + } + cRechits.SaveAs("ecal-rechits.root"); + cRechits.SaveAs("ecal-rechits.png"); + + + + + // Plotting the sizes of GPU vs CPU for each event of EB + cRechitsPositive.cd(1); + { + gPad->SetLogy(); + hPositiveRechitsEBCPU->SetLineColor(kRed); + hPositiveRechitsEBCPU->SetLineWidth(2); + hPositiveRechitsEBCPU->Draw(""); + hPositiveRechitsEBGPU->SetLineColor(kBlue); + hPositiveRechitsEBGPU->SetLineWidth(2); + hPositiveRechitsEBGPU->Draw("sames"); + cRechitsPositive.Update(); + auto stats = (TPaveStats*)hPositiveRechitsEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cRechitsPositive.cd(4); + { + gPad->SetLogy(); + hPositiveRechitsEECPU->SetLineColor(kRed); + hPositiveRechitsEECPU->SetLineWidth(2); + hPositiveRechitsEECPU->Draw(""); + hPositiveRechitsEEGPU->SetLineColor(kBlue); + hPositiveRechitsEEGPU->SetLineWidth(2); + hPositiveRechitsEEGPU->Draw("sames"); + cRechitsPositive.Update(); + auto stats = (TPaveStats*)hPositiveRechitsEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = 
stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cRechitsPositive.cd(2); { + gStyle->SetPalette(55); + hPositiveRechitsEBGPUvsCPU->Draw("COLZ"); + } + cRechitsPositive.cd(5); { + gStyle->SetPalette(55); + hPositiveRechitsEEGPUvsCPU->Draw("COLZ"); + } + cRechitsPositive.cd(3); { + gPad->SetLogy(); + //hPositiveRechitsEBdeltavsCPU->Draw("COLZ"); + hPositiveRechitsEBGPUCPUratio->Draw(""); + } + cRechitsPositive.cd(6); { + gPad->SetLogy(); + //hPositiveRechitsEEdeltavsCPU->Draw("COLZ"); + hPositiveRechitsEEGPUCPUratio->Draw(""); + } + cRechitsPositive.SaveAs("ecal-rechits-positive.root"); + cRechitsPositive.SaveAs("ecal-rechits-positive.png"); + + + cEnergies.cd(1); + { + gPad->SetLogy(); + hEnergiesEBCPU->SetLineColor(kBlack); + hEnergiesEBCPU->SetLineWidth(2); + hEnergiesEBCPU->Draw(""); + hEnergiesEBGPU->SetLineColor(kBlue); + hEnergiesEBGPU->SetLineWidth(2); + hEnergiesEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hEnergiesEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cEnergies.cd(4); + { + gPad->SetLogy(); + hEnergiesEECPU->SetLineColor(kBlack); + hEnergiesEECPU->SetLineWidth(2); + hEnergiesEECPU->Draw(""); + hEnergiesEEGPU->SetLineColor(kBlue); + hEnergiesEEGPU->SetLineWidth(2); + hEnergiesEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hEnergiesEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cEnergies.cd(2); { + hEnergiesEBGPUvsCPU->Draw("COLZ"); + } + cEnergies.cd(5); { + hEnergiesEEGPUvsCPU->Draw("COLZ"); + } + cEnergies.cd(3); { + gPad->SetLogy(); + //hEnergiesEBdeltavsCPU->Draw("COLZ"); + hEnergiesEBGPUCPUratio->Draw(""); + } + cEnergies.cd(6); { + gPad->SetLogy(); + //hEnergiesEEdeltavsCPU->Draw("COLZ"); + hEnergiesEEGPUCPUratio->Draw(""); + } + 
cEnergies.SaveAs("ecal-energies.root"); + cEnergies.SaveAs("ecal-energies.png"); + + + cChi2.cd(1); + { + gPad->SetLogy(); + hChi2EBCPU->SetLineColor(kBlack); + hChi2EBCPU->SetLineWidth(2); + hChi2EBCPU->Draw(""); + hChi2EBGPU->SetLineColor(kBlue); + hChi2EBGPU->SetLineWidth(2); + hChi2EBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cChi2.cd(4); + { + gPad->SetLogy(); + hChi2EECPU->SetLineColor(kBlack); + hChi2EECPU->SetLineWidth(2); + hChi2EECPU->Draw(""); + hChi2EEGPU->SetLineColor(kBlue); + hChi2EEGPU->SetLineWidth(2); + hChi2EEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cChi2.cd(2); { + hChi2EBGPUvsCPU->Draw("COLZ"); + } + cChi2.cd(5); { + hChi2EEGPUvsCPU->Draw("COLZ"); + } + cChi2.cd(3); { + gPad->SetLogy(); + //hChi2EBdeltavsCPU->Draw("COLZ"); + hChi2EBGPUCPUratio->Draw(""); + } + cChi2.cd(6); { + gPad->SetLogy(); + //hChi2EEdeltavsCPU->Draw("COLZ"); + hChi2EEGPUCPUratio->Draw(""); + } + cChi2.SaveAs("ecal-chi2.root"); + cChi2.SaveAs("ecal-chi2.png"); + + + cFlags.cd(1); + { + gPad->SetLogy(); + hFlagsEBCPU->SetLineColor(kBlack); + hFlagsEBCPU->SetLineWidth(2); + hFlagsEBCPU->Draw(""); + hFlagsEBGPU->SetLineColor(kBlue); + hFlagsEBGPU->SetLineWidth(2); + hFlagsEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cFlags.cd(4); + { + gPad->SetLogy(); + hFlagsEECPU->SetLineColor(kBlack); + hFlagsEECPU->SetLineWidth(2); + hFlagsEECPU->Draw(""); + hFlagsEEGPU->SetLineColor(kBlue); + hFlagsEEGPU->SetLineWidth(2); + hFlagsEEGPU->Draw("sames"); + 
gPad->Update(); + auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cFlags.cd(2); { + hFlagsEBGPUvsCPU->Draw("COLZ"); + } + cFlags.cd(5); { + hFlagsEEGPUvsCPU->Draw("COLZ"); + } + cFlags.cd(3); { + gPad->SetLogy(); + //hFlagsEBdeltavsCPU->Draw("COLZ"); + hFlagsEBGPUCPUratio->Draw(""); + } + cFlags.cd(6); { + gPad->SetLogy(); + //hFlagsEEdeltavsCPU->Draw("COLZ"); + hFlagsEEGPUCPUratio->Draw(""); + } + cFlags.SaveAs("ecal-flags.root"); + cFlags.SaveAs("ecal-flags.png"); + + + cExtras.cd(1); + { + gPad->SetLogy(); + hExtrasEBCPU->SetLineColor(kBlack); + hExtrasEBCPU->SetLineWidth(2); + hExtrasEBCPU->Draw(""); + hExtrasEBGPU->SetLineColor(kBlue); + hExtrasEBGPU->SetLineWidth(2); + hExtrasEBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hExtrasEBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cExtras.cd(4); + { + gPad->SetLogy(); + hExtrasEECPU->SetLineColor(kBlack); + hExtrasEECPU->SetLineWidth(2); + hExtrasEECPU->Draw(""); + hExtrasEEGPU->SetLineColor(kBlue); + hExtrasEEGPU->SetLineWidth(2); + hExtrasEEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hExtrasEEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2-y1)); + } + cExtras.cd(2); { + hExtrasEBGPUvsCPU->Draw("COLZ"); + } + cExtras.cd(5); { + hExtrasEEGPUvsCPU->Draw("COLZ"); + } + cExtras.cd(3); { + gPad->SetLogy(); + //hExtrasEBdeltavsCPU->Draw("COLZ"); + hExtrasEBGPUCPUratio->Draw(""); + } + cExtras.cd(6); { + gPad->SetLogy(); + //hExtrasEEdeltavsCPU->Draw("COLZ"); + hExtrasEEGPUCPUratio->Draw(""); + } + cExtras.SaveAs("ecal-extras.root"); + cExtras.SaveAs("ecal-extras.png"); + } + + // Close all open files + rf.Close(); + rfout.Write(); + rfout.Close(); + + 
return 0; +} diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h new file mode 100644 index 0000000000000..8addc316f366d --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h @@ -0,0 +1,43 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalRechitADCToGeVConstantGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalRechitADCToGeVConstantGPU_h + +#include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalRechitADCToGeVConstantGPU { +public: + struct Product { + ~Product(); + float *adc2gev = nullptr; + }; + + #ifndef __CUDACC__ + + // + EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const&); + + // will call dealloation for Product thru ~Product + ~EcalRechitADCToGeVConstantGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // + static std::string name() { return std::string{"ecalRechitADCToGeVConstantGPU"}; } + +private: + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> adc2gev_; + + cms::cuda::ESProduct product_; + + #endif +}; + + +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h new file mode 100644 index 0000000000000..2329b3752089d --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h @@ -0,0 +1,43 @@ +#ifndef RecoLocalCalo_EcalRecProducers_src_EcalRechitChannelStatusGPU_h +#define RecoLocalCalo_EcalRecProducers_src_EcalRechitChannelStatusGPU_h + +#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" + +#ifndef __CUDACC__ +#include 
"HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class EcalRechitChannelStatusGPU { +public: + struct Product { + ~Product(); + uint16_t *status = nullptr; + }; + + #ifndef __CUDACC__ + + // + EcalRechitChannelStatusGPU(EcalChannelStatus const&); + + // will call dealloation for Product thru ~Product + ~EcalRechitChannelStatusGPU() = default; + + // get device pointers + Product const& getProduct(cudaStream_t) const; + + // + static std::string name() { return std::string{"ecalRechitChannelStatusGPU"}; } + +private: + // in the future, we need to arrange so to avoid this copy on the host + // store eb first then ee + std::vector> status_; + + cms::cuda::ESProduct product_; + + #endif +}; + + +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc new file mode 100644 index 0000000000000..3824b0989f622 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc @@ -0,0 +1,39 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalRechitADCToGeVConstantGPU::EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const& values) +: adc2gev_(2) // size is 2, one form EB and one for EE +{ + adc2gev_[0] = values.getEBValue(); + adc2gev_[1] = values.getEEValue(); +} + +EcalRechitADCToGeVConstantGPU::Product::~Product() { + // deallocation + cudaCheck( cudaFree(adc2gev) ); +} + +EcalRechitADCToGeVConstantGPU::Product const& EcalRechitADCToGeVConstantGPU::getProduct( + cudaStream_t cudaStream) const +{ + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, + [this](EcalRechitADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( cudaMalloc((void**)&product.adc2gev, + 
this->adc2gev_.size() * sizeof(float)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.adc2gev, + this->adc2gev_.data(), + this->adc2gev_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalRechitADCToGeVConstantGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc new file mode 100644 index 0000000000000..53f32df6f9697 --- /dev/null +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc @@ -0,0 +1,52 @@ +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) +: status_(values.size()) +{ + + std::cout << " I am running EcalRechitChannelStatusGPU ::>> debug ongoing ... " << std::endl; + + // fill in eb + auto const& barrelValues = values.barrelItems(); + for (unsigned int i=0; istatus_.size() * sizeof(uint16_t)) ); + // transfer + cudaCheck( cudaMemcpyAsync(product.status, + this->status_.data(), + this->status_.size() * sizeof(uint16_t), + cudaMemcpyHostToDevice, + cudaStream) ); + } + ); + + return product; + } + + TYPELOOKUP_DATA_REG(EcalRechitChannelStatusGPU); + \ No newline at end of file From f159c2922c2c5b2f1e2608e7639a53fd0678f934 Mon Sep 17 00:00:00 2001 From: amassiro Date: Fri, 15 May 2020 17:58:11 +0200 Subject: [PATCH 15/30] fixes as for PR and clang-tidy --- .../interface/EcalChannelStatusCode.h | 19 +-------- EventFilter/EcalRawToDigi/BuildFile.xml | 10 ++--- .../EcalRecAlgos/interface/DeclsForKernels.h | 41 +++++++++++-------- .../src/EcalRecHitBuilderKernels.cu | 24 ++++++----- .../src/EcalRecHitBuilderKernels.h | 4 +- .../src/EcalRechitChannelStatusGPU.cc | 5 +-- .../EcalRecAlgos/src/KernelHelpers.cu | 6 +-- 
RecoLocalCalo/EcalRecProducers/BuildFile.xml | 1 - .../EcalRecProducers/plugins/BuildFile.xml | 2 - .../plugins/EcalCPURecHitProducer.cc | 3 +- .../plugins/EcalRecHitProducerGPU.cc | 7 +--- .../test/testEcalRechitProducer_cfg.py | 7 +++- .../test/testEcalUncalibRechitProducer_cfg.py | 2 - 13 files changed, 59 insertions(+), 72 deletions(-) diff --git a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h index a52868fe0d8df..09202950bfc68 100644 --- a/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h +++ b/CondFormats/EcalObjects/interface/EcalChannelStatusCode.h @@ -5,7 +5,6 @@ * Created: 14 Nov 2006 **/ - #include "CondFormats/Serialization/interface/Serializable.h" #include @@ -17,10 +16,7 @@ */ class EcalChannelStatusCode { - - public: - enum Code { kOk = 0, kDAC, @@ -39,22 +35,12 @@ class EcalChannelStatusCode { kNoDataNoTP }; - enum Bits { - kHV=0, - kLV, - kDAQ, - kTP, - kTrigger, - kTemperature, - kNextToDead - }; - + enum Bits { kHV = 0, kLV, kDAQ, kTP, kTrigger, kTemperature, kNextToDead }; + public: - EcalChannelStatusCode() : status_(0) {} EcalChannelStatusCode(const uint16_t& encodedStatus) : status_(encodedStatus){}; - void print(std::ostream& s) const { s << "status is: " << status_; } /// return decoded status @@ -69,7 +55,6 @@ class EcalChannelStatusCode { static const int chStatusMask = 0x1F; private: - static const int kBitsOffset = 5; /* bits 1-5 store a status code: 0 channel ok diff --git a/EventFilter/EcalRawToDigi/BuildFile.xml b/EventFilter/EcalRawToDigi/BuildFile.xml index b23c65ec201e4..da28405324833 100644 --- a/EventFilter/EcalRawToDigi/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/BuildFile.xml @@ -1,4 +1,6 @@ + + @@ -14,15 +16,11 @@ + + - - - - - - diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h index a35ef1c57a381..92d4bee3100f3 100644 --- 
a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h @@ -11,6 +11,10 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" +// +// ECAL UncalibRechit producer +// + #include "CondFormats/EcalObjects/interface/EcalWeightSet.h" #include "CondFormats/EcalObjects/interface/EcalPedestals.h" #include "CondFormats/EcalObjects/interface/EcalGainRatios.h" @@ -27,6 +31,27 @@ #include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" +// +// ECAL Rechit producer +// + +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" + +#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" +#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h" + +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" + +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" +#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" + + + + struct EcalPulseShape; class EcalSampleMask; class EcalTimeBiasCorrections; @@ -278,22 +303,6 @@ struct conf_data { // ECAL Rechit producer // -#include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" - -#include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" -#include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h" - -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h" - -#include 
"RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" -#include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" - - - namespace ecal { namespace rechit { diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index 5c50bdaa58f7f..c4f3d22dd0a1d 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -143,7 +143,7 @@ namespace ecal { uint32_t* extra, // other int const nchannels, - uint32_t const offsetForInput, + uint32_t const nChannelsBarrel, uint32_t const offsetForHashes ) { @@ -155,37 +155,39 @@ namespace ecal { int ch = threadIdx.x + blockDim.x*blockIdx.x; if (ch < nchannels) { + + bool isEndcap = (ch >= nChannelsBarrel); - int const inputCh = ch >= offsetForInput - ? ch - offsetForInput + int const inputCh = isEndcap + ? ch - nChannelsBarrel : ch; - uint32_t const * didCh = ch >= offsetForInput + uint32_t const * didCh = isEndcap ? did_ee : did_eb; // only two values, EB or EE // AM : FIXME : why not using "isBarrel" ? isBarrel ? adc2gev[0] : adc2gev[1] - float adc2gev_to_use = ch >= offsetForInput + float adc2gev_to_use = isEndcap ? adc2gev[1] // ee : adc2gev[0]; // eb // first EB and then EE - ::ecal::reco::StorageScalarType const* amplitude = ch >= offsetForInput + ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb; - ::ecal::reco::StorageScalarType const* time_in = ch >= offsetForInput + ::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb; - ::ecal::reco::StorageScalarType const* chi2_in = ch >= offsetForInput + ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? 
chi2_ee : chi2_eb; - uint32_t const* flags_in = ch >= offsetForInput + uint32_t const* flags_in = isEndcap ? flags_ee : flags_eb; @@ -641,7 +643,7 @@ namespace ecal { // eventDataForScratchGPU_, ConditionsProducts const& conditions, ConfigurationParameters const& configParameters, - uint32_t const offsetForInput, + uint32_t const nChannelsBarrel, edm::TimeValue_t const event_time, cudaStream_t cudaStream ){ @@ -724,7 +726,7 @@ namespace ecal { eventOutputGPU.extra, // other nchannels, - offsetForInput, + nChannelsBarrel, conditions.offsetForHashes ); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h index a1809dbded6bd..30bc589a9a5c2 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h @@ -74,7 +74,7 @@ namespace ecal { uint32_t* flagBits, uint32_t* extra, int const nchannels, - uint32_t const offsetForInput, + uint32_t const nChannelsBarrel, uint32_t const offsetForHashes ); @@ -87,7 +87,7 @@ namespace ecal { // eventDataForScratchGPU_, ConditionsProducts const& conditions, ConfigurationParameters const& configParameters, - uint32_t const offsetForInput, + uint32_t const nChannelsBarrel, edm::TimeValue_t const event_time, cudaStream_t cudaStream ); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc index 53f32df6f9697..7f38a23ec9168 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc @@ -5,10 +5,7 @@ EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) : status_(values.size()) -{ - - std::cout << " I am running EcalRechitChannelStatusGPU ::>> debug ongoing ... 
" << std::endl; - +{ // fill in eb auto const& barrelValues = values.barrelItems(); for (unsigned int i=0; i - diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml index 328f809bfd56a..4b10eee31c1ce 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml @@ -1,5 +1,4 @@ - @@ -17,7 +16,6 @@ - diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc index c3a5851c1d2bd..fc6ae22ff57e0 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc @@ -25,7 +25,7 @@ class EcalCPURecHitProducer { public: explicit EcalCPURecHitProducer(edm::ParameterSet const& ps); - ~EcalCPURecHitProducer() override; + ~EcalCPURecHitProducer() override = default; static void fillDescriptions(edm::ConfigurationDescriptions&); private: @@ -65,7 +65,6 @@ void EcalCPURecHitProducer::fillDescriptions( , containsTimingInformation_{ps.getParameter("containsTimingInformation")} {} - EcalCPURecHitProducer::~EcalCPURecHitProducer() {} void EcalCPURecHitProducer::acquire( edm::Event const& event, diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index bbe05aceda79b..7422838471ebc 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -336,16 +336,13 @@ void EcalRecHitProducerGPU::acquire( nee_ = eeUncalibRecHits.size; // std::cout << " [EcalRecHitProducerGPU::acquire] neb_:nee_ = " << neb_ << " : " << nee_ << std::endl; - int nchannelsEB = ebUncalibRecHits.size; - - int offsetForInput = nchannelsEB; // first EB and then EE + int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE // 
conditions // - laser correction // - IC // - adt2gev - // setup.get() .get(ADCToGeVConstantHandle_); setup.get().get(IntercalibConstantsHandle_); @@ -396,7 +393,7 @@ void EcalRecHitProducerGPU::acquire( // eventDataForScratchGPU_, conditions, configParameters_, - offsetForInput, + nchannelsEB, event_time, ctx.stream() ); diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py index 02f84eebf21b3..bc3d8a20ea5c1 100644 --- a/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py +++ b/RecoLocalCalo/EcalRecProducers/test/testEcalRechitProducer_cfg.py @@ -18,6 +18,11 @@ #process.load('DQMOffline.Configuration.DQMOffline_cff') process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff') + + + + + # Other statements from Configuration.AlCa.GlobalTag import GlobalTag process.GlobalTag = GlobalTag(process.GlobalTag, '102X_dataRun2_HLT_v2', '') @@ -301,6 +306,6 @@ # -process.DependencyGraph = cms.Service("DependencyGraph") +#process.DependencyGraph = cms.Service("DependencyGraph") diff --git a/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py index 7fdf723b67bdd..be1934d8e002c 100644 --- a/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py +++ b/RecoLocalCalo/EcalRecProducers/test/testEcalUncalibRechitProducer_cfg.py @@ -1,4 +1,3 @@ - import FWCore.ParameterSet.Config as cms from Configuration.StandardSequences.Eras import eras @@ -54,7 +53,6 @@ # ../cfipython/slc7_amd64_gcc700/RecoLocalCalo/EcalRecProducers/ecalUncalibRecHitProducerGPU_cfi.py # process.load("RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerGPU_cfi") -# process.load("RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi") # for validation of gpu multifit products From e2f7e8af219e22ec3a675ec68b402885cc2fa997 Mon Sep 17 00:00:00 2001 From: amassiro Date: 
Mon, 18 May 2020 09:50:55 +0200 Subject: [PATCH 16/30] further changes from PR comments --- .../EcalRecAlgos/src/EcalRecHitBuilderKernels.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index c4f3d22dd0a1d..792b422cefd6f 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -151,10 +151,12 @@ namespace ecal { // // NB: energy "type_wrapper::type" most likely std::vector // + + for (int ch = threadIdx.x + blockDim.x*blockIdx.x; ch < nchannels; ch += blockDim.x*gridDim.x) { + +// int ch = threadIdx.x + blockDim.x*blockIdx.x; - int ch = threadIdx.x + blockDim.x*blockIdx.x; - - if (ch < nchannels) { +// if (ch < nchannels) { bool isEndcap = (ch >= nChannelsBarrel); From 1cb2afa59cd0f491f69fa41c575e742714f2ab58 Mon Sep 17 00:00:00 2001 From: amassiro Date: Mon, 18 May 2020 10:42:02 +0200 Subject: [PATCH 17/30] after running clang tidy --- .../EcalRecHitSoA/interface/EcalRecHit_soa.h | 36 +- ...eEcalMultifitResultsGpuValidationPlots.cpp | 385 ++++---- .../bin/makeEcalRechitValidationPlots.cpp | 546 ++++++------ .../EcalRecAlgos/interface/DeclsForKernels.h | 509 +++++------ .../interface/EcalIntercalibConstantsGPU.h | 25 +- .../interface/EcalLaserAPDPNRatiosGPU.h | 47 +- .../interface/EcalLaserAPDPNRatiosRefGPU.h | 25 +- .../interface/EcalLaserAlphasGPU.h | 25 +- .../interface/EcalLinearCorrectionsGPU.h | 46 +- .../interface/EcalRechitADCToGeVConstantGPU.h | 27 +- .../interface/EcalRechitChannelStatusGPU.h | 27 +- .../src/AmplitudeComputationCommonKernels.cu | 7 +- .../src/AmplitudeComputationKernels.cu | 5 +- .../src/EcalIntercalibConstantsGPU.cc | 54 +- .../src/EcalLaserAPDPNRatiosGPU.cc | 135 ++- .../src/EcalLaserAPDPNRatiosRefGPU.cc | 54 +- .../EcalRecAlgos/src/EcalLaserAlphasGPU.cc | 54 +- .../src/EcalLinearCorrectionsGPU.cc | 126 ++- 
.../src/EcalRecHitBuilderKernels.cu | 831 ++++++++---------- .../src/EcalRecHitBuilderKernels.h | 158 ++-- .../src/EcalRechitADCToGeVConstantGPU.cc | 37 +- .../src/EcalRechitChannelStatusGPU.cc | 51 +- .../EcalRecAlgos/src/KernelHelpers.cu | 366 ++++---- .../EcalRecAlgos/src/KernelHelpers.h | 16 +- .../plugins/EcalCPURecHitProducer.cc | 268 +++--- .../plugins/EcalESProducersGPUDefs.cc | 117 +-- .../plugins/EcalRecHitConvertGPU2CPUFormat.cc | 185 ++-- .../plugins/EcalRecHitProducerGPU.cc | 437 ++++----- 28 files changed, 2100 insertions(+), 2499 deletions(-) diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h index 20d342d1b7073..8379dec5c81ad 100644 --- a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h @@ -11,29 +11,26 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" namespace ecal { - - template - struct RecHit : public Detail::Base { - + + template + struct RecHit : public Detail::Base { RecHit() = default; RecHit(const RecHit&) = default; RecHit& operator=(const RecHit&) = default; - + RecHit(RecHit&&) = default; RecHit& operator=(RecHit&&) = default; - + typename type_wrapper::type energy; typename type_wrapper::type time; - typename type_wrapper::type chi2; // should we remove this, since already included in "extra" ? - typename type_wrapper::type extra; // packed uint32_t for timeError, chi2, energyError - typename type_wrapper::type flagBits; // store rechit condition (see Flags enum) in a bit-wise way - + typename type_wrapper::type chi2; // should we remove this, since already included in "extra" ? 
+ typename type_wrapper::type extra; // packed uint32_t for timeError, chi2, energyError + typename type_wrapper::type flagBits; // store rechit condition (see Flags enum) in a bit-wise way + typename type_wrapper::type did; - - - template - typename std::enable_if::value, void>::type - resize(size_t size) { + + template + typename std::enable_if::value, void>::type resize(size_t size) { energy.resize(size); time.resize(size); chi2.resize(size); @@ -42,11 +39,10 @@ namespace ecal { did.resize(size); } }; - + using SoARecHitCollection = RecHit; - -} -#endif -// RecoLocalCalo_EcalRecAlgos_interface_EcalRecHit_soa_h +} // namespace ecal +#endif +// RecoLocalCalo_EcalRecAlgos_interface_EcalRecHit_soa_h diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp index 04ba175eebb1e..1cf7c9d706317 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp @@ -17,57 +17,54 @@ #include "TStyle.h" - -void setAxis(TH2D* histo) { +void setAxis(TH2D *histo) { histo->GetXaxis()->SetTitle("cpu"); histo->GetYaxis()->SetTitle("gpu"); } - -void setAxisDelta(TH2D* histo) { +void setAxisDelta(TH2D *histo) { histo->GetXaxis()->SetTitle("cpu"); histo->GetYaxis()->SetTitle("#Delta gpu-cpu"); } int main(int argc, char *argv[]) { - if (argc<3) { + if (argc < 3) { std::cout << "run with: ./validateGPU \n"; exit(0); } - + gStyle->SetOptStat("ourme"); - - edm::Wrapper> *wgpuEB=nullptr; - edm::Wrapper> *wgpuEE=nullptr; + + edm::Wrapper> *wgpuEB = nullptr; + edm::Wrapper> *wgpuEE = nullptr; edm::Wrapper *wcpuEB = nullptr; edm::Wrapper *wcpuEE = nullptr; - + std::string fileName = argv[1]; std::string outFileName = argv[2]; - + // output TFile rfout{outFileName.c_str(), "recreate"}; - + int nbins_count = 200; float last_count = 5000.; int nbins_count_delta = 201; - + int 
nbins = 300; float last = 3000.; - + // int nbins_chi2 = 1000; // float last_chi2 = 1000.; int nbins_chi2 = 1000; float last_chi2 = 200.; - + int nbins_flags = 100; float last_flags = 100.; float delta_flags = 20; - + int nbins_delta = 201; // use an odd number to center around 0 float delta = 0.2; - - + // RecHits plots for EB and EE on both GPU and CPU auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins_count, 0, last_count); auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins_count, 0, last_count); @@ -75,75 +72,110 @@ int main(int argc, char *argv[]) { auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins_count, 0, last_count); auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1); auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 50, 0.9, 1.1); - + auto hSOIAmplitudesEBGPU = new TH1D("hSOIAmplitudesEBGPU", "hSOIAmplitudesEBGPU", nbins, 0, last); auto hSOIAmplitudesEEGPU = new TH1D("hSOIAmplitudesEEGPU", "hSOIAmplitudesEEGPU", nbins, 0, last); auto hSOIAmplitudesEBCPU = new TH1D("hSOIAmplitudesEBCPU", "hSOIAmplitudesEBCPU", nbins, 0, last); auto hSOIAmplitudesEECPU = new TH1D("hSOIAmplitudesEECPU", "hSOIAmplitudesEECPU", nbins, 0, last); - auto hSOIAmplitudesEBGPUCPUratio = new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - auto hSOIAmplitudesEEGPUCPUratio = new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - + auto hSOIAmplitudesEBGPUCPUratio = + new TH1D("SOIAmplitudesEBGPU/CPUratio", "SOIAmplitudesEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hSOIAmplitudesEEGPUCPUratio = + new TH1D("SOIAmplitudesEEGPU/CPUratio", "SOIAmplitudesEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); + auto hChi2EBGPU = new TH1D("hChi2EBGPU", "hChi2EBGPU", nbins_chi2, 0, last_chi2); auto hChi2EEGPU = 
new TH1D("hChi2EEGPU", "hChi2EEGPU", nbins_chi2, 0, last_chi2); auto hChi2EBCPU = new TH1D("hChi2EBCPU", "hChi2EBCPU", nbins_chi2, 0, last_chi2); auto hChi2EECPU = new TH1D("hChi2EECPU", "hChi2EECPU", nbins_chi2, 0, last_chi2); auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - + auto hFlagsEBGPU = new TH1D("hFlagsEBGPU", "hFlagsEBGPU", nbins_flags, 0, last_flags); auto hFlagsEEGPU = new TH1D("hFlagsEEGPU", "hFlagsEEGPU", nbins_flags, 0, last_flags); auto hFlagsEBCPU = new TH1D("hFlagsEBCPU", "hFlagsEBCPU", nbins_flags, 0, last_flags); auto hFlagsEECPU = new TH1D("hFlagsEECPU", "hFlagsEECPU", nbins_flags, 0, last_flags); auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 200, 0.9, 1.1); - - auto hSOIAmplitudesEBGPUvsCPU = new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last); setAxis(hSOIAmplitudesEBGPUvsCPU ) ; - auto hSOIAmplitudesEEGPUvsCPU = new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); setAxis(hSOIAmplitudesEEGPUvsCPU ) ; - auto hSOIAmplitudesEBdeltavsCPU = new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); setAxisDelta(hSOIAmplitudesEBdeltavsCPU) ; - auto hSOIAmplitudesEEdeltavsCPU = new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); setAxisDelta(hSOIAmplitudesEEdeltavsCPU) ; - - auto hChi2EBGPUvsCPU = new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); setAxis(hChi2EBGPUvsCPU ) ; - auto hChi2EEGPUvsCPU = new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, 
last_chi2); setAxis(hChi2EEGPUvsCPU ) ; - auto hChi2EBdeltavsCPU = new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); setAxisDelta(hChi2EBdeltavsCPU) ; - auto hChi2EEdeltavsCPU = new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); setAxisDelta(hChi2EEdeltavsCPU) ; - - auto hFlagsEBGPUvsCPU = new TH2D("hFlagsEBGPUvsCPU", "hFlagsEBGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags); setAxis(hFlagsEBGPUvsCPU ) ; - auto hFlagsEEGPUvsCPU = new TH2D("hFlagsEEGPUvsCPU", "hFlagsEEGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags); setAxis(hFlagsEEGPUvsCPU ) ; - auto hFlagsEBdeltavsCPU = new TH2D("hFlagsEBdeltavsCPU", "hFlagsEBdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags); setAxisDelta(hFlagsEBdeltavsCPU) ; - auto hFlagsEEdeltavsCPU = new TH2D("hFlagsEEdeltavsCPU", "hFlagsEEdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags); setAxisDelta(hFlagsEEdeltavsCPU) ; - - auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last_count, 0, last_count, last_count, 0, last_count); setAxis(hRechitsEBGPUvsCPU ) ; - auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last_count, 0, last_count, last_count, 0, last_count); setAxis(hRechitsEEGPUvsCPU ) ; - auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta); setAxisDelta(hRechitsEBdeltavsCPU) ; - auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta); setAxisDelta(hRechitsEEdeltavsCPU) ; - - + + auto hSOIAmplitudesEBGPUvsCPU = + new TH2D("hSOIAmplitudesEBGPUvsCPU", "hSOIAmplitudesEBGPUvsCPU", nbins, 0, last, nbins, 0, last); + setAxis(hSOIAmplitudesEBGPUvsCPU); + auto hSOIAmplitudesEEGPUvsCPU 
= + new TH2D("hSOIAmplitudesEEGPUvsCPU", "hSOIAmplitudesEEGPUvsCPU", nbins, 0, last, nbins, 0, last); + setAxis(hSOIAmplitudesEEGPUvsCPU); + auto hSOIAmplitudesEBdeltavsCPU = + new TH2D("hSOIAmplitudesEBdeltavsCPU", "hSOIAmplitudesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + setAxisDelta(hSOIAmplitudesEBdeltavsCPU); + auto hSOIAmplitudesEEdeltavsCPU = + new TH2D("hSOIAmplitudesEEdeltavsCPU", "hSOIAmplitudesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + setAxisDelta(hSOIAmplitudesEEdeltavsCPU); + + auto hChi2EBGPUvsCPU = + new TH2D("hChi2EBGPUvsCPU", "hChi2EBGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); + setAxis(hChi2EBGPUvsCPU); + auto hChi2EEGPUvsCPU = + new TH2D("hChi2EEGPUvsCPU", "hChi2EEGPUvsCPU", nbins_chi2, 0, last_chi2, nbins_chi2, 0, last_chi2); + setAxis(hChi2EEGPUvsCPU); + auto hChi2EBdeltavsCPU = + new TH2D("hChi2EBdeltavsCPU", "hChi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + setAxisDelta(hChi2EBdeltavsCPU); + auto hChi2EEdeltavsCPU = + new TH2D("hChi2EEdeltavsCPU", "hChi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + setAxisDelta(hChi2EEdeltavsCPU); + + auto hFlagsEBGPUvsCPU = + new TH2D("hFlagsEBGPUvsCPU", "hFlagsEBGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags); + setAxis(hFlagsEBGPUvsCPU); + auto hFlagsEEGPUvsCPU = + new TH2D("hFlagsEEGPUvsCPU", "hFlagsEEGPUvsCPU", nbins_flags, 0, last_flags, nbins_flags, 0, last_flags); + setAxis(hFlagsEEGPUvsCPU); + auto hFlagsEBdeltavsCPU = new TH2D( + "hFlagsEBdeltavsCPU", "hFlagsEBdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags); + setAxisDelta(hFlagsEBdeltavsCPU); + auto hFlagsEEdeltavsCPU = new TH2D( + "hFlagsEEdeltavsCPU", "hFlagsEEdeltavsCPU", nbins_flags, 0, last_flags, nbins_delta, -delta_flags, delta_flags); + setAxisDelta(hFlagsEEdeltavsCPU); + + auto hRechitsEBGPUvsCPU = new TH2D( + "RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last_count, 0, 
last_count, last_count, 0, last_count); + setAxis(hRechitsEBGPUvsCPU); + auto hRechitsEEGPUvsCPU = new TH2D( + "RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last_count, 0, last_count, last_count, 0, last_count); + setAxis(hRechitsEEGPUvsCPU); + auto hRechitsEBdeltavsCPU = new TH2D( + "RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta); + setAxisDelta(hRechitsEBdeltavsCPU); + auto hRechitsEEdeltavsCPU = new TH2D( + "RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins_count, 0, last_count, nbins_count_delta, -delta, delta); + setAxisDelta(hRechitsEEdeltavsCPU); + // input std::cout << "validating file " << fileName << std::endl; TFile rf{fileName.c_str()}; - TTree *rt = (TTree*)rf.Get("Events"); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", &wgpuEB); - rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", &wgpuEE); + TTree *rt = (TTree *)rf.Get("Events"); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEB_RECO.", + &wgpuEB); + rt->SetBranchAddress("ecalTagsoaecalUncalibratedRecHit_ecalCPUUncalibRecHitProducer_EcalUncalibRecHitsEE_RECO.", + &wgpuEE); rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEB_RECO.", &wcpuEB); rt->SetBranchAddress("EcalUncalibratedRecHitsSorted_ecalMultiFitUncalibRecHit_EcalUncalibRecHitsEE_RECO.", &wcpuEE); - + constexpr float eps_diff = 1e-3; - + // accumulate auto const nentries = rt->GetEntries(); std::cout << "#events to validate over: " << nentries << std::endl; - for (int ie=0; ieGetEntry(ie); - - const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" }; + + const char *ordinal[] = {"th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th"}; auto cpu_eb_size = wcpuEB->bareProduct().size(); auto cpu_ee_size = 
wcpuEE->bareProduct().size(); auto gpu_eb_size = wgpuEB->bareProduct().amplitude.size(); auto gpu_ee_size = wgpuEE->bareProduct().amplitude.size(); - - float eb_ratio = (float) gpu_eb_size/cpu_eb_size; - float ee_ratio = (float) gpu_ee_size/cpu_ee_size; - + + float eb_ratio = (float)gpu_eb_size / cpu_eb_size; + float ee_ratio = (float)gpu_ee_size / cpu_ee_size; + // Filling up the histograms on events sizes for EB and EE on both GPU and CPU hRechitsEBGPU->Fill(gpu_eb_size); hRechitsEBCPU->Fill(cpu_eb_size); @@ -153,151 +185,158 @@ int main(int argc, char *argv[]) { hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size); hRechitsEBGPUCPUratio->Fill(eb_ratio); hRechitsEEGPUCPUratio->Fill(ee_ratio); - hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size-cpu_eb_size); - hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size-cpu_ee_size); - - + hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size - cpu_eb_size); + hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size - cpu_ee_size); + if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { std::cerr << ie << ordinal[ie % 10] << " entry:\n" - << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size << " (gpu)\n" - << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size << " (gpu)" << std::endl; + << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << gpu_eb_size + << " (gpu)\n" + << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << gpu_ee_size + << " (gpu)" << std::endl; continue; } - + assert(wgpuEB->bareProduct().amplitude.size() == wcpuEB->bareProduct().size()); assert(wgpuEE->bareProduct().amplitude.size() == wcpuEE->bareProduct().size()); auto const neb = wcpuEB->bareProduct().size(); auto const nee = wcpuEE->bareProduct().size(); - - - for (uint32_t i=0; ibareProduct().did[i]; auto const soi_amp_gpu = wgpuEB->bareProduct().amplitude[i]; auto const cpu_iter = 
wcpuEB->bareProduct().find(DetId{did_gpu}); if (cpu_iter == wcpuEB->bareProduct().end()) { std::cerr << ie << ordinal[ie % 10] << " entry\n" - << " Did not find a DetId " << did_gpu - << " in a CPU collection\n"; + << " Did not find a DetId " << did_gpu << " in a CPU collection\n"; continue; } auto const soi_amp_cpu = cpu_iter->amplitude(); auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; auto const chi2_cpu = cpu_iter->chi2(); - + auto const flags_gpu = wgpuEB->bareProduct().flags[i]; auto const flags_cpu = cpu_iter->flags(); - + hSOIAmplitudesEBGPU->Fill(soi_amp_gpu); hSOIAmplitudesEBCPU->Fill(soi_amp_cpu); hSOIAmplitudesEBGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); - if (soi_amp_cpu>0) hSOIAmplitudesEBGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); - + hSOIAmplitudesEBdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); + if (soi_amp_cpu > 0) + hSOIAmplitudesEBGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu); + hChi2EBGPU->Fill(chi2_gpu); hChi2EBCPU->Fill(chi2_cpu); hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); - if (chi2_cpu>0) hChi2EBGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu); - - if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) { + hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); + if (chi2_cpu > 0) + hChi2EBGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu); + + if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) { std::cout << " ---- EB " << std::endl; std::cout << " eventid = " << ie << " xtal = " << i << std::endl; - std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; + std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl; - std::cout << " flags_gpu = " << flags_gpu << " flags_cpu = " << 
flags_cpu << std::endl; - } - + std::cout << " flags_gpu = " << flags_gpu << " flags_cpu = " << flags_cpu << std::endl; + } + hFlagsEBGPU->Fill(flags_gpu); hFlagsEBCPU->Fill(flags_cpu); hFlagsEBGPUvsCPU->Fill(flags_cpu, flags_gpu); - hFlagsEBdeltavsCPU->Fill(flags_cpu, flags_gpu-flags_cpu); - if (flags_cpu>0) hFlagsEBGPUCPUratio->Fill( (float) flags_gpu/flags_cpu); - - if (flags_cpu!=flags_gpu) { + hFlagsEBdeltavsCPU->Fill(flags_cpu, flags_gpu - flags_cpu); + if (flags_cpu > 0) + hFlagsEBGPUCPUratio->Fill((float)flags_gpu / flags_cpu); + + if (flags_cpu != flags_gpu) { std::cout << " >> No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu; std::cout << std::endl; } - - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or - (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu) - or (flags_cpu!=flags_gpu) ) - { + + if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or + std::isnan(chi2_gpu) or (flags_cpu != flags_gpu)) { printf("EB eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, i, soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); + ie, + i, + soi_amp_gpu, + soi_amp_cpu, + chi2_gpu, + chi2_cpu); if (std::isnan(chi2_gpu)) printf("*** nan ***\n"); } } - - for (uint32_t i=0; ibareProduct().did[i]; auto const soi_amp_gpu = wgpuEE->bareProduct().amplitude[i]; auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); if (cpu_iter == wcpuEE->bareProduct().end()) { std::cerr << ie << ordinal[ie % 10] << " entry\n" - << " did not find a DetId " << did_gpu - << " in a CPU collection\n"; + << " did not find a DetId " << did_gpu << " in a CPU collection\n"; continue; } auto const soi_amp_cpu = cpu_iter->amplitude(); auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; auto const chi2_cpu = cpu_iter->chi2(); - + auto const flags_gpu = wgpuEE->bareProduct().flags[i]; auto const flags_cpu = cpu_iter->flags(); - - + hSOIAmplitudesEEGPU->Fill(soi_amp_gpu); 
hSOIAmplitudesEECPU->Fill(soi_amp_cpu); hSOIAmplitudesEEGPUvsCPU->Fill(soi_amp_cpu, soi_amp_gpu); - hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu-soi_amp_cpu); - if (soi_amp_cpu>0) hSOIAmplitudesEEGPUCPUratio->Fill( (float) soi_amp_gpu/soi_amp_cpu); - + hSOIAmplitudesEEdeltavsCPU->Fill(soi_amp_cpu, soi_amp_gpu - soi_amp_cpu); + if (soi_amp_cpu > 0) + hSOIAmplitudesEEGPUCPUratio->Fill((float)soi_amp_gpu / soi_amp_cpu); + hChi2EEGPU->Fill(chi2_gpu); hChi2EECPU->Fill(chi2_cpu); hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); - if (chi2_cpu>0) hChi2EEGPUCPUratio->Fill( (float) chi2_gpu/chi2_cpu); - - if (fabs(chi2_gpu/chi2_cpu-1) > 0.05 || fabs(soi_amp_gpu/soi_amp_cpu-1) > 0.05) { + hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); + if (chi2_cpu > 0) + hChi2EEGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu); + + if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) { std::cout << " ---- EE " << std::endl; std::cout << " eventid = " << ie << " xtal = " << i << std::endl; - std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; + std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; std::cout << " soi_amp_gpu = " << soi_amp_gpu << " soi_amp_cpu = " << soi_amp_cpu << std::endl; - std::cout << " flags_gpu = " << flags_gpu << " flags_cpu = " << flags_cpu << std::endl; - } - + std::cout << " flags_gpu = " << flags_gpu << " flags_cpu = " << flags_cpu << std::endl; + } + hFlagsEEGPU->Fill(flags_gpu); hFlagsEECPU->Fill(flags_cpu); hFlagsEEGPUvsCPU->Fill(flags_cpu, flags_gpu); - hFlagsEEdeltavsCPU->Fill(flags_cpu, flags_gpu-flags_cpu); - if (flags_cpu>0) hFlagsEEGPUCPUratio->Fill( (float) flags_gpu/flags_cpu); - - if (flags_cpu!=flags_gpu) { + hFlagsEEdeltavsCPU->Fill(flags_cpu, flags_gpu - flags_cpu); + if (flags_cpu > 0) + hFlagsEEGPUCPUratio->Fill((float)flags_gpu / flags_cpu); + + if (flags_cpu != flags_gpu) { 
std::cout << " >> No! Different flag cpu:gpu = " << flags_cpu << " : " << flags_gpu; std::cout << std::endl; } - - if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or - (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu) - or (flags_cpu!=flags_gpu) ) - { + + if ((std::abs(soi_amp_gpu - soi_amp_cpu) >= eps_diff) or (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or + std::isnan(chi2_gpu) or (flags_cpu != flags_gpu)) { printf("EE eventid = %d chid = %d amp_gpu = %f amp_cpu %f chi2_gpu = %f chi2_cpu = %f\n", - ie, static_cast(neb+i), soi_amp_gpu, soi_amp_cpu, chi2_gpu, chi2_cpu); + ie, + static_cast(neb + i), + soi_amp_gpu, + soi_amp_cpu, + chi2_gpu, + chi2_cpu); if (std::isnan(chi2_gpu)) printf("*** nan ***\n"); } } } - + { - - // TCanvas c("plots", "plots", 4200, 6200); TCanvas c("plots", "plots", 1750, 860); // c.Divide(2, 3); c.Divide(3, 2); - + // c.cd(1); c.cd(1); { @@ -309,11 +348,11 @@ int main(int argc, char *argv[]) { hSOIAmplitudesEBGPU->SetLineWidth(1.); hSOIAmplitudesEBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hSOIAmplitudesEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hSOIAmplitudesEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } // c.cd(2); c.cd(4); @@ -326,11 +365,11 @@ int main(int argc, char *argv[]) { hSOIAmplitudesEEGPU->SetLineWidth(1.); hSOIAmplitudesEEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hSOIAmplitudesEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hSOIAmplitudesEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } // c.cd(3); c.cd(2); @@ -348,12 +387,12 @@ int main(int argc, char *argv[]) { c.cd(6); // hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); hSOIAmplitudesEEGPUCPUratio->Draw(""); - + 
c.SaveAs("ecal-amplitudes.root"); c.SaveAs("ecal-amplitudes.png"); - + // chi2 - + // c.cd(1); c.cd(1); { @@ -365,11 +404,11 @@ int main(int argc, char *argv[]) { hChi2EBGPU->SetLineWidth(1.); hChi2EBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } // c.cd(2); c.cd(4); @@ -382,11 +421,11 @@ int main(int argc, char *argv[]) { hChi2EEGPU->SetLineWidth(1.); hChi2EEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } // c.cd(3); c.cd(2); @@ -404,14 +443,12 @@ int main(int argc, char *argv[]) { c.cd(6); // hChi2EEdeltavsCPU->Draw("COLZ"); hChi2EEGPUCPUratio->Draw(""); - + c.SaveAs("ecal-chi2.root"); c.SaveAs("ecal-chi2.png"); - - - + // flags - + // c.cd(1); c.cd(1); { @@ -423,11 +460,11 @@ int main(int argc, char *argv[]) { hFlagsEBGPU->SetLineWidth(1.); hFlagsEBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hFlagsEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } // c.cd(2); c.cd(4); @@ -440,11 +477,11 @@ int main(int argc, char *argv[]) { hFlagsEEGPU->SetLineWidth(1.); hFlagsEEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hFlagsEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + 
stats->SetY1NDC(y1 - (y2 - y1)); } // c.cd(3); c.cd(2); @@ -458,33 +495,19 @@ int main(int argc, char *argv[]) { c.cd(3); // hFlagsEBdeltavsCPU->Draw("COLZ"); hFlagsEBGPUCPUratio->Draw(""); - + // c.cd(6); c.cd(6); // hFlagsEEdeltavsCPU->Draw("COLZ"); hFlagsEEGPUCPUratio->Draw(""); - - + c.SaveAs("ecal-flags.root"); c.SaveAs("ecal-flags.png"); - - - - - - - - - - - - - - + TCanvas cRechits("Rechits", "Rechits", 1750, 860); cRechits.Divide(3, 2); - - // Plotting the sizes of GPU vs CPU for each event of EB + + // Plotting the sizes of GPU vs CPU for each event of EB cRechits.cd(1); { gPad->SetLogy(); @@ -495,12 +518,12 @@ int main(int argc, char *argv[]) { hRechitsEBGPU->SetLineWidth(2); hRechitsEBGPU->Draw("sames"); cRechits.Update(); - auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hRechitsEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } + stats->SetY1NDC(y1 - (y2 - y1)); + } cRechits.cd(4); { gPad->SetLogy(); @@ -511,41 +534,35 @@ int main(int argc, char *argv[]) { hRechitsEEGPU->SetLineWidth(2); hRechitsEEGPU->Draw("sames"); cRechits.Update(); - auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hRechitsEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - cRechits.cd(2); { - hRechitsEBGPUvsCPU->Draw("COLZ"); - } - cRechits.cd(5); { - hRechitsEEGPUvsCPU->Draw("COLZ"); + stats->SetY1NDC(y1 - (y2 - y1)); } - cRechits.cd(3); { + cRechits.cd(2); + { hRechitsEBGPUvsCPU->Draw("COLZ"); } + cRechits.cd(5); + { hRechitsEEGPUvsCPU->Draw("COLZ"); } + cRechits.cd(3); + { gPad->SetLogy(); //hRechitsEBdeltavsCPU->Draw("COLZ"); hRechitsEBGPUCPUratio->Draw(""); } - cRechits.cd(6); { + cRechits.cd(6); + { gPad->SetLogy(); //hRechitsEEdeltavsCPU->Draw("COLZ"); hRechitsEEGPUCPUratio->Draw(""); } 
cRechits.SaveAs("ecal-rechits.root"); cRechits.SaveAs("ecal-rechits.png"); - - - - - - } - + rf.Close(); rfout.Write(); rfout.Close(); - + return 0; } diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp index 4e7718791b603..18f6bed0648ad 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalRechitValidationPlots.cpp @@ -19,91 +19,113 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" int main(int argc, char *argv[]) { - if (argc<3) { + if (argc < 3) { std::cout << "run with: ./makeEcalRechitValidationPlots \n"; exit(0); } // Set the GPU and CPU pointers for both EB and EE - edm::Wrapper> *wgpuEB=nullptr; - edm::Wrapper> *wgpuEE=nullptr; + edm::Wrapper> *wgpuEB = nullptr; + edm::Wrapper> *wgpuEE = nullptr; edm::Wrapper *wcpuEB = nullptr; edm::Wrapper *wcpuEE = nullptr; - - std::string fileName = argv[1]; // The input file containing the data to be validated (i.e. result.root) - std::string outFileName = argv[2]; //The output file in which the validation results will be saved (i.e. output.root) - + + std::string fileName = argv[1]; // The input file containing the data to be validated (i.e. result.root) + std::string outFileName = argv[2]; //The output file in which the validation results will be saved (i.e. output.root) + //output TFile rfout{outFileName.c_str(), "recreate"}; - + int nbins = 200; int last = 5000.; - + int nbins_energy = 300; float last_energy = 2.; - + int nbins_chi2 = 200; float last_chi2 = 100.; - + int nbins_flag = 40; // int nbins_flag = 1000; int last_flag = 1500; // int nbins_flag = 40; // int last_flag = 10000; - + int nbins_extra = 200; int last_extra = 200; - + int nbins_delta = 201; // use an odd number to center around 0 float delta = 0.2; - + // RecHits plots for EB and EE on both GPU and CPU auto hRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. 
of Rechits. No Filter GPU", nbins, 0, last); auto hRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits. No Filter GPU", nbins, 0, last); auto hRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits. No Filter GPU", nbins, 0, last); auto hRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits. No Filter GPU", nbins, 0, last); - auto hRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last); - auto hRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last); - auto hRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05); - auto hRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05); - auto hRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta); - + auto hRechitsEBGPUvsCPU = + new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last); + auto hRechitsEEGPUvsCPU = + new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU. No Filter GPU", last, 0, last, last, 0, last); + auto hRechitsEBGPUCPUratio = + new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05); + auto hRechitsEEGPUCPUratio = + new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU. No Filter GPU", 200, 0.95, 1.05); + auto hRechitsEBdeltavsCPU = + new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU. No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hRechitsEEdeltavsCPU = + new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU. 
No Filter GPU", nbins, 0, last, nbins_delta, -delta, delta); + // RecHits plots for EB and EE on both GPU and CPU auto hSelectedRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last); auto hSelectedRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last); auto hSelectedRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last); auto hSelectedRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last); - auto hSelectedRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); - auto hSelectedRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); - auto hSelectedRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); - auto hSelectedRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); - auto hSelectedRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hSelectedRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - + auto hSelectedRechitsEBGPUvsCPU = + new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hSelectedRechitsEEGPUvsCPU = + new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hSelectedRechitsEBGPUCPUratio = + new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hSelectedRechitsEEGPUCPUratio = + new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hSelectedRechitsEBdeltavsCPU = + new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hSelectedRechitsEEdeltavsCPU 
= + new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + // RecHits plots for EB and EE on both GPU and CPU auto hPositiveRechitsEBGPU = new TH1D("RechitsEBGPU", "RechitsEBGPU; No. of Rechits", nbins, 0, last); auto hPositiveRechitsEBCPU = new TH1D("RechitsEBCPU", "RechitsEBCPU; No. of Rechits", nbins, 0, last); auto hPositiveRechitsEEGPU = new TH1D("RechitsEEGPU", "RechitsEEGPU; No. of Rechits", nbins, 0, last); auto hPositiveRechitsEECPU = new TH1D("RechitsEECPU", "RechitsEECPU; No. of Rechits", nbins, 0, last); - auto hPositiveRechitsEBGPUvsCPU = new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); - auto hPositiveRechitsEEGPUvsCPU = new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); - auto hPositiveRechitsEBGPUCPUratio = new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); - auto hPositiveRechitsEEGPUCPUratio = new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); - auto hPositiveRechitsEBdeltavsCPU = new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hPositiveRechitsEEdeltavsCPU = new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - + auto hPositiveRechitsEBGPUvsCPU = + new TH2D("RechitsEBGPUvsCPU", "RechitsEBGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hPositiveRechitsEEGPUvsCPU = + new TH2D("RechitsEEGPUvsCPU", "RechitsEEGPUvsCPU; CPU; GPU", last, 0, last, last, 0, last); + auto hPositiveRechitsEBGPUCPUratio = + new TH1D("RechitsEBGPU/CPUratio", "RechitsEBGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hPositiveRechitsEEGPUCPUratio = + new TH1D("RechitsEEGPU/CPUratio", "RechitsEEGPU/CPUratio; GPU/CPU", 200, 0.95, 1.05); + auto hPositiveRechitsEBdeltavsCPU = + new TH2D("RechitsEBdeltavsCPU", "RechitsEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, 
delta); + auto hPositiveRechitsEEdeltavsCPU = + new TH2D("RechitsEEdeltavsCPU", "RechitsEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + // Energies plots for EB and EE on both GPU and CPU auto hEnergiesEBGPU = new TH1D("EnergiesEBGPU", "EnergiesEBGPU; Energy [GeV]", nbins_energy, 0, last_energy); auto hEnergiesEEGPU = new TH1D("EnergiesEEGPU", "EnergiesEEGPU; Energy [GeV]", nbins_energy, 0, last_energy); auto hEnergiesEBCPU = new TH1D("EnergiesEBCPU", "EnergiesEBCPU; Energy [GeV]", nbins_energy, 0, last_energy); auto hEnergiesEECPU = new TH1D("EnergiesEECPU", "EnergiesEECPU; Energy [GeV]", nbins_energy, 0, last_energy); - auto hEnergiesEBGPUvsCPU = new TH2D("EnergiesEBGPUvsCPU", "EnergiesEBGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy); - auto hEnergiesEEGPUvsCPU = new TH2D("EnergiesEEGPUvsCPU", "EnergiesEEGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy); + auto hEnergiesEBGPUvsCPU = new TH2D( + "EnergiesEBGPUvsCPU", "EnergiesEBGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy); + auto hEnergiesEEGPUvsCPU = new TH2D( + "EnergiesEEGPUvsCPU", "EnergiesEEGPUvsCPU; CPU; GPU", nbins_energy, 0, last_energy, nbins_energy, 0, last_energy); auto hEnergiesEBGPUCPUratio = new TH1D("EnergiesEBGPU/CPUratio", "EnergiesEBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); auto hEnergiesEEGPUCPUratio = new TH1D("EnergiesEEGPU/CPUratio", "EnergiesEEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); - auto hEnergiesEBdeltavsCPU = new TH2D("EnergiesEBdeltavsCPU", "EnergiesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - auto hEnergiesEEdeltavsCPU = new TH2D("EnergiesEEdeltavsCPU", "EnergiesEEdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); - + auto hEnergiesEBdeltavsCPU = + new TH2D("EnergiesEBdeltavsCPU", "EnergiesEBdeltavsCPU", nbins, 0, last, nbins_delta, -delta, delta); + auto hEnergiesEEdeltavsCPU = + new TH2D("EnergiesEEdeltavsCPU", "EnergiesEEdeltavsCPU", nbins, 0, 
last, nbins_delta, -delta, delta); + // Chi2 plots for EB and EE on both GPU and CPU auto hChi2EBGPU = new TH1D("Chi2EBGPU", "Chi2EBGPU; Ch^{2}", nbins_chi2, 0, last_chi2); auto hChi2EEGPU = new TH1D("Chi2EEGPU", "Chi2EEGPU; Ch^{2}", nbins_chi2, 0, last_chi2); @@ -113,68 +135,78 @@ int main(int argc, char *argv[]) { auto hChi2EEGPUvsCPU = new TH2D("Chi2EEGPUvsCPU", "Chi2EEGPUvsCPU; CPU; GPU", nbins_chi2, 0, 100, nbins_chi2, 0, 100); auto hChi2EBGPUCPUratio = new TH1D("Chi2EBGPU/CPUratio", "Chi2EBGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); auto hChi2EEGPUCPUratio = new TH1D("Chi2EEGPU/CPUratio", "Chi2EEGPU/CPUratio; GPU/CPU", 100, 0.8, 1.2); - auto hChi2EBdeltavsCPU = new TH2D("Chi2EBdeltavsCPU", "Chi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - auto hChi2EEdeltavsCPU = new TH2D("Chi2EEdeltavsCPU", "Chi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); - + auto hChi2EBdeltavsCPU = + new TH2D("Chi2EBdeltavsCPU", "Chi2EBdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + auto hChi2EEdeltavsCPU = + new TH2D("Chi2EEdeltavsCPU", "Chi2EEdeltavsCPU", nbins_chi2, 0, last_chi2, nbins_delta, -delta, delta); + // Flags plots for EB and EE on both GPU and CPU auto hFlagsEBGPU = new TH1D("FlagsEBGPU", "FlagsEBGPU; Flags", nbins_flag, -10, last_flag); auto hFlagsEBCPU = new TH1D("FlagsEBCPU", "FlagsEBCPU; Flags", nbins_flag, -10, last_flag); auto hFlagsEEGPU = new TH1D("FlagsEEGPU", "FlagsEEGPU; Flags", nbins_flag, -10, last_flag); auto hFlagsEECPU = new TH1D("FlagsEECPU", "FlagsEECPU; Flags", nbins_flag, -10, last_flag); - auto hFlagsEBGPUvsCPU = new TH2D("FlagsEBGPUvsCPU", "FlagsEBGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag); - auto hFlagsEEGPUvsCPU = new TH2D("FlagsEEGPUvsCPU", "FlagsEEGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag); + auto hFlagsEBGPUvsCPU = + new TH2D("FlagsEBGPUvsCPU", "FlagsEBGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, 
-10, last_flag); + auto hFlagsEEGPUvsCPU = + new TH2D("FlagsEEGPUvsCPU", "FlagsEEGPUvsCPU; CPU; GPU", nbins_flag, -10, last_flag, nbins_flag, -10, last_flag); auto hFlagsEBGPUCPUratio = new TH1D("FlagsEBGPU/CPUratio", "FlagsEBGPU/CPUratio; GPU/CPU", 50, -5, 10); auto hFlagsEEGPUCPUratio = new TH1D("FlagsEEGPU/CPUratio", "FlagsEEGPU/CPUratio; GPU/CPU", 50, -5, 10); - auto hFlagsEBdeltavsCPU = new TH2D("FlagsEBdeltavsCPU", "FlagsEBdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta); - auto hFlagsEEdeltavsCPU = new TH2D("FlagsEEdeltavsCPU", "FlagsEEdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta); - + auto hFlagsEBdeltavsCPU = + new TH2D("FlagsEBdeltavsCPU", "FlagsEBdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta); + auto hFlagsEEdeltavsCPU = + new TH2D("FlagsEEdeltavsCPU", "FlagsEEdeltavsCPU", nbins_flag, -10, last_flag, nbins_delta, -delta, delta); + // Extras plots for EB and EE on both GPU and CPU auto hExtrasEBGPU = new TH1D("ExtrasEBGPU", "ExtrasEBGPU; No. of Extras", nbins_extra, 0, last_extra); auto hExtrasEBCPU = new TH1D("ExtrasEBCPU", "ExtrasEBCPU; No. of Extras", nbins_extra, 0, last_extra); auto hExtrasEEGPU = new TH1D("ExtrasEEGPU", "ExtrasEEGPU; No. of Extras", nbins_extra, 0, last_extra); auto hExtrasEECPU = new TH1D("ExtrasEECPU", "ExtrasEECPU; No. 
of Extras", nbins_extra, 0, last_extra); - auto hExtrasEBGPUvsCPU = new TH2D("ExtrasEBGPUvsCPU", "ExtrasEBGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra,nbins_extra, 0, last_extra); - auto hExtrasEEGPUvsCPU = new TH2D("ExtrasEEGPUvsCPU", "ExtrasEEGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra); + auto hExtrasEBGPUvsCPU = new TH2D( + "ExtrasEBGPUvsCPU", "ExtrasEBGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra); + auto hExtrasEEGPUvsCPU = new TH2D( + "ExtrasEEGPUvsCPU", "ExtrasEEGPUvsCPU; CPU; GPU", nbins_extra, 0, last_extra, nbins_extra, 0, last_extra); auto hExtrasEBGPUCPUratio = new TH1D("ExtrasEBGPU/CPUratio", "ExtrasEBGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0); auto hExtrasEEGPUCPUratio = new TH1D("ExtrasEEGPU/CPUratio", "ExtrasEEGPU/CPUratio; GPU/CPU", 50, 0.0, 2.0); - auto hExtrasEBdeltavsCPU = new TH2D("ExtrasEBdeltavsCPU", "ExtrasEBdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta); - auto hExtrasEEdeltavsCPU = new TH2D("ExtrasEEdeltavsCPU", "ExtrasEEdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta); - + auto hExtrasEBdeltavsCPU = + new TH2D("ExtrasEBdeltavsCPU", "ExtrasEBdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta); + auto hExtrasEEdeltavsCPU = + new TH2D("ExtrasEEdeltavsCPU", "ExtrasEEdeltavsCPU", nbins_extra, 0, last_extra, nbins_delta, -delta, delta); + // input file setup for tree std::cout << "validating file " << fileName << std::endl; TFile rf{fileName.c_str()}; - TTree *rt = (TTree*)rf.Get("Events"); - + TTree *rt = (TTree *)rf.Get("Events"); + // Allocating the appropriate data to their respective pointers rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEB_RECO.", &wgpuEB); rt->SetBranchAddress("ecalTagsoaecalRecHit_ecalCPURecHitProducer_EcalRecHitsEE_RECO.", &wgpuEE); rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEB_RECO.", &wcpuEB); 
rt->SetBranchAddress("EcalRecHitsSorted_ecalRecHit_EcalRecHitsEE_RECO.", &wcpuEE); - + constexpr float eps_diff = 1e-3; - + // accumulate sizes for events and sizes of each event on both GPU and CPU // auto const nentries = rt->GetEntries(); int nentries = rt->GetEntries(); - - //---- AM: tests + + //---- AM: tests if (nentries > 1000) { nentries = 1000; } // nentries = 1; - + std::cout << "#events to validate over: " << nentries << std::endl; - for (int ie=0; ieGetEntry(ie); - + // const char* ordinal[] = { "th", "st", "nd", "rd", "th", "th", "th", "th", "th", "th" }; auto cpu_eb_size = wcpuEB->bareProduct().size(); auto cpu_ee_size = wcpuEE->bareProduct().size(); auto gpu_eb_size = wgpuEB->bareProduct().energy.size(); auto gpu_ee_size = wgpuEE->bareProduct().energy.size(); - float eb_ratio = (float) gpu_eb_size/cpu_eb_size; - float ee_ratio = (float) gpu_ee_size/cpu_ee_size; - + float eb_ratio = (float)gpu_eb_size / cpu_eb_size; + float ee_ratio = (float)gpu_ee_size / cpu_ee_size; + // Filling up the histograms on events sizes for EB and EE on both GPU and CPU hRechitsEBGPU->Fill(gpu_eb_size); hRechitsEBCPU->Fill(cpu_eb_size); @@ -184,9 +216,9 @@ int main(int argc, char *argv[]) { hRechitsEEGPUvsCPU->Fill(cpu_ee_size, gpu_ee_size); hRechitsEBGPUCPUratio->Fill(eb_ratio); hRechitsEEGPUCPUratio->Fill(ee_ratio); - hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size-cpu_eb_size); - hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size-cpu_ee_size); - + hRechitsEBdeltavsCPU->Fill(cpu_eb_size, gpu_eb_size - cpu_eb_size); + hRechitsEEdeltavsCPU->Fill(cpu_ee_size, gpu_ee_size - cpu_ee_size); + /* * // condition that sizes on GPU and CPU should be the same for EB or EE * if (cpu_eb_size != gpu_eb_size or cpu_ee_size != gpu_ee_size) { @@ -201,32 +233,32 @@ int main(int argc, char *argv[]) { auto const neb = wcpuEB->bareProduct().size(); //like cpu_eb_size but set to constant auto const nee = wcpuEE->bareProduct().size(); //like cpu_ee_size but set to constant */ - + uint 
selected_gpu_eb_size = 0; uint selected_gpu_ee_size = 0; - + uint positive_gpu_eb_size = 0; uint positive_gpu_ee_size = 0; - + // EB: - for (uint32_t i=0; ibareProduct().did[i]; // set the did for the current RecHit + for (uint32_t i = 0; i < gpu_eb_size; ++i) { + auto const did_gpu = wgpuEB->bareProduct().did[i]; // set the did for the current RecHit // Set the variables for GPU - auto const enr_gpu = wgpuEB->bareProduct().energy[i]; + auto const enr_gpu = wgpuEB->bareProduct().energy[i]; auto const chi2_gpu = wgpuEB->bareProduct().chi2[i]; - auto const flag_gpu = wgpuEB->bareProduct().flagBits[i]; + auto const flag_gpu = wgpuEB->bareProduct().flagBits[i]; auto const extra_gpu = wgpuEB->bareProduct().extra[i]; - + // you have "-1" if the crystal is not selected - if ( enr_gpu>=0 ) { + if (enr_gpu >= 0) { selected_gpu_eb_size++; - - if ( enr_gpu>0 ) { + + if (enr_gpu > 0) { positive_gpu_eb_size++; } - + // find the Rechit on CPU reflecting the same did - auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); + auto const cpu_iter = wcpuEB->bareProduct().find(DetId{did_gpu}); if (cpu_iter == wcpuEB->bareProduct().end()) { // std::cerr << ie << ordinal[ie % 10] << " entry\n" // << " Did not find a DetId " << did_gpu_eb @@ -237,42 +269,42 @@ int main(int argc, char *argv[]) { // Set the variables for CPU auto const enr_cpu = cpu_iter->energy(); auto const chi2_cpu = cpu_iter->chi2(); -// auto const flag_cpu = cpu_iter->flagBits(); + // auto const flag_cpu = cpu_iter->flagBits(); auto const flag_cpu = 1; -// auto const extra_cpu = cpu_iter->extra(); + // auto const extra_cpu = cpu_iter->extra(); auto const extra_cpu = 1; // auto const flag_cpu = cpu_iter->flagBits() ? cpu_iter->flagBits():-1; // auto const extra_cpu = cpu_iter->extra() ? 
cpu_iter->extra():-1; - + // AM: TEST // if (extra_cpu != 10) continue; - + // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta hEnergiesEBGPU->Fill(enr_gpu); hEnergiesEBCPU->Fill(enr_cpu); // std::cout<<"EB CPU Energy:\t"<Fill(enr_cpu, enr_gpu); - hEnergiesEBGPUCPUratio->Fill(enr_gpu/enr_cpu); - hEnergiesEBdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu); - + hEnergiesEBGPUCPUratio->Fill(enr_gpu / enr_cpu); + hEnergiesEBdeltavsCPU->Fill(enr_cpu, enr_gpu - enr_cpu); + hChi2EBGPU->Fill(chi2_gpu); hChi2EBCPU->Fill(chi2_cpu); hChi2EBGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EBGPUCPUratio->Fill(chi2_gpu/chi2_cpu); - hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); - + hChi2EBGPUCPUratio->Fill(chi2_gpu / chi2_cpu); + hChi2EBdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); + hFlagsEBGPU->Fill(flag_gpu); hFlagsEBCPU->Fill(flag_cpu); hFlagsEBGPUvsCPU->Fill(flag_cpu, flag_gpu); - hFlagsEBGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1); - hFlagsEBdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu); - + hFlagsEBGPUCPUratio->Fill(flag_cpu ? flag_gpu / flag_cpu : -1); + hFlagsEBdeltavsCPU->Fill(flag_cpu, flag_gpu - flag_cpu); + hExtrasEBGPU->Fill(extra_gpu); hExtrasEBCPU->Fill(extra_cpu); hExtrasEBGPUvsCPU->Fill(extra_cpu, extra_gpu); - hExtrasEBGPUCPUratio->Fill(extra_cpu ? extra_gpu/extra_cpu : -1); - hExtrasEBdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu); - + hExtrasEBGPUCPUratio->Fill(extra_cpu ? 
extra_gpu / extra_cpu : -1); + hExtrasEBdeltavsCPU->Fill(extra_cpu, extra_gpu - extra_cpu); + // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or // (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)) @@ -281,30 +313,29 @@ int main(int argc, char *argv[]) { // ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu); // if (std::isnan(chi2_gpu)) // printf("*** nan ***\n"); - // } - + // } } } - + // EE: - for (uint32_t i=0; ibareProduct().did[i]; // set the did for the current RecHit + for (uint32_t i = 0; i < gpu_ee_size; ++i) { + auto const did_gpu = wgpuEE->bareProduct().did[i]; // set the did for the current RecHit // Set the variables for GPU - auto const enr_gpu = wgpuEE->bareProduct().energy[i]; + auto const enr_gpu = wgpuEE->bareProduct().energy[i]; auto const chi2_gpu = wgpuEE->bareProduct().chi2[i]; - auto const flag_gpu = wgpuEE->bareProduct().flagBits[i]; + auto const flag_gpu = wgpuEE->bareProduct().flagBits[i]; auto const extra_gpu = wgpuEE->bareProduct().extra[i]; - + // you have "-1" if the crystal is not selected - if ( enr_gpu>=0 ) { + if (enr_gpu >= 0) { selected_gpu_ee_size++; - - if ( enr_gpu>0 ) { + + if (enr_gpu > 0) { positive_gpu_ee_size++; } - + // find the Rechit on CPU reflecting the same did - auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); + auto const cpu_iter = wcpuEE->bareProduct().find(DetId{did_gpu}); if (cpu_iter == wcpuEE->bareProduct().end()) { // std::cerr << ie << ordinal[ie % 10] << " entry\n" // << " Did not find a DetId " << did_gpu @@ -315,43 +346,41 @@ int main(int argc, char *argv[]) { // Set the variables for CPU auto const enr_cpu = cpu_iter->energy(); auto const chi2_cpu = cpu_iter->chi2(); -// auto const flag_cpu = cpu_iter->flagBits(); + // auto const flag_cpu = cpu_iter->flagBits(); auto const flag_cpu = 1; -// auto const extra_cpu = cpu_iter->extra(); + // auto const 
extra_cpu = cpu_iter->extra(); auto const extra_cpu = 1; // auto const flag_cpu = cpu_iter->flagBits()?cpu_iter->flagBits():-1; // auto const extra_cpu = cpu_iter->extra()?cpu_iter->extra():-1; - - + // AM: TEST // if (extra_cpu != 10) continue; - - + // Fill the energy and Chi2 histograms for GPU and CPU and their comparisons with delta hEnergiesEEGPU->Fill(enr_gpu); hEnergiesEECPU->Fill(enr_cpu); hEnergiesEEGPUvsCPU->Fill(enr_cpu, enr_gpu); - hEnergiesEEGPUCPUratio->Fill(enr_gpu/enr_cpu); - hEnergiesEEdeltavsCPU->Fill(enr_cpu, enr_gpu-enr_cpu); - + hEnergiesEEGPUCPUratio->Fill(enr_gpu / enr_cpu); + hEnergiesEEdeltavsCPU->Fill(enr_cpu, enr_gpu - enr_cpu); + hChi2EEGPU->Fill(chi2_gpu); hChi2EECPU->Fill(chi2_cpu); hChi2EEGPUvsCPU->Fill(chi2_cpu, chi2_gpu); - hChi2EEGPUCPUratio->Fill(chi2_gpu/chi2_cpu); - hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu-chi2_cpu); - + hChi2EEGPUCPUratio->Fill(chi2_gpu / chi2_cpu); + hChi2EEdeltavsCPU->Fill(chi2_cpu, chi2_gpu - chi2_cpu); + hFlagsEEGPU->Fill(flag_gpu); hFlagsEECPU->Fill(flag_cpu); hFlagsEEGPUvsCPU->Fill(flag_cpu, flag_gpu); - hFlagsEEGPUCPUratio->Fill(flag_cpu ? flag_gpu/flag_cpu : -1); - hFlagsEEdeltavsCPU->Fill(flag_cpu, flag_gpu-flag_cpu); - + hFlagsEEGPUCPUratio->Fill(flag_cpu ? flag_gpu / flag_cpu : -1); + hFlagsEEdeltavsCPU->Fill(flag_cpu, flag_gpu - flag_cpu); + hExtrasEEGPU->Fill(extra_gpu); hExtrasEECPU->Fill(extra_cpu); hExtrasEEGPUvsCPU->Fill(extra_cpu, extra_gpu); - hExtrasEEGPUCPUratio->Fill(extra_cpu ? extra_gpu/extra_cpu : -1); - hExtrasEEdeltavsCPU->Fill(extra_cpu, extra_gpu-extra_cpu); - + hExtrasEEGPUCPUratio->Fill(extra_cpu ? 
extra_gpu / extra_cpu : -1); + hExtrasEEdeltavsCPU->Fill(extra_cpu, extra_gpu - extra_cpu); + // Check if abs difference between GPU and CPU values for energy and Chi2 are smaller than eps, if not print message // if ((std::abs(enr_gpu - enr_cpu) >= eps_diff) or // (std::abs(chi2_gpu - chi2_cpu) >= eps_diff) or std::isnan(chi2_gpu)) @@ -360,17 +389,16 @@ int main(int argc, char *argv[]) { // ie, i, enr_gpu, enr_cpu, chi2_gpu, chi2_cpu); // if (std::isnan(chi2_gpu)) // printf("*** nan ***\n"); - // } + // } } } - - + // // now the rechit counting // - float selected_eb_ratio = (float) selected_gpu_eb_size/cpu_eb_size; - float selected_ee_ratio = (float) selected_gpu_ee_size/cpu_ee_size; - + float selected_eb_ratio = (float)selected_gpu_eb_size / cpu_eb_size; + float selected_ee_ratio = (float)selected_gpu_ee_size / cpu_ee_size; + // Filling up the histograms on events sizes for EB and EE on both GPU and CPU hSelectedRechitsEBGPU->Fill(selected_gpu_eb_size); hSelectedRechitsEBCPU->Fill(cpu_eb_size); @@ -380,37 +408,34 @@ int main(int argc, char *argv[]) { hSelectedRechitsEEGPUvsCPU->Fill(cpu_ee_size, selected_gpu_ee_size); hSelectedRechitsEBGPUCPUratio->Fill(selected_eb_ratio); hSelectedRechitsEEGPUCPUratio->Fill(selected_ee_ratio); - hSelectedRechitsEBdeltavsCPU->Fill(cpu_eb_size, selected_gpu_eb_size-cpu_eb_size); - hSelectedRechitsEEdeltavsCPU->Fill(cpu_ee_size, selected_gpu_ee_size-cpu_ee_size); - - + hSelectedRechitsEBdeltavsCPU->Fill(cpu_eb_size, selected_gpu_eb_size - cpu_eb_size); + hSelectedRechitsEEdeltavsCPU->Fill(cpu_ee_size, selected_gpu_ee_size - cpu_ee_size); + // // now the rechit counting // - - + uint positive_cpu_eb_size = 0; uint positive_cpu_ee_size = 0; - + // EB: - for (uint32_t i=0; ibareProduct()[i].energy(); + for (uint32_t i = 0; i < cpu_eb_size; ++i) { + auto const enr_cpu = wcpuEB->bareProduct()[i].energy(); if (enr_cpu > 0) { positive_cpu_eb_size++; } } // EE: - for (uint32_t i=0; ibareProduct()[i].energy(); + for (uint32_t i = 0; i < 
cpu_ee_size; ++i) { + auto const enr_cpu = wcpuEE->bareProduct()[i].energy(); if (enr_cpu > 0) { positive_cpu_ee_size++; } } - - - float positive_eb_ratio = (float) positive_gpu_eb_size/positive_cpu_eb_size; - float positive_ee_ratio = (float) positive_gpu_ee_size/positive_cpu_ee_size; - + + float positive_eb_ratio = (float)positive_gpu_eb_size / positive_cpu_eb_size; + float positive_ee_ratio = (float)positive_gpu_ee_size / positive_cpu_ee_size; + // Filling up the histograms on events sizes for EB and EE on both GPU and CPU hPositiveRechitsEBGPU->Fill(positive_gpu_eb_size); hPositiveRechitsEBCPU->Fill(positive_cpu_eb_size); @@ -420,25 +445,19 @@ int main(int argc, char *argv[]) { hPositiveRechitsEEGPUvsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size); hPositiveRechitsEBGPUCPUratio->Fill(positive_eb_ratio); hPositiveRechitsEEGPUCPUratio->Fill(positive_ee_ratio); - hPositiveRechitsEBdeltavsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size-positive_cpu_eb_size); - hPositiveRechitsEEdeltavsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size-positive_cpu_ee_size); - - - + hPositiveRechitsEBdeltavsCPU->Fill(positive_cpu_eb_size, positive_gpu_eb_size - positive_cpu_eb_size); + hPositiveRechitsEEdeltavsCPU->Fill(positive_cpu_ee_size, positive_gpu_ee_size - positive_cpu_ee_size); + if (cpu_eb_size != selected_gpu_eb_size or cpu_ee_size != selected_gpu_ee_size) { // std::cerr << ie << ordinal[ie % 10] << " entry:\n" std::cerr << ie << " entry:\n" - << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << selected_gpu_eb_size << " (gpu)\n" - << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << selected_gpu_ee_size << " (gpu)" << std::endl; + << " EB size: " << std::setw(4) << cpu_eb_size << " (cpu) vs " << std::setw(4) << selected_gpu_eb_size + << " (gpu)\n" + << " EE size: " << std::setw(4) << cpu_ee_size << " (cpu) vs " << std::setw(4) << selected_gpu_ee_size + << " (gpu)" << std::endl; } - - - } - - - - + // 
Plotting the results: { // Canvases Setup: @@ -456,10 +475,8 @@ int main(int argc, char *argv[]) { cFlags.Divide(3, 2); TCanvas cExtras("Extras", "Extras", 1750, 860); cExtras.Divide(3, 2); - - - - // Plotting the sizes of GPU vs CPU for each event of EB + + // Plotting the sizes of GPU vs CPU for each event of EB cAllRechits.cd(1); { gPad->SetLogy(); @@ -470,12 +487,12 @@ int main(int argc, char *argv[]) { hRechitsEBGPU->SetLineWidth(2); hRechitsEBGPU->Draw("sames"); cAllRechits.Update(); - auto stats = (TPaveStats*)hRechitsEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hRechitsEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } + stats->SetY1NDC(y1 - (y2 - y1)); + } cAllRechits.cd(4); { gPad->SetLogy(); @@ -486,36 +503,38 @@ int main(int argc, char *argv[]) { hRechitsEEGPU->SetLineWidth(2); hRechitsEEGPU->Draw("sames"); cAllRechits.Update(); - auto stats = (TPaveStats*)hRechitsEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hRechitsEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } - cAllRechits.cd(2); { + cAllRechits.cd(2); + { gStyle->SetPalette(55); hRechitsEBGPUvsCPU->Draw("COLZ"); } - cAllRechits.cd(5); { + cAllRechits.cd(5); + { gStyle->SetPalette(55); hRechitsEEGPUvsCPU->Draw("COLZ"); } - cAllRechits.cd(3); { + cAllRechits.cd(3); + { gPad->SetLogy(); //hRechitsEBdeltavsCPU->Draw("COLZ"); hRechitsEBGPUCPUratio->Draw(""); } - cAllRechits.cd(6); { + cAllRechits.cd(6); + { gPad->SetLogy(); //hRechitsEEdeltavsCPU->Draw("COLZ"); hRechitsEEGPUCPUratio->Draw(""); } cAllRechits.SaveAs("ecal-allrechits.root"); cAllRechits.SaveAs("ecal-allrechits.png"); - - - - // Plotting the sizes of GPU vs CPU for each event of EB + + // Plotting the sizes of GPU vs CPU for each event of EB cRechits.cd(1); { gPad->SetLogy(); @@ -526,12 +545,12 @@ 
int main(int argc, char *argv[]) { hSelectedRechitsEBGPU->SetLineWidth(2); hSelectedRechitsEBGPU->Draw("sames"); cRechits.Update(); - auto stats = (TPaveStats*)hSelectedRechitsEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hSelectedRechitsEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } + stats->SetY1NDC(y1 - (y2 - y1)); + } cRechits.cd(4); { gPad->SetLogy(); @@ -542,37 +561,38 @@ int main(int argc, char *argv[]) { hSelectedRechitsEEGPU->SetLineWidth(2); hSelectedRechitsEEGPU->Draw("sames"); cRechits.Update(); - auto stats = (TPaveStats*)hSelectedRechitsEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hSelectedRechitsEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } - cRechits.cd(2); { + cRechits.cd(2); + { gStyle->SetPalette(55); hSelectedRechitsEBGPUvsCPU->Draw("COLZ"); } - cRechits.cd(5); { + cRechits.cd(5); + { gStyle->SetPalette(55); hSelectedRechitsEEGPUvsCPU->Draw("COLZ"); } - cRechits.cd(3); { + cRechits.cd(3); + { gPad->SetLogy(); //hSelectedRechitsEBdeltavsCPU->Draw("COLZ"); hSelectedRechitsEBGPUCPUratio->Draw(""); } - cRechits.cd(6); { + cRechits.cd(6); + { gPad->SetLogy(); //hSelectedRechitsEEdeltavsCPU->Draw("COLZ"); hSelectedRechitsEEGPUCPUratio->Draw(""); } cRechits.SaveAs("ecal-rechits.root"); cRechits.SaveAs("ecal-rechits.png"); - - - - - // Plotting the sizes of GPU vs CPU for each event of EB + + // Plotting the sizes of GPU vs CPU for each event of EB cRechitsPositive.cd(1); { gPad->SetLogy(); @@ -583,12 +603,12 @@ int main(int argc, char *argv[]) { hPositiveRechitsEBGPU->SetLineWidth(2); hPositiveRechitsEBGPU->Draw("sames"); cRechitsPositive.Update(); - auto stats = (TPaveStats*)hPositiveRechitsEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hPositiveRechitsEBGPU->FindObject("stats"); auto y2 = 
stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } + stats->SetY1NDC(y1 - (y2 - y1)); + } cRechitsPositive.cd(4); { gPad->SetLogy(); @@ -599,34 +619,37 @@ int main(int argc, char *argv[]) { hPositiveRechitsEEGPU->SetLineWidth(2); hPositiveRechitsEEGPU->Draw("sames"); cRechitsPositive.Update(); - auto stats = (TPaveStats*)hPositiveRechitsEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hPositiveRechitsEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } - cRechitsPositive.cd(2); { + cRechitsPositive.cd(2); + { gStyle->SetPalette(55); hPositiveRechitsEBGPUvsCPU->Draw("COLZ"); } - cRechitsPositive.cd(5); { + cRechitsPositive.cd(5); + { gStyle->SetPalette(55); hPositiveRechitsEEGPUvsCPU->Draw("COLZ"); } - cRechitsPositive.cd(3); { + cRechitsPositive.cd(3); + { gPad->SetLogy(); //hPositiveRechitsEBdeltavsCPU->Draw("COLZ"); hPositiveRechitsEBGPUCPUratio->Draw(""); } - cRechitsPositive.cd(6); { + cRechitsPositive.cd(6); + { gPad->SetLogy(); //hPositiveRechitsEEdeltavsCPU->Draw("COLZ"); hPositiveRechitsEEGPUCPUratio->Draw(""); } cRechitsPositive.SaveAs("ecal-rechits-positive.root"); cRechitsPositive.SaveAs("ecal-rechits-positive.png"); - - + cEnergies.cd(1); { gPad->SetLogy(); @@ -637,11 +660,11 @@ int main(int argc, char *argv[]) { hEnergiesEBGPU->SetLineWidth(2); hEnergiesEBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hEnergiesEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hEnergiesEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } cEnergies.cd(4); { @@ -653,32 +676,31 @@ int main(int argc, char *argv[]) { hEnergiesEEGPU->SetLineWidth(2); hEnergiesEEGPU->Draw("sames"); gPad->Update(); - auto stats = 
(TPaveStats*)hEnergiesEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hEnergiesEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - cEnergies.cd(2); { - hEnergiesEBGPUvsCPU->Draw("COLZ"); + stats->SetY1NDC(y1 - (y2 - y1)); } - cEnergies.cd(5); { - hEnergiesEEGPUvsCPU->Draw("COLZ"); - } - cEnergies.cd(3); { + cEnergies.cd(2); + { hEnergiesEBGPUvsCPU->Draw("COLZ"); } + cEnergies.cd(5); + { hEnergiesEEGPUvsCPU->Draw("COLZ"); } + cEnergies.cd(3); + { gPad->SetLogy(); //hEnergiesEBdeltavsCPU->Draw("COLZ"); hEnergiesEBGPUCPUratio->Draw(""); } - cEnergies.cd(6); { + cEnergies.cd(6); + { gPad->SetLogy(); //hEnergiesEEdeltavsCPU->Draw("COLZ"); hEnergiesEEGPUCPUratio->Draw(""); } cEnergies.SaveAs("ecal-energies.root"); cEnergies.SaveAs("ecal-energies.png"); - - + cChi2.cd(1); { gPad->SetLogy(); @@ -689,11 +711,11 @@ int main(int argc, char *argv[]) { hChi2EBGPU->SetLineWidth(2); hChi2EBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hChi2EBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hChi2EBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } cChi2.cd(4); { @@ -705,32 +727,31 @@ int main(int argc, char *argv[]) { hChi2EEGPU->SetLineWidth(2); hChi2EEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hChi2EEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hChi2EEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - cChi2.cd(2); { - hChi2EBGPUvsCPU->Draw("COLZ"); - } - cChi2.cd(5); { - hChi2EEGPUvsCPU->Draw("COLZ"); + stats->SetY1NDC(y1 - (y2 - y1)); } - cChi2.cd(3); { + cChi2.cd(2); + { hChi2EBGPUvsCPU->Draw("COLZ"); } + cChi2.cd(5); + { hChi2EEGPUvsCPU->Draw("COLZ"); } + cChi2.cd(3); + { gPad->SetLogy(); 
//hChi2EBdeltavsCPU->Draw("COLZ"); hChi2EBGPUCPUratio->Draw(""); } - cChi2.cd(6); { + cChi2.cd(6); + { gPad->SetLogy(); //hChi2EEdeltavsCPU->Draw("COLZ"); hChi2EEGPUCPUratio->Draw(""); } cChi2.SaveAs("ecal-chi2.root"); cChi2.SaveAs("ecal-chi2.png"); - - + cFlags.cd(1); { gPad->SetLogy(); @@ -741,11 +762,11 @@ int main(int argc, char *argv[]) { hFlagsEBGPU->SetLineWidth(2); hFlagsEBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hFlagsEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hFlagsEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } cFlags.cd(4); { @@ -757,32 +778,31 @@ int main(int argc, char *argv[]) { hFlagsEEGPU->SetLineWidth(2); hFlagsEEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hFlagsEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hFlagsEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } - cFlags.cd(2); { - hFlagsEBGPUvsCPU->Draw("COLZ"); - } - cFlags.cd(5); { - hFlagsEEGPUvsCPU->Draw("COLZ"); - } - cFlags.cd(3); { + cFlags.cd(2); + { hFlagsEBGPUvsCPU->Draw("COLZ"); } + cFlags.cd(5); + { hFlagsEEGPUvsCPU->Draw("COLZ"); } + cFlags.cd(3); + { gPad->SetLogy(); //hFlagsEBdeltavsCPU->Draw("COLZ"); hFlagsEBGPUCPUratio->Draw(""); } - cFlags.cd(6); { + cFlags.cd(6); + { gPad->SetLogy(); //hFlagsEEdeltavsCPU->Draw("COLZ"); hFlagsEEGPUCPUratio->Draw(""); } cFlags.SaveAs("ecal-flags.root"); cFlags.SaveAs("ecal-flags.png"); - - + cExtras.cd(1); { gPad->SetLogy(); @@ -793,11 +813,11 @@ int main(int argc, char *argv[]) { hExtrasEBGPU->SetLineWidth(2); hExtrasEBGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hExtrasEBGPU->FindObject("stats"); + auto stats = (TPaveStats *)hExtrasEBGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = 
stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); + stats->SetY1NDC(y1 - (y2 - y1)); } cExtras.cd(4); { @@ -809,36 +829,36 @@ int main(int argc, char *argv[]) { hExtrasEEGPU->SetLineWidth(2); hExtrasEEGPU->Draw("sames"); gPad->Update(); - auto stats = (TPaveStats*)hExtrasEEGPU->FindObject("stats"); + auto stats = (TPaveStats *)hExtrasEEGPU->FindObject("stats"); auto y2 = stats->GetY2NDC(); auto y1 = stats->GetY1NDC(); stats->SetY2NDC(y1); - stats->SetY1NDC(y1 - (y2-y1)); - } - cExtras.cd(2); { - hExtrasEBGPUvsCPU->Draw("COLZ"); + stats->SetY1NDC(y1 - (y2 - y1)); } - cExtras.cd(5); { - hExtrasEEGPUvsCPU->Draw("COLZ"); - } - cExtras.cd(3); { + cExtras.cd(2); + { hExtrasEBGPUvsCPU->Draw("COLZ"); } + cExtras.cd(5); + { hExtrasEEGPUvsCPU->Draw("COLZ"); } + cExtras.cd(3); + { gPad->SetLogy(); //hExtrasEBdeltavsCPU->Draw("COLZ"); hExtrasEBGPUCPUratio->Draw(""); } - cExtras.cd(6); { + cExtras.cd(6); + { gPad->SetLogy(); //hExtrasEEdeltavsCPU->Draw("COLZ"); hExtrasEEGPUCPUratio->Draw(""); } cExtras.SaveAs("ecal-extras.root"); cExtras.SaveAs("ecal-extras.png"); - } - + } + // Close all open files rf.Close(); rfout.Write(); rfout.Close(); - + return 0; } diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h index 92d4bee3100f3..80a3f838e9de9 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h @@ -11,9 +11,9 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" -// +// // ECAL UncalibRechit producer -// +// #include "CondFormats/EcalObjects/interface/EcalWeightSet.h" #include "CondFormats/EcalObjects/interface/EcalPedestals.h" @@ -31,9 +31,9 @@ #include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" -// +// // ECAL Rechit producer -// +// #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" @@ 
-49,9 +49,6 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" - - - struct EcalPulseShape; class EcalSampleMask; class EcalTimeBiasCorrections; @@ -62,347 +59,289 @@ class EcalSamplesCorrelation; class EBDigiCollection; class EEDigiCollection; -namespace ecal { namespace multifit { - -enum class TimeComputationState : char { - NotFinished = 0, - Finished = 1 -}; -enum class MinimizationState : char { - NotFinished = 0, - Finished = 1, - Precomputed = 2, -}; +namespace ecal { + namespace multifit { -// -struct EventInputDataGPU { - ecal::DigisCollection const& ebDigis; - ecal::DigisCollection const& eeDigis; -}; + enum class TimeComputationState : char { NotFinished = 0, Finished = 1 }; + enum class MinimizationState : char { + NotFinished = 0, + Finished = 1, + Precomputed = 2, + }; -// parameters have a fixed type -// Can we go by with single precision -struct ConfigurationParameters { - using type = double; - // device ptrs - type *amplitudeFitParametersEB=nullptr, *amplitudeFitParametersEE=nullptr; + // + struct EventInputDataGPU { + ecal::DigisCollection const& ebDigis; + ecal::DigisCollection const& eeDigis; + }; - uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE; - // device ptrs - type *timeFitParametersEB=nullptr, *timeFitParametersEE=nullptr; + // parameters have a fixed type + // Can we go by with single precision + struct ConfigurationParameters { + using type = double; + // device ptrs + type *amplitudeFitParametersEB = nullptr, *amplitudeFitParametersEE = nullptr; - type timeFitLimitsFirstEB, timeFitLimitsFirstEE; - type timeFitLimitsSecondEB, timeFitLimitsSecondEE; + uint32_t timeFitParametersSizeEB, timeFitParametersSizeEE; + // device ptrs + type *timeFitParametersEB = nullptr, *timeFitParametersEE = nullptr; - type timeConstantTermEB, timeConstantTermEE; + type timeFitLimitsFirstEB, timeFitLimitsFirstEE; + type timeFitLimitsSecondEB, 
timeFitLimitsSecondEE; - type timeNconstEB, timeNconstEE; + type timeConstantTermEB, timeConstantTermEE; - type amplitudeThreshEE, amplitudeThreshEB; + type timeNconstEB, timeNconstEE; - type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB; - type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE; - type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE; - type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB; + type amplitudeThreshEE, amplitudeThreshEB; - std::array kernelMinimizeThreads; + type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB; + type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE; + type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE; + type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB; - bool shouldRunTimingComputation; -}; + std::array kernelMinimizeThreads; -struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> -{ - void allocate(ConfigurationParameters const& configParameters, uint32_t size) { - cudaCheck( cudaMalloc((void**)&litudesAll, - size * sizeof(SampleVector)) ); - cudaCheck( cudaMalloc((void**)&litude, - size * sizeof(::ecal::reco::StorageScalarType)) ); - cudaCheck( cudaMalloc((void**)&chi2, - size * sizeof(::ecal::reco::StorageScalarType)) ); - cudaCheck( cudaMalloc((void**)&pedestal, - size * sizeof(::ecal::reco::StorageScalarType)) ); + bool shouldRunTimingComputation; + }; + struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> { + void allocate(ConfigurationParameters const& configParameters, uint32_t size) { + cudaCheck(cudaMalloc((void**)&litudesAll, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&litude, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&pedestal, size * sizeof(::ecal::reco::StorageScalarType))); if (configParameters.shouldRunTimingComputation) { - cudaCheck( cudaMalloc((void**)&jitter, - size * sizeof(::ecal::reco::StorageScalarType)) ); 
- cudaCheck( cudaMalloc((void**)&jitterError, - size * sizeof(::ecal::reco::StorageScalarType)) ); + cudaCheck(cudaMalloc((void**)&jitter, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&jitterError, size * sizeof(::ecal::reco::StorageScalarType))); } - cudaCheck( cudaMalloc((void**)&did, - size * sizeof(uint32_t)) ); - cudaCheck( cudaMalloc((void**)&flags, - size * sizeof(uint32_t)) ); - } - - void deallocate(ConfigurationParameters const& configParameters) { - cudaCheck( cudaFree(amplitudesAll) ); - cudaCheck( cudaFree(amplitude) ); - cudaCheck( cudaFree(chi2) ); - cudaCheck( cudaFree(pedestal) ); + cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&flags, size * sizeof(uint32_t))); + } + + void deallocate(ConfigurationParameters const& configParameters) { + cudaCheck(cudaFree(amplitudesAll)); + cudaCheck(cudaFree(amplitude)); + cudaCheck(cudaFree(chi2)); + cudaCheck(cudaFree(pedestal)); if (configParameters.shouldRunTimingComputation) { - cudaCheck( cudaFree(jitter) ); - cudaCheck( cudaFree(jitterError) ); + cudaCheck(cudaFree(jitter)); + cudaCheck(cudaFree(jitterError)); } - cudaCheck( cudaFree(did) ); - cudaCheck( cudaFree(flags) ); - } -}; - -struct EventDataForScratchGPU { - SampleVector *samples = nullptr; - SampleGainVector *gainsNoise = nullptr; - - SampleMatrix* noisecov = nullptr; - PulseMatrixType *pulse_matrix = nullptr; - BXVectorType *activeBXs = nullptr; - char *acState = nullptr; - - bool *hasSwitchToGain6=nullptr, - *hasSwitchToGain1=nullptr, - *isSaturated=nullptr; - - SampleVector::Scalar *sample_values, *sample_value_errors; - bool *useless_sample_values; - SampleVector::Scalar* chi2sNullHypot; - SampleVector::Scalar* sum0sNullHypot; - SampleVector::Scalar* sumAAsNullHypot; - char* pedestal_nums; - SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas; - SampleVector::Scalar *accTimeMax, *accTimeWgt; - SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError; - 
SampleVector::Scalar *timeMax, *timeError; - TimeComputationState *tcState; - - void allocate(ConfigurationParameters const& configParameters, uint32_t size) { - cudaCheck( cudaMalloc((void**)&samples, - size * sizeof(SampleVector)) ); - cudaCheck( cudaMalloc((void**)&gainsNoise, - size * sizeof(SampleGainVector)) ); - - cudaCheck( cudaMalloc((void**)&noisecov, - size * sizeof(SampleMatrix)) ); - cudaCheck( cudaMalloc((void**)&pulse_matrix, - size * sizeof(PulseMatrixType)) ); - cudaCheck( cudaMalloc((void**)&activeBXs, - size * sizeof(BXVectorType)) ); - cudaCheck( cudaMalloc((void**)&acState, - size * sizeof(char)) ); - - cudaCheck( cudaMalloc((void**)&hasSwitchToGain6, - size * sizeof(bool)) ); - cudaCheck( cudaMalloc((void**)&hasSwitchToGain1, - size * sizeof(bool)) ); - cudaCheck( cudaMalloc((void**)&isSaturated, - size * sizeof(bool)) ); + cudaCheck(cudaFree(did)); + cudaCheck(cudaFree(flags)); + } + }; + + struct EventDataForScratchGPU { + SampleVector* samples = nullptr; + SampleGainVector* gainsNoise = nullptr; + + SampleMatrix* noisecov = nullptr; + PulseMatrixType* pulse_matrix = nullptr; + BXVectorType* activeBXs = nullptr; + char* acState = nullptr; + + bool *hasSwitchToGain6 = nullptr, *hasSwitchToGain1 = nullptr, *isSaturated = nullptr; + + SampleVector::Scalar *sample_values, *sample_value_errors; + bool* useless_sample_values; + SampleVector::Scalar* chi2sNullHypot; + SampleVector::Scalar* sum0sNullHypot; + SampleVector::Scalar* sumAAsNullHypot; + char* pedestal_nums; + SampleVector::Scalar *tMaxAlphaBetas, *tMaxErrorAlphaBetas; + SampleVector::Scalar *accTimeMax, *accTimeWgt; + SampleVector::Scalar *ampMaxAlphaBeta, *ampMaxError; + SampleVector::Scalar *timeMax, *timeError; + TimeComputationState* tcState; + + void allocate(ConfigurationParameters const& configParameters, uint32_t size) { + cudaCheck(cudaMalloc((void**)&samples, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&gainsNoise, size * sizeof(SampleGainVector))); + + 
cudaCheck(cudaMalloc((void**)&noisecov, size * sizeof(SampleMatrix))); + cudaCheck(cudaMalloc((void**)&pulse_matrix, size * sizeof(PulseMatrixType))); + cudaCheck(cudaMalloc((void**)&activeBXs, size * sizeof(BXVectorType))); + cudaCheck(cudaMalloc((void**)&acState, size * sizeof(char))); + + cudaCheck(cudaMalloc((void**)&hasSwitchToGain6, size * sizeof(bool))); + cudaCheck(cudaMalloc((void**)&hasSwitchToGain1, size * sizeof(bool))); + cudaCheck(cudaMalloc((void**)&isSaturated, size * sizeof(bool))); if (configParameters.shouldRunTimingComputation) { - cudaCheck( cudaMalloc((void**)&sample_values, - size * sizeof(SampleVector)) ); - cudaCheck( cudaMalloc((void**)&sample_value_errors, - size * sizeof(SampleVector)) ); - cudaCheck( cudaMalloc((void**)&useless_sample_values, - size * sizeof(bool) * EcalDataFrame::MAXSAMPLES) ); - cudaCheck( cudaMalloc((void**)&chi2sNullHypot, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&sum0sNullHypot, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&sumAAsNullHypot, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&pedestal_nums, - size * sizeof(char)) ); - - cudaCheck( cudaMalloc((void**)&tMaxAlphaBetas, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&tMaxErrorAlphaBetas, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&accTimeMax, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&accTimeWgt, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&MaxAlphaBeta, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&MaxError, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&timeMax, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&timeError, - size * sizeof(SampleVector::Scalar)) ); - cudaCheck( cudaMalloc((void**)&tcState, - size * sizeof(TimeComputationState)) ); + 
cudaCheck(cudaMalloc((void**)&sample_values, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&sample_value_errors, size * sizeof(SampleVector))); + cudaCheck(cudaMalloc((void**)&useless_sample_values, size * sizeof(bool) * EcalDataFrame::MAXSAMPLES)); + cudaCheck(cudaMalloc((void**)&chi2sNullHypot, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&sum0sNullHypot, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&sumAAsNullHypot, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&pedestal_nums, size * sizeof(char))); + + cudaCheck(cudaMalloc((void**)&tMaxAlphaBetas, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&tMaxErrorAlphaBetas, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&accTimeMax, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&accTimeWgt, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&MaxAlphaBeta, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&MaxError, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&timeMax, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&timeError, size * sizeof(SampleVector::Scalar))); + cudaCheck(cudaMalloc((void**)&tcState, size * sizeof(TimeComputationState))); } - } + } - void deallocate(ConfigurationParameters const& configParameters) { - cudaCheck( cudaFree(samples) ); - cudaCheck( cudaFree(gainsNoise) ); + void deallocate(ConfigurationParameters const& configParameters) { + cudaCheck(cudaFree(samples)); + cudaCheck(cudaFree(gainsNoise)); - cudaCheck( cudaFree(noisecov) ); - cudaCheck( cudaFree(pulse_matrix) ); - cudaCheck( cudaFree(activeBXs) ); - cudaCheck( cudaFree(acState) ); + cudaCheck(cudaFree(noisecov)); + cudaCheck(cudaFree(pulse_matrix)); + cudaCheck(cudaFree(activeBXs)); + cudaCheck(cudaFree(acState)); - cudaCheck( cudaFree(hasSwitchToGain6) ); - cudaCheck( 
cudaFree(hasSwitchToGain1) ); - cudaCheck( cudaFree(isSaturated) ); + cudaCheck(cudaFree(hasSwitchToGain6)); + cudaCheck(cudaFree(hasSwitchToGain1)); + cudaCheck(cudaFree(isSaturated)); if (configParameters.shouldRunTimingComputation) { - cudaCheck( cudaFree(sample_values) ); - cudaCheck( cudaFree(sample_value_errors) ); - cudaCheck( cudaFree(useless_sample_values) ); - cudaCheck( cudaFree(chi2sNullHypot) ); - cudaCheck( cudaFree(sum0sNullHypot) ); - cudaCheck( cudaFree(sumAAsNullHypot) ); - cudaCheck( cudaFree(pedestal_nums) ); - - cudaCheck( cudaFree(tMaxAlphaBetas) ); - cudaCheck( cudaFree(tMaxErrorAlphaBetas) ); - cudaCheck( cudaFree(accTimeMax) ); - cudaCheck( cudaFree(accTimeWgt) ); - cudaCheck( cudaFree(ampMaxAlphaBeta) ); - cudaCheck( cudaFree(ampMaxError) ); - cudaCheck( cudaFree(timeMax) ); - cudaCheck( cudaFree(timeError) ); - cudaCheck( cudaFree(tcState) ); + cudaCheck(cudaFree(sample_values)); + cudaCheck(cudaFree(sample_value_errors)); + cudaCheck(cudaFree(useless_sample_values)); + cudaCheck(cudaFree(chi2sNullHypot)); + cudaCheck(cudaFree(sum0sNullHypot)); + cudaCheck(cudaFree(sumAAsNullHypot)); + cudaCheck(cudaFree(pedestal_nums)); + + cudaCheck(cudaFree(tMaxAlphaBetas)); + cudaCheck(cudaFree(tMaxErrorAlphaBetas)); + cudaCheck(cudaFree(accTimeMax)); + cudaCheck(cudaFree(accTimeWgt)); + cudaCheck(cudaFree(ampMaxAlphaBeta)); + cudaCheck(cudaFree(ampMaxError)); + cudaCheck(cudaFree(timeMax)); + cudaCheck(cudaFree(timeError)); + cudaCheck(cudaFree(tcState)); } - } -}; - -// const refs products to conditions -struct ConditionsProducts { - EcalPedestalsGPU::Product const& pedestals; - EcalGainRatiosGPU::Product const& gainRatios; - EcalPulseShapesGPU::Product const& pulseShapes; - EcalPulseCovariancesGPU::Product const& pulseCovariances; - EcalSamplesCorrelationGPU::Product const& samplesCorrelation; - EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections; - EcalTimeCalibConstantsGPU::Product const& timeCalibConstants; - EcalSampleMask const& 
sampleMask; - EcalTimeOffsetConstant const& timeOffsetConstant; - uint32_t offsetForHashes; -}; - -//*/ - -struct xyz { - int x,y,z; -}; - -struct conf_data { - xyz threads; - bool runV1; - cudaStream_t cuStream; -}; - -}} - - - -// + } + }; + + // const refs products to conditions + struct ConditionsProducts { + EcalPedestalsGPU::Product const& pedestals; + EcalGainRatiosGPU::Product const& gainRatios; + EcalPulseShapesGPU::Product const& pulseShapes; + EcalPulseCovariancesGPU::Product const& pulseCovariances; + EcalSamplesCorrelationGPU::Product const& samplesCorrelation; + EcalTimeBiasCorrectionsGPU::Product const& timeBiasCorrections; + EcalTimeCalibConstantsGPU::Product const& timeCalibConstants; + EcalSampleMask const& sampleMask; + EcalTimeOffsetConstant const& timeOffsetConstant; + uint32_t offsetForHashes; + }; + + //*/ + + struct xyz { + int x, y, z; + }; + + struct conf_data { + xyz threads; + bool runV1; + cudaStream_t cuStream; + }; + + } // namespace multifit +} // namespace ecal + +// // ECAL Rechit producer -// +// -namespace ecal { +namespace ecal { namespace rechit { - + // parameters that are read in the configuration file for rechit producer struct ConfigurationParameters { // device ptrs - int *ChannelStatusToBeExcluded=nullptr; + int* ChannelStatusToBeExcluded = nullptr; uint32_t ChannelStatusToBeExcludedSize; - + bool killDeadChannels; - - bool recoverEBIsolatedChannels ; - bool recoverEEIsolatedChannels ; - bool recoverEBVFE ; - bool recoverEEVFE ; - bool recoverEBFE ; - bool recoverEEFE ; - + + bool recoverEBIsolatedChannels; + bool recoverEEIsolatedChannels; + bool recoverEBVFE; + bool recoverEEVFE; + bool recoverEBFE; + bool recoverEEFE; + float EBLaserMIN; float EELaserMIN; float EBLaserMAX; float EELaserMAX; - + // std::vector > v_DB_reco_flags; int* expanded_v_DB_reco_flags; uint32_t* expanded_Sizes_v_DB_reco_flags; uint32_t* expanded_flagbit_v_DB_reco_flags; uint32_t expanded_v_DB_reco_flagsSize; - + uint32_t flagmask; - - - // + + // 
// bool shouldRunTimingComputation; }; - - - - - - + struct EventOutputDataGPU final : public ::ecal::RecHit<::ecal::Tag::ptr> { - void allocate(ConfigurationParameters const& configParameters, uint32_t size) { // void allocate(uint32_t size) { //---- configParameters -> needed only to decide if to save the timing information or not - - cudaCheck( cudaMalloc((void**)&energy, - size * sizeof(::ecal::reco::StorageScalarType)) ); - cudaCheck( cudaMalloc((void**)&time, - size * sizeof(::ecal::reco::StorageScalarType)) ); - cudaCheck( cudaMalloc((void**)&chi2, - size * sizeof(::ecal::reco::StorageScalarType)) ); - cudaCheck( cudaMalloc((void**)&flagBits, - size * sizeof(uint32_t)) ); - cudaCheck( cudaMalloc((void**)&extra, - size * sizeof(uint32_t)) ); - cudaCheck( cudaMalloc((void**)&did, - size * sizeof(uint32_t)) ); + + cudaCheck(cudaMalloc((void**)&energy, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&time, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&chi2, size * sizeof(::ecal::reco::StorageScalarType))); + cudaCheck(cudaMalloc((void**)&flagBits, size * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&extra, size * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&did, size * sizeof(uint32_t))); } - - + void deallocate(ConfigurationParameters const& configParameters) { // void deallocate() { //---- configParameters -> needed only to decide if to save the timing information or not - - cudaCheck( cudaFree(energy) ); - cudaCheck( cudaFree(time) ); - cudaCheck( cudaFree(chi2) ); - cudaCheck( cudaFree(flagBits) ); - cudaCheck( cudaFree(extra) ); - cudaCheck( cudaFree(did) ); + + cudaCheck(cudaFree(energy)); + cudaCheck(cudaFree(time)); + cudaCheck(cudaFree(chi2)); + cudaCheck(cudaFree(flagBits)); + cudaCheck(cudaFree(extra)); + cudaCheck(cudaFree(did)); } }; - - - + struct EventInputDataGPU { ecal::UncalibratedRecHit const& ebUncalibRecHits; ecal::UncalibratedRecHit const& eeUncalibRecHits; }; 
- + // const refs products to conditions struct ConditionsProducts { - EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV ; - EcalIntercalibConstantsGPU::Product const& Intercalib ; - EcalRechitChannelStatusGPU::Product const& ChannelStatus ; - // - EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios ; - EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef ; - EcalLaserAlphasGPU::Product const& LaserAlphas ; - EcalLinearCorrectionsGPU::Product const& LinearCorrections ; - // - // - uint32_t offsetForHashes; + EcalRechitADCToGeVConstantGPU::Product const& ADCToGeV; + EcalIntercalibConstantsGPU::Product const& Intercalib; + EcalRechitChannelStatusGPU::Product const& ChannelStatus; + // + EcalLaserAPDPNRatiosGPU::Product const& LaserAPDPNRatios; + EcalLaserAPDPNRatiosRefGPU::Product const& LaserAPDPNRatiosRef; + EcalLaserAlphasGPU::Product const& LaserAlphas; + EcalLinearCorrectionsGPU::Product const& LinearCorrections; + // + // + uint32_t offsetForHashes; }; - - - - } -} + + } // namespace rechit +} // namespace ecal #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h index ae36aa78c9e45..c59527a6d9f5a 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h @@ -12,33 +12,32 @@ class EcalIntercalibConstantsGPU { public: struct Product { ~Product(); - float *values = nullptr; + float* values = nullptr; }; - - #ifndef __CUDACC__ - // + +#ifndef __CUDACC__ + // EcalIntercalibConstantsGPU(EcalIntercalibConstants const&); - + // will call dealloation for Product thru ~Product ~EcalIntercalibConstantsGPU() = default; - + // get device pointers Product const& getProduct(cudaStream_t) const; - + // TODO: do this centrally // get offset for hashes. 
equals number of barrel items uint32_t getOffset() const { return valuesEB_.size(); } - - // + + // static std::string name() { return std::string{"ecalIntercalibConstantsGPU"}; } - + private: std::vector const& valuesEB_; std::vector const& valuesEE_; - + cms::cuda::ESProduct product_; - #endif +#endif }; - #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h index 53c8ea6ba67b7..9b87c3228e5c7 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h @@ -12,47 +12,42 @@ class EcalLaserAPDPNRatiosGPU { public: struct Product { ~Product(); - float *p1=nullptr; - float *p2=nullptr; - float *p3=nullptr; - edm::TimeValue_t *t1=nullptr; - edm::TimeValue_t *t2=nullptr; - edm::TimeValue_t *t3=nullptr; + float *p1 = nullptr; + float *p2 = nullptr; + float *p3 = nullptr; + edm::TimeValue_t *t1 = nullptr; + edm::TimeValue_t *t2 = nullptr; + edm::TimeValue_t *t3 = nullptr; }; - - #ifndef __CUDACC__ - - // - EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const&); - + +#ifndef __CUDACC__ + + // + EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const &); + // will call dealloation for Product thru ~Product ~EcalLaserAPDPNRatiosGPU() = default; - + // get device pointers - Product const& getProduct(cudaStream_t) const; - - // + Product const &getProduct(cudaStream_t) const; + + // static std::string name() { return std::string{"ecalLaserAPDPNRatiosGPU"}; } - + private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee std::vector > p1_; std::vector > p2_; std::vector > p3_; - + std::vector > t1_; std::vector > t2_; std::vector > t3_; - - cms::cuda::ESProduct product_; - - #endif -}; + cms::cuda::ESProduct product_; #endif +}; - - - +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h 
b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h index 191c78a7c4617..6e48d50f217f3 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h @@ -12,33 +12,32 @@ class EcalLaserAPDPNRatiosRefGPU { public: struct Product { ~Product(); - float *values = nullptr; + float* values = nullptr; }; - - #ifndef __CUDACC__ - // + +#ifndef __CUDACC__ + // EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const&); - + // will call dealloation for Product thru ~Product ~EcalLaserAPDPNRatiosRefGPU() = default; - + // get device pointers Product const& getProduct(cudaStream_t) const; - + // TODO: do this centrally // get offset for hashes. equals number of barrel items uint32_t getOffset() const { return valuesEB_.size(); } - - // + + // static std::string name() { return std::string{"ecalLaserAPDPNRatiosRefGPU"}; } - + private: std::vector const& valuesEB_; std::vector const& valuesEE_; - + cms::cuda::ESProduct product_; - #endif +#endif }; - #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h index ac97e6c514bac..d787c5700cd7e 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h @@ -12,33 +12,32 @@ class EcalLaserAlphasGPU { public: struct Product { ~Product(); - float *values = nullptr; + float* values = nullptr; }; - - #ifndef __CUDACC__ - // + +#ifndef __CUDACC__ + // EcalLaserAlphasGPU(EcalLaserAlphas const&); - + // will call dealloation for Product thru ~Product ~EcalLaserAlphasGPU() = default; - + // get device pointers Product const& getProduct(cudaStream_t) const; - + // TODO: do this centrally // get offset for hashes. 
equals number of barrel items uint32_t getOffset() const { return valuesEB_.size(); } - - // + + // static std::string name() { return std::string{"ecalLaserAlphasGPU"}; } - + private: std::vector const& valuesEB_; std::vector const& valuesEE_; - + cms::cuda::ESProduct product_; - #endif +#endif }; - #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h index 41469bcf16c82..f2b395f5660fa 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h @@ -12,46 +12,42 @@ class EcalLinearCorrectionsGPU { public: struct Product { ~Product(); - float *p1=nullptr; - float *p2=nullptr; - float *p3=nullptr; - edm::TimeValue_t *t1=nullptr; - edm::TimeValue_t *t2=nullptr; - edm::TimeValue_t *t3=nullptr; + float *p1 = nullptr; + float *p2 = nullptr; + float *p3 = nullptr; + edm::TimeValue_t *t1 = nullptr; + edm::TimeValue_t *t2 = nullptr; + edm::TimeValue_t *t3 = nullptr; }; - - #ifndef __CUDACC__ - - // - EcalLinearCorrectionsGPU(EcalLinearCorrections const&); - + +#ifndef __CUDACC__ + + // + EcalLinearCorrectionsGPU(EcalLinearCorrections const &); + // will call dealloation for Product thru ~Product ~EcalLinearCorrectionsGPU() = default; - + // get device pointers - Product const& getProduct(cudaStream_t) const; - - // + Product const &getProduct(cudaStream_t) const; + + // static std::string name() { return std::string{"ecalLinearCorrectionsGPU"}; } - + private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee std::vector> p1_; std::vector> p2_; std::vector> p3_; - + std::vector> t1_; std::vector> t2_; std::vector> t3_; - - cms::cuda::ESProduct product_; - - #endif -}; + cms::cuda::ESProduct product_; #endif +}; - - +#endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h 
b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h index 8addc316f366d..3838a757cc2e1 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h @@ -12,32 +12,31 @@ class EcalRechitADCToGeVConstantGPU { public: struct Product { ~Product(); - float *adc2gev = nullptr; + float* adc2gev = nullptr; }; - - #ifndef __CUDACC__ - - // + +#ifndef __CUDACC__ + + // EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const&); - + // will call dealloation for Product thru ~Product ~EcalRechitADCToGeVConstantGPU() = default; - + // get device pointers Product const& getProduct(cudaStream_t) const; - - // + + // static std::string name() { return std::string{"ecalRechitADCToGeVConstantGPU"}; } - + private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee std::vector> adc2gev_; - + cms::cuda::ESProduct product_; - - #endif -}; +#endif +}; #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h index 2329b3752089d..bf3f0f600224e 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h @@ -12,32 +12,31 @@ class EcalRechitChannelStatusGPU { public: struct Product { ~Product(); - uint16_t *status = nullptr; + uint16_t* status = nullptr; }; - - #ifndef __CUDACC__ - - // + +#ifndef __CUDACC__ + + // EcalRechitChannelStatusGPU(EcalChannelStatus const&); - + // will call dealloation for Product thru ~Product ~EcalRechitChannelStatusGPU() = default; - + // get device pointers Product const& getProduct(cudaStream_t) const; - - // + + // static std::string name() { return std::string{"ecalRechitChannelStatusGPU"}; } - + private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee std::vector> 
status_; - + cms::cuda::ESProduct product_; - - #endif -}; +#endif +}; #endif diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu index 139c1c31f09a9..717a005a3dfb1 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationCommonKernels.cu @@ -85,7 +85,8 @@ namespace ecal { auto const did = DetId{dids[inputCh]}; auto const isBarrel = did.subdetId() == EcalBarrel; // TODO offset for ee, 0 for eb - auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); // // pulse shape template @@ -345,7 +346,8 @@ namespace ecal { bool tmp1 = hasSwitchToGain1[ch]; auto const did = DetId{dids[inputCh]}; auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); auto const G12SamplesCorrelation = isBarrel ? G12SamplesCorrelationEB : G12SamplesCorrelationEE; auto const* G6SamplesCorrelation = isBarrel ? G6SamplesCorrelationEB : G6SamplesCorrelationEE; auto const* G1SamplesCorrelation = isBarrel ? 
G1SamplesCorrelationEB : G1SamplesCorrelationEE; @@ -503,4 +505,3 @@ namespace ecal { } // namespace multifit } // namespace ecal - diff --git a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu index b12fa6fc1043f..a3f9cf71caaf6 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/AmplitudeComputationKernels.cu @@ -158,8 +158,9 @@ namespace ecal { auto const* dids = ch >= offsetForInputs ? dids_ee : dids_eb; auto const did = DetId{dids[inputCh]}; auto const isBarrel = did.subdetId() == EcalBarrel; - auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); - + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + // inits int iter = 0; int npassive = 0; diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc index 844a28d27fd8e..dec10cff57dd0 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalIntercalibConstantsGPU.cc @@ -3,41 +3,37 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalIntercalibConstantsGPU::EcalIntercalibConstantsGPU(EcalIntercalibConstants const& values) -: valuesEB_{values.barrelItems()} -, valuesEE_{values.endcapItems()} -{} +EcalIntercalibConstantsGPU::EcalIntercalibConstantsGPU(EcalIntercalibConstants const& values) + : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} EcalIntercalibConstantsGPU::Product::~Product() { // deallocation - cudaCheck( cudaFree(values) ); + cudaCheck(cudaFree(values)); } EcalIntercalibConstantsGPU::Product const& 
EcalIntercalibConstantsGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalIntercalibConstantsGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.values, - (this->valuesEB_.size() + this->valuesEE_.size()) * - sizeof(float)) ); - - // offset in floats, not bytes - auto const offset = this->valuesEB_.size(); - - // transfer - cudaCheck( cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalIntercalibConstantsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( + cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float))); + + // offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck(cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); + return product; } diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc index f54f7bd47c022..4aa92ea6750fe 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosGPU.cc @@ -3,107 +3,84 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" 
-EcalLaserAPDPNRatiosGPU::EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const& values) -: p1_(values.getLaserMap().size()) -, p2_(values.getLaserMap().size()) -, p3_(values.getLaserMap().size()) -, t1_(values.getTimeMap().size()) -, t2_(values.getTimeMap().size()) -, t3_(values.getTimeMap().size()) -{ - +EcalLaserAPDPNRatiosGPU::EcalLaserAPDPNRatiosGPU(EcalLaserAPDPNRatios const& values) + : p1_(values.getLaserMap().size()), + p2_(values.getLaserMap().size()), + p3_(values.getLaserMap().size()), + t1_(values.getTimeMap().size()), + t2_(values.getTimeMap().size()), + t3_(values.getTimeMap().size()) { // fill in eb // auto const& barrelValues = values.barrelItems(); - for (unsigned int i=0; i EcalLaserTimeStampMap; - for (unsigned int i=0; ip1_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.p2, - this->p2_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.p3, - this->p3_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.t1, - this->t1_.size() * sizeof(edm::TimeValue_t)) ); - cudaCheck( cudaMalloc((void**)&product.t2, - this->t2_.size() * sizeof(edm::TimeValue_t)) ); - cudaCheck( cudaMalloc((void**)&product.t3, - this->t3_.size() * sizeof(edm::TimeValue_t)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.p1, - this->p1_.data(), - this->p1_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.p2, - this->p2_.data(), - this->p2_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.p3, - this->p3_.data(), - this->p3_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.t1, - this->t1_.data(), - this->t1_.size() * sizeof(edm::TimeValue_t), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.t2, - this->t2_.data(), - this->t2_.size() * sizeof(edm::TimeValue_t), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( 
cudaMemcpyAsync(product.t3, - this->t3_.data(), - this->t3_.size() * sizeof(edm::TimeValue_t), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; - } - - TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosGPU); - \ No newline at end of file +EcalLaserAPDPNRatiosGPU::Product const& EcalLaserAPDPNRatiosGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalLaserAPDPNRatiosGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.p1, this->p1_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.p2, this->p2_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.p3, this->p3_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.t1, this->t1_.size() * sizeof(edm::TimeValue_t))); + cudaCheck(cudaMalloc((void**)&product.t2, this->t2_.size() * sizeof(edm::TimeValue_t))); + cudaCheck(cudaMalloc((void**)&product.t3, this->t3_.size() * sizeof(edm::TimeValue_t))); + // transfer + cudaCheck(cudaMemcpyAsync( + product.p1, this->p1_.data(), this->p1_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync( + product.p2, this->p2_.data(), this->p2_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync( + product.p3, this->p3_.data(), this->p3_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync(product.t1, + this->t1_.data(), + this->t1_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.t2, + this->t2_.data(), + this->t2_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.t3, + this->t3_.data(), + this->t3_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalLaserAPDPNRatiosGPU); diff --git 
a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc index c4c07361a8535..8f77cf48fe1d1 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAPDPNRatiosRefGPU.cc @@ -3,41 +3,37 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalLaserAPDPNRatiosRefGPU::EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const& values) -: valuesEB_{values.barrelItems()} -, valuesEE_{values.endcapItems()} -{} +EcalLaserAPDPNRatiosRefGPU::EcalLaserAPDPNRatiosRefGPU(EcalLaserAPDPNRatiosRef const& values) + : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} EcalLaserAPDPNRatiosRefGPU::Product::~Product() { // deallocation - cudaCheck( cudaFree(values) ); + cudaCheck(cudaFree(values)); } EcalLaserAPDPNRatiosRefGPU::Product const& EcalLaserAPDPNRatiosRefGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalLaserAPDPNRatiosRefGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.values, - (this->valuesEB_.size() + this->valuesEE_.size()) * - sizeof(float)) ); - - // offset in floats, not bytes - auto const offset = this->valuesEB_.size(); - - // transfer - cudaCheck( cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.values + offset, - this->valuesEE_.data(), - this->valuesEE_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalLaserAPDPNRatiosRefGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( + cudaMalloc((void**)&product.values, (this->valuesEB_.size() + 
this->valuesEE_.size()) * sizeof(float))); + + // offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck(cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); + return product; } diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc index 24257fd8b547a..91de441bff683 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLaserAlphasGPU.cc @@ -3,41 +3,37 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalLaserAlphasGPU::EcalLaserAlphasGPU(EcalLaserAlphas const& values) -: valuesEB_{values.barrelItems()} -, valuesEE_{values.endcapItems()} -{} +EcalLaserAlphasGPU::EcalLaserAlphasGPU(EcalLaserAlphas const& values) + : valuesEB_{values.barrelItems()}, valuesEE_{values.endcapItems()} {} EcalLaserAlphasGPU::Product::~Product() { // deallocation - cudaCheck( cudaFree(values) ); + cudaCheck(cudaFree(values)); } EcalLaserAlphasGPU::Product const& EcalLaserAlphasGPU::getProduct(cudaStream_t cudaStream) const { - auto const& product = product_.dataForCurrentDeviceAsync(cudaStream, - [this](EcalLaserAlphasGPU::Product& product, cudaStream_t cudaStream) { - // malloc - cudaCheck( cudaMalloc((void**)&product.values, - (this->valuesEB_.size() + this->valuesEE_.size()) * - sizeof(float)) ); - - // offset in floats, not bytes - auto const offset = this->valuesEB_.size(); - - // transfer - cudaCheck( cudaMemcpyAsync(product.values, - this->valuesEB_.data(), - this->valuesEB_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.values + offset, - 
this->valuesEE_.data(), - this->valuesEE_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalLaserAlphasGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck( + cudaMalloc((void**)&product.values, (this->valuesEB_.size() + this->valuesEE_.size()) * sizeof(float))); + + // offset in floats, not bytes + auto const offset = this->valuesEB_.size(); + + // transfer + cudaCheck(cudaMemcpyAsync(product.values, + this->valuesEB_.data(), + this->valuesEB_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.values + offset, + this->valuesEE_.data(), + this->valuesEE_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); + return product; } diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc index 2dedb1074bee7..20946028aba90 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc @@ -3,100 +3,78 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalLinearCorrectionsGPU::EcalLinearCorrectionsGPU(EcalLinearCorrections const& values) -: p1_(values.getValueMap().size()) -, p2_(values.getValueMap().size()) -, p3_(values.getValueMap().size()) -, t1_(values.getTimeMap().size()) -, t2_(values.getTimeMap().size()) -, t3_(values.getTimeMap().size()) -{ - +EcalLinearCorrectionsGPU::EcalLinearCorrectionsGPU(EcalLinearCorrections const& values) + : p1_(values.getValueMap().size()), + p2_(values.getValueMap().size()), + p3_(values.getValueMap().size()), + t1_(values.getTimeMap().size()), + t2_(values.getTimeMap().size()), + t3_(values.getTimeMap().size()) { // fill in eb - for (unsigned int i=0; i EcalLaserTimeStampMap; - for (unsigned int i=0; ip1_.size() * 
sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.p2, - this->p2_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.p3, - this->p3_.size() * sizeof(float)) ); - cudaCheck( cudaMalloc((void**)&product.t1, - this->t1_.size() * sizeof(edm::TimeValue_t)) ); - cudaCheck( cudaMalloc((void**)&product.t2, - this->t2_.size() * sizeof(edm::TimeValue_t)) ); - cudaCheck( cudaMalloc((void**)&product.t3, - this->t3_.size() * sizeof(edm::TimeValue_t)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.p1, - this->p1_.data(), - this->p1_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.p2, - this->p2_.data(), - this->p2_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.p3, - this->p3_.data(), - this->p3_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.t1, - this->t1_.data(), - this->t1_.size() * sizeof(edm::TimeValue_t), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.t2, - this->t2_.data(), - this->t2_.size() * sizeof(edm::TimeValue_t), - cudaMemcpyHostToDevice, - cudaStream) ); - cudaCheck( cudaMemcpyAsync(product.t3, - this->t3_.data(), - this->t3_.size() * sizeof(edm::TimeValue_t), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; - } - - TYPELOOKUP_DATA_REG(EcalLinearCorrectionsGPU); - \ No newline at end of file +EcalLinearCorrectionsGPU::Product const& EcalLinearCorrectionsGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](EcalLinearCorrectionsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.p1, this->p1_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.p2, this->p2_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.p3, this->p3_.size() * sizeof(float))); + 
cudaCheck(cudaMalloc((void**)&product.t1, this->t1_.size() * sizeof(edm::TimeValue_t))); + cudaCheck(cudaMalloc((void**)&product.t2, this->t2_.size() * sizeof(edm::TimeValue_t))); + cudaCheck(cudaMalloc((void**)&product.t3, this->t3_.size() * sizeof(edm::TimeValue_t))); + // transfer + cudaCheck(cudaMemcpyAsync( + product.p1, this->p1_.data(), this->p1_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync( + product.p2, this->p2_.data(), this->p2_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync( + product.p3, this->p3_.data(), this->p3_.size() * sizeof(float), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync(product.t1, + this->t1_.data(), + this->t1_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.t2, + this->t2_.data(), + this->t2_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.t3, + this->t3_.data(), + this->t3_.size() * sizeof(edm::TimeValue_t), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalLinearCorrectionsGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index 792b422cefd6f..54c376214c4c6 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -9,56 +9,49 @@ // #include "EcalRecHitBuilderKernels.h" - #include "KernelHelpers.h" - - - namespace ecal { namespace rechit { - - + // uncalibrecHit flags enum UncalibRecHitFlags { - kGood=-1, // channel is good (mutually exclusive with other states) setFlagBit(kGood) reset flags_ to zero - kPoorReco, // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.) 
- kSaturated, // saturated channel - kOutOfTime, // channel out of time - kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation - kHasSwitchToGain6, // at least one data frame is in G6 - kHasSwitchToGain1 // at least one data frame is in G1 + kGood = -1, // channel is good (mutually exclusive with other states) setFlagBit(kGood) reset flags_ to zero + kPoorReco, // channel has been badly reconstructed (e.g. bad shape, bad chi2 etc.) + kSaturated, // saturated channel + kOutOfTime, // channel out of time + kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation + kHasSwitchToGain6, // at least one data frame is in G6 + kHasSwitchToGain1 // at least one data frame is in G1 }; - - + // recHit flags - enum RecHitFlags { - RecHitFlags_kGood=0, // channel ok, the energy and time measurement are reliable - RecHitFlags_kPoorReco, // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2) - RecHitFlags_kOutOfTime, // the energy is available from the UncalibRecHit (sync reco), but the event is out of time - RecHitFlags_kFaultyHardware, // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. 
noisy) - RecHitFlags_kNoisy, // the channel is very noisy - RecHitFlags_kPoorCalib, // the energy is available from the UncalibRecHit, but the calibration of the channel is poor - RecHitFlags_kSaturated, // saturated channel (recovery not tried) - RecHitFlags_kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation - RecHitFlags_kNeighboursRecovered, // saturated/isolated dead: energy estimated from neighbours - RecHitFlags_kTowerRecovered, // channel in TT with no data link, info retrieved from Trigger Primitive - RecHitFlags_kDead, // channel is dead and any recovery fails - RecHitFlags_kKilled, // MC only flag: the channel is killed in the real detector - RecHitFlags_kTPSaturated, // the channel is in a region with saturated TP - RecHitFlags_kL1SpikeFlag, // the channel is in a region with TP with sFGVB = 0 - RecHitFlags_kWeird, // the signal is believed to originate from an anomalous deposit (spike) - RecHitFlags_kDiWeird, // the signal is anomalous, and neighbors another anomalous signal - RecHitFlags_kHasSwitchToGain6, // at least one data frame is in G6 - RecHitFlags_kHasSwitchToGain1, // at least one data frame is in G1 + enum RecHitFlags { + RecHitFlags_kGood = 0, // channel ok, the energy and time measurement are reliable + RecHitFlags_kPoorReco, // the energy is available from the UncalibRecHit, but approximate (bad shape, large chi2) + RecHitFlags_kOutOfTime, // the energy is available from the UncalibRecHit (sync reco), but the event is out of time + RecHitFlags_kFaultyHardware, // The energy is available from the UncalibRecHit, channel is faulty at some hardware level (e.g. 
noisy) + RecHitFlags_kNoisy, // the channel is very noisy + RecHitFlags_kPoorCalib, // the energy is available from the UncalibRecHit, but the calibration of the channel is poor + RecHitFlags_kSaturated, // saturated channel (recovery not tried) + RecHitFlags_kLeadingEdgeRecovered, // saturated channel: energy estimated from the leading edge before saturation + RecHitFlags_kNeighboursRecovered, // saturated/isolated dead: energy estimated from neighbours + RecHitFlags_kTowerRecovered, // channel in TT with no data link, info retrieved from Trigger Primitive + RecHitFlags_kDead, // channel is dead and any recovery fails + RecHitFlags_kKilled, // MC only flag: the channel is killed in the real detector + RecHitFlags_kTPSaturated, // the channel is in a region with saturated TP + RecHitFlags_kL1SpikeFlag, // the channel is in a region with TP with sFGVB = 0 + RecHitFlags_kWeird, // the signal is believed to originate from an anomalous deposit (spike) + RecHitFlags_kDiWeird, // the signal is anomalous, and neighbors another anomalous signal + RecHitFlags_kHasSwitchToGain6, // at least one data frame is in G6 + RecHitFlags_kHasSwitchToGain1, // at least one data frame is in G1 // - RecHitFlags_kUnknown // to ease the interface with functions returning flags. + RecHitFlags_kUnknown // to ease the interface with functions returning flags. 
}; - - + // status code enum EcalChannelStatusCode_Code { - kOk=0, + kOk = 0, kDAC, kNoLaser, kNoisy, @@ -72,143 +65,118 @@ namespace ecal { kNonRespondingIsolated, kDeadVFE, kDeadFE, - kNoDataNoTP + kNoDataNoTP }; - - - - - - __global__ - void kernel_create_ecal_rehit( - // configuration - int const* ChannelStatusToBeExcluded, - uint32_t ChannelStatusToBeExcludedSize, - bool const killDeadChannels, - bool const recoverEBIsolatedChannels, - bool const recoverEEIsolatedChannels, - bool const recoverEBVFE, - bool const recoverEEVFE, - bool const recoverEBFE, - bool const recoverEEFE, - float const EBLaserMIN, - float const EELaserMIN, - float const EBLaserMAX, - float const EELaserMAX, - // for flags setting - int const* expanded_v_DB_reco_flags, // FIXME AM: to be checked - uint32_t const* expanded_Sizes_v_DB_reco_flags, - uint32_t const* expanded_flagbit_v_DB_reco_flags, - uint32_t expanded_v_DB_reco_flagsSize, - uint32_t flagmask, - // conditions - float const* adc2gev, - float const* intercalib, - uint16_t const* status, - float const* apdpnrefs, - float const* alphas, - // input for transparency corrections - float const* p1, - float const* p2, - float const* p3, - edm::TimeValue_t const* t1, - edm::TimeValue_t const* t2, - edm::TimeValue_t const* t3, - // input for linear corrections - float const* lp1, - float const* lp2, - float const* lp3, - edm::TimeValue_t const* lt1, - edm::TimeValue_t const* lt2, - edm::TimeValue_t const* lt3, - // time, used for time dependent corrections - edm::TimeValue_t const event_time, - // input - uint32_t const* did_eb, - uint32_t const* did_ee, - ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts - ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts - ::ecal::reco::StorageScalarType const* time_eb, - ::ecal::reco::StorageScalarType const* time_ee, - ::ecal::reco::StorageScalarType const* chi2_eb, - ::ecal::reco::StorageScalarType const* chi2_ee, - uint32_t const* flags_eb, - uint32_t const* 
flags_ee, - // output - uint32_t *did, - ::ecal::reco::StorageScalarType* energy, // in energy [GeV] - ::ecal::reco::StorageScalarType* time, - ::ecal::reco::StorageScalarType* chi2, - uint32_t* flagBits, - uint32_t* extra, - // other - int const nchannels, - uint32_t const nChannelsBarrel, - uint32_t const offsetForHashes - ) { - - - // + + __global__ void kernel_create_ecal_rehit( + // configuration + int const* ChannelStatusToBeExcluded, + uint32_t ChannelStatusToBeExcludedSize, + bool const killDeadChannels, + bool const recoverEBIsolatedChannels, + bool const recoverEEIsolatedChannels, + bool const recoverEBVFE, + bool const recoverEEVFE, + bool const recoverEBFE, + bool const recoverEEFE, + float const EBLaserMIN, + float const EELaserMIN, + float const EBLaserMAX, + float const EELaserMAX, + // for flags setting + int const* expanded_v_DB_reco_flags, // FIXME AM: to be checked + uint32_t const* expanded_Sizes_v_DB_reco_flags, + uint32_t const* expanded_flagbit_v_DB_reco_flags, + uint32_t expanded_v_DB_reco_flagsSize, + uint32_t flagmask, + // conditions + float const* adc2gev, + float const* intercalib, + uint16_t const* status, + float const* apdpnrefs, + float const* alphas, + // input for transparency corrections + float const* p1, + float const* p2, + float const* p3, + edm::TimeValue_t const* t1, + edm::TimeValue_t const* t2, + edm::TimeValue_t const* t3, + // input for linear corrections + float const* lp1, + float const* lp2, + float const* lp3, + edm::TimeValue_t const* lt1, + edm::TimeValue_t const* lt2, + edm::TimeValue_t const* lt3, + // time, used for time dependent corrections + edm::TimeValue_t const event_time, + // input + uint32_t const* did_eb, + uint32_t const* did_ee, + ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts + ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts + ::ecal::reco::StorageScalarType const* time_eb, + ::ecal::reco::StorageScalarType const* time_ee, + 
::ecal::reco::StorageScalarType const* chi2_eb, + ::ecal::reco::StorageScalarType const* chi2_ee, + uint32_t const* flags_eb, + uint32_t const* flags_ee, + // output + uint32_t* did, + ::ecal::reco::StorageScalarType* energy, // in energy [GeV] + ::ecal::reco::StorageScalarType* time, + ::ecal::reco::StorageScalarType* chi2, + uint32_t* flagBits, + uint32_t* extra, + // other + int const nchannels, + uint32_t const nChannelsBarrel, + uint32_t const offsetForHashes) { + // // NB: energy "type_wrapper::type" most likely std::vector - // - - for (int ch = threadIdx.x + blockDim.x*blockIdx.x; ch < nchannels; ch += blockDim.x*gridDim.x) { - -// int ch = threadIdx.x + blockDim.x*blockIdx.x; - -// if (ch < nchannels) { - + // + + for (int ch = threadIdx.x + blockDim.x * blockIdx.x; ch < nchannels; ch += blockDim.x * gridDim.x) { + // int ch = threadIdx.x + blockDim.x*blockIdx.x; + + // if (ch < nchannels) { + bool isEndcap = (ch >= nChannelsBarrel); - - int const inputCh = isEndcap - ? ch - nChannelsBarrel - : ch; - - uint32_t const * didCh = isEndcap - ? did_ee - : did_eb; - + + int const inputCh = isEndcap ? ch - nChannelsBarrel : ch; + + uint32_t const* didCh = isEndcap ? did_ee : did_eb; + // only two values, EB or EE // AM : FIXME : why not using "isBarrel" ? isBarrel ? adc2gev[0] : adc2gev[1] - float adc2gev_to_use = isEndcap - ? adc2gev[1] // ee - : adc2gev[0]; // eb - - + float adc2gev_to_use = isEndcap ? adc2gev[1] // ee + : adc2gev[0]; // eb + // first EB and then EE - - ::ecal::reco::StorageScalarType const* amplitude = isEndcap - ? amplitude_ee - : amplitude_eb; - - ::ecal::reco::StorageScalarType const* time_in = isEndcap - ? time_ee - : time_eb; - - ::ecal::reco::StorageScalarType const* chi2_in = isEndcap - ? chi2_ee - : chi2_eb; - - uint32_t const* flags_in = isEndcap - ? flags_ee - : flags_eb; - + + ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb; + + ::ecal::reco::StorageScalarType const* time_in = isEndcap ? 
time_ee : time_eb; + + ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? chi2_ee : chi2_eb; + + uint32_t const* flags_in = isEndcap ? flags_ee : flags_eb; + // simple copy did[ch] = didCh[inputCh]; - + auto const did_to_use = DetId{didCh[inputCh]}; - + auto const isBarrel = did_to_use.subdetId() == EcalBarrel; - auto const hashedId = isBarrel - ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId()) - : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId()); - + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did_to_use.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did_to_use.rawId()); + float const intercalib_to_use = intercalib[hashedId]; - - + // get laser coefficient float lasercalib = 1.; - + // // AM: ideas // @@ -217,24 +185,22 @@ namespace ecal { // Then only if the LS is different, update the laser correction // The variation within a LS is not worth pursuing (<< 0.1% !!) // and below the precision we can claim on the laser corrections (right?). - // This will save quite some time (also for the CPU version?) + // This will save quite some time (also for the CPU version?) 
// - + int iLM = 1; - + if (isBarrel) { - iLM = ecal::reconstruction::laser_monitoring_region_EB (did_to_use.rawId()); - } - else { - iLM = ecal::reconstruction::laser_monitoring_region_EE (did_to_use.rawId()); + iLM = ecal::reconstruction::laser_monitoring_region_EB(did_to_use.rawId()); + } else { + iLM = ecal::reconstruction::laser_monitoring_region_EE(did_to_use.rawId()); } - - + long long t_i = 0, t_f = 0; float p_i = 0, p_f = 0; long long lt_i = 0, lt_f = 0; float lp_i = 0, lp_f = 0; - + // laser if (event_time >= t1[iLM - 1] && event_time < t2[iLM - 1]) { t_i = t1[iLM - 1]; @@ -251,15 +217,14 @@ namespace ecal { t_f = t2[iLM - 1]; p_i = p1[hashedId]; p_f = p2[hashedId]; - + } else if (event_time > t3[iLM - 1]) { t_i = t2[iLM - 1]; t_f = t3[iLM - 1]; p_i = p2[hashedId]; p_f = p3[hashedId]; } - - + // linear corrections if (event_time >= lt1[iLM - 1] && event_time < lt2[iLM - 1]) { lt_i = lt1[iLM - 1]; @@ -276,26 +241,27 @@ namespace ecal { lt_f = lt2[iLM - 1]; lp_i = lp1[hashedId]; lp_f = lp2[hashedId]; - + } else if (event_time > lt3[iLM - 1]) { lt_i = lt2[iLM - 1]; lt_f = lt3[iLM - 1]; lp_i = lp2[hashedId]; lp_f = lp3[hashedId]; } - - - // apdpnref and alpha + + // apdpnref and alpha float apdpnref = apdpnrefs[hashedId]; float alpha = alphas[hashedId]; - + // now calculate transparency correction if (apdpnref != 0 && (t_i - t_f) != 0 && (lt_i - lt_f) != 0) { long long tt = event_time; // never subtract two unsigned! 
- float interpolatedLaserResponse = p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i)); - - float interpolatedLinearResponse = lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i)); // FIXED BY FC - + float interpolatedLaserResponse = + p_i / apdpnref + float(tt - t_i) * (p_f - p_i) / (apdpnref * float(t_f - t_i)); + + float interpolatedLinearResponse = + lp_i / apdpnref + float(tt - lt_i) * (lp_f - lp_i) / (apdpnref * float(lt_f - lt_i)); // FIXED BY FC + if (interpolatedLinearResponse > 2.f || interpolatedLinearResponse < 0.1f) { interpolatedLinearResponse = 1.f; } @@ -303,302 +269,279 @@ namespace ecal { // AM : how the heck is it possible? // interpolatedLaserResponse = 0.0001; lasercalib = 1.; - - } - else { - + + } else { float interpolatedTransparencyResponse = interpolatedLaserResponse / interpolatedLinearResponse; - + // ... and now this: - lasercalib = 1.f / ( std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse); - + lasercalib = 1.f / (std::pow(interpolatedTransparencyResponse, alpha) * interpolatedLinearResponse); } } - + // // Check for channels to be excluded from reconstruction - // + // // // Default energy? 
Not to be updated if "ChannelStatusToBeExcluded" // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat" // - energy[ch] = -1; //---- AM: default, un-physical, ok - + energy[ch] = -1; //---- AM: default, un-physical, ok + // - static const int chStatusMask = 0x1F; + static const int chStatusMask = 0x1F; // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same - int dbstatus = EcalChannelStatusCode_Code( (status[hashedId]) & chStatusMask ); + int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask); if (ChannelStatusToBeExcludedSize != 0) { - for (int ich_to_check = 0; ich_to_check recHit flagbits and return the apporpriate flagbit word - + // // AM: get the smaller "flagbit_counter" with match // - + uint32_t temporary_flagBits = 0; - + int iterator_flags = 0; bool need_to_exit = false; int flagbit_counter = 0; while (!need_to_exit) { iterator_flags = 0; - for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) { + for (unsigned int i = 0; i != expanded_v_DB_reco_flagsSize; ++i) { // check the correct "flagbit" if (expanded_flagbit_v_DB_reco_flags[i] == flagbit_counter) { - for (unsigned int j = 0; j < expanded_Sizes_v_DB_reco_flags[i]; j++) { - - if ( expanded_v_DB_reco_flags[iterator_flags] == dbstatus ) { - temporary_flagBits = 0x1 << expanded_flagbit_v_DB_reco_flags[i]; + if (expanded_v_DB_reco_flags[iterator_flags] == dbstatus) { + temporary_flagBits = 0x1 << expanded_flagbit_v_DB_reco_flags[i]; need_to_exit = true; - break; // also from the big loop!!! - + break; // also from the big loop!!! 
} iterator_flags++; } - } - else { + } else { // if not, got to the next bunch directly iterator_flags += expanded_Sizes_v_DB_reco_flags[i]; } - + if (need_to_exit) { break; } - } - flagbit_counter+=1; + flagbit_counter += 1; } - - - if ( (flagmask & temporary_flagBits) && killDeadChannels ) { + + if ((flagmask & temporary_flagBits) && killDeadChannels) { return; } - - + // flagBits[ch] = temporary_flagBits; - + // // multiply the adc counts with factors to get GeV // - + // energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use ; energy[ch] = amplitude[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib; - + // Time is not saved so far, FIXME // time[ch] = time_in[inputCh]; - - - if (chi2_in[inputCh] > 64) chi2[ch] = 64; - else chi2[ch] = chi2_in[inputCh]; - - + + if (chi2_in[inputCh] > 64) + chi2[ch] = 64; + else + chi2[ch] = chi2_in[inputCh]; + // NB: calculate the "flagBits extra" --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ... extra[ch] = 0; - + // // extra packing ... // - + uint32_t offset; uint32_t width; uint32_t value; - + float chi2_temp = chi2[ch]; - if (chi2_temp > 64) chi2_temp = 64; + if (chi2_temp > 64) + chi2_temp = 64; // use 7 bits - uint32_t rawChi2 = lround(chi2_temp / 64. * ((1<<7)-1)); - + uint32_t rawChi2 = lround(chi2_temp / 64. * ((1 << 7) - 1)); + offset = 0; width = 7; - value = 0; - + value = 0; + uint32_t mask = ((1 << width) - 1) << offset; value &= ~mask; value |= (rawChi2 & ((1U << width) - 1)) << offset; - + // extra[ch] = value; - // - + // + // rawEnergy is actually "error" !!! 
uint32_t rawEnergy = 0; - - - // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA + + // AM: FIXME: this is not propagated currently to the uncalibrecHit collection SOA // if you want to store this in "extra", we need first to add it to the uncalibrecHit results // then it will be something like the following // amplitudeError[inputCh] * adc2gev_to_use * intercalib_to_use * lasercalib - // - // - - float amplitudeError_ch = 0. ; // amplitudeError[ch]; - + // + // + + float amplitudeError_ch = 0.; // amplitudeError[ch]; + if (amplitudeError_ch > 0.001) { // uint16_t exponent = getPower10(amplitudeError_ch); - - static constexpr float p10[] = {1.e-2f,1.e-1f,1.f,1.e1f,1.e2f,1.e3f,1.e4f,1.e5f,1.e6f}; - int b = amplitudeError_ch - // + // + // uncalibRH.isSaturated() ---> + // // bool EcalUncalibratedRecHit::isSaturated() const { // return EcalUncalibratedRecHit::checkFlag(kSaturated); // } // // - - if ( flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kSaturated) ) ) { - flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated)); + + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kSaturated))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kSaturated)); good = false; } - - if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kOutOfTime) ) ) { - flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime)); + + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kOutOfTime))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kOutOfTime)); good = false; } - if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kPoorReco) ) ) { - flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco)); + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kPoorReco))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorReco)); good = false; } - if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain6) ) ) { - flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6)); + if (flags_in[inputCh] & 
(0x1 << (UncalibRecHitFlags::kHasSwitchToGain6))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain6)); } - if (flags_in[inputCh] & ( 0x1 << (UncalibRecHitFlags::kHasSwitchToGain1) ) ) { - flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1)); + if (flags_in[inputCh] & (0x1 << (UncalibRecHitFlags::kHasSwitchToGain1))) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kHasSwitchToGain1)); } - - + if (good) { flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kGood)); } - - if (isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) { - flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); - + + if (isBarrel && (lasercalib < EBLaserMIN || lasercalib > EBLaserMAX)) { + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); } if (!isBarrel && (lasercalib < EELaserMIN || lasercalib > EELaserMAX)) { - flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); + flagBits[ch] |= (0x1 << (RecHitFlags::RecHitFlags_kPoorCalib)); } - - - + // recover, killing, and other stuff - - // - // Structure: - // EB - // EE - // - // - // - single MVA - // - democratic sharing - // - kill all the other cases - // - + + // + // Structure: + // EB + // EE + // + // + // - single MVA + // - democratic sharing + // - kill all the other cases + // + bool is_Single = false; - bool is_FE = false; - bool is_VFE = false; - - bool is_recoverable = false; // DetIdToBeRecovered - - if ( dbstatus == 10 || dbstatus == 11 || dbstatus == 12 ) { + bool is_FE = false; + bool is_VFE = false; + + bool is_recoverable = false; // DetIdToBeRecovered + + if (dbstatus == 10 || dbstatus == 11 || dbstatus == 12) { is_recoverable = true; } - - + if (is_recoverable) { if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) { is_VFE = true; - } - else if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) { + } else if (dbstatus == EcalChannelStatusCode_Code::kDeadVFE) { is_FE = true; - } - else { + } else { is_Single = true; } - - + // EB if 
(isBarrel) { - if (is_Single || is_FE || is_VFE) { + if (is_Single || is_FE || is_VFE) { // single MVA - if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels) ) { - - + if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) { } // decmocratic sharing - else if (is_FE && (recoverEBFE || !killDeadChannels) ) { - - + else if (is_FE && (recoverEBFE || !killDeadChannels)) { } // kill all the other cases else { @@ -607,20 +550,17 @@ namespace ecal { } } // EE - else { - if (is_Single || is_FE || is_VFE) { + else { + if (is_Single || is_FE || is_VFE) { // single MVA - if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels) ) { - - + if (is_Single && (recoverEBIsolatedChannels || !killDeadChannels)) { } // decmocratic sharing - else if (is_FE && (recoverEBFE || !killDeadChannels) ) { - - // - // Code is definitely too long ... - // - + else if (is_FE && (recoverEBFE || !killDeadChannels)) { + // + // Code is definitely too long ... + // + } // kill all the other cases else { @@ -628,115 +568,102 @@ namespace ecal { } } } - - } - - - } // end channel - + } + + } // end channel } - - - + // host version, to be called by the plugin - void create_ecal_rehit( - EventInputDataGPU const& eventInputGPU, - EventOutputDataGPU& eventOutputGPU, - // eventDataForScratchGPU_, - ConditionsProducts const& conditions, - ConfigurationParameters const& configParameters, - uint32_t const nChannelsBarrel, - edm::TimeValue_t const event_time, - cudaStream_t cudaStream - ){ - - int nchannels = eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size ; - -// unsigned int nchannels_per_block = 32; + void create_ecal_rehit(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + // eventDataForScratchGPU_, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + uint32_t const nChannelsBarrel, + edm::TimeValue_t const event_time, + cudaStream_t cudaStream) { + int nchannels = 
eventInputGPU.ebUncalibRecHits.size + eventInputGPU.eeUncalibRecHits.size; + + // unsigned int nchannels_per_block = 32; unsigned int nchannels_per_block = 16; unsigned int threads_min = nchannels_per_block; - unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min; // TEST : to be optimized (AM) - - // + unsigned int blocks_min = (nchannels + threads_min - 1) / threads_min; // TEST : to be optimized (AM) + + // // kernel create rechit // - -// auto const nbytesShared = 2 * threads_min * MapSymM::total * sizeof(DataType); - - kernel_create_ecal_rehit <<< blocks_min, threads_min, 0, cudaStream >>> ( -// kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> ( -// kernel_create_ecal_rehit <<< blocks_min, threads_min >>> ( - // configuration - configParameters.ChannelStatusToBeExcluded, - configParameters.ChannelStatusToBeExcludedSize, - configParameters.killDeadChannels, - configParameters.recoverEBIsolatedChannels, - configParameters.recoverEEIsolatedChannels, - configParameters.recoverEBVFE, - configParameters.recoverEEVFE, - configParameters.recoverEBFE, - configParameters.recoverEEFE, - configParameters.EBLaserMIN, - configParameters.EELaserMIN, - configParameters.EBLaserMAX, - configParameters.EELaserMAX, - // for flags setting - configParameters.expanded_v_DB_reco_flags, - configParameters.expanded_Sizes_v_DB_reco_flags, - configParameters.expanded_flagbit_v_DB_reco_flags, - configParameters.expanded_v_DB_reco_flagsSize, - configParameters.flagmask, - // conditions - conditions.ADCToGeV.adc2gev, - conditions.Intercalib.values, - conditions.ChannelStatus.status, - conditions.LaserAPDPNRatiosRef.values, - conditions.LaserAlphas.values, - // input for transparency corrections - conditions.LaserAPDPNRatios.p1, - conditions.LaserAPDPNRatios.p2, - conditions.LaserAPDPNRatios.p3, - conditions.LaserAPDPNRatios.t1, - conditions.LaserAPDPNRatios.t2, - conditions.LaserAPDPNRatios.t3, - // input for linear corrections - 
conditions.LinearCorrections.p1, - conditions.LinearCorrections.p2, - conditions.LinearCorrections.p3, - conditions.LinearCorrections.t1, - conditions.LinearCorrections.t2, - conditions.LinearCorrections.t3, - // time, used for time dependent corrections - event_time, - // input - eventInputGPU.ebUncalibRecHits.did, - eventInputGPU.eeUncalibRecHits.did, - eventInputGPU.ebUncalibRecHits.amplitude, - eventInputGPU.eeUncalibRecHits.amplitude, - eventInputGPU.ebUncalibRecHits.jitter, - eventInputGPU.eeUncalibRecHits.jitter, - eventInputGPU.ebUncalibRecHits.chi2, - eventInputGPU.eeUncalibRecHits.chi2, - eventInputGPU.ebUncalibRecHits.flags, - eventInputGPU.eeUncalibRecHits.flags, - // output - eventOutputGPU.did, - eventOutputGPU.energy, - eventOutputGPU.time, - eventOutputGPU.chi2, - eventOutputGPU.flagBits, - eventOutputGPU.extra, - // other - nchannels, - nChannelsBarrel, - conditions.offsetForHashes - ); - - - - } - - - } - -} + + // auto const nbytesShared = 2 * threads_min * MapSymM::total * sizeof(DataType); + + kernel_create_ecal_rehit<<>>( + // kernel_create_ecal_rehit <<< blocks_min, threads_min, nbytesShared, cudaStream >>> ( + // kernel_create_ecal_rehit <<< blocks_min, threads_min >>> ( + // configuration + configParameters.ChannelStatusToBeExcluded, + configParameters.ChannelStatusToBeExcludedSize, + configParameters.killDeadChannels, + configParameters.recoverEBIsolatedChannels, + configParameters.recoverEEIsolatedChannels, + configParameters.recoverEBVFE, + configParameters.recoverEEVFE, + configParameters.recoverEBFE, + configParameters.recoverEEFE, + configParameters.EBLaserMIN, + configParameters.EELaserMIN, + configParameters.EBLaserMAX, + configParameters.EELaserMAX, + // for flags setting + configParameters.expanded_v_DB_reco_flags, + configParameters.expanded_Sizes_v_DB_reco_flags, + configParameters.expanded_flagbit_v_DB_reco_flags, + configParameters.expanded_v_DB_reco_flagsSize, + configParameters.flagmask, + // conditions + 
conditions.ADCToGeV.adc2gev, + conditions.Intercalib.values, + conditions.ChannelStatus.status, + conditions.LaserAPDPNRatiosRef.values, + conditions.LaserAlphas.values, + // input for transparency corrections + conditions.LaserAPDPNRatios.p1, + conditions.LaserAPDPNRatios.p2, + conditions.LaserAPDPNRatios.p3, + conditions.LaserAPDPNRatios.t1, + conditions.LaserAPDPNRatios.t2, + conditions.LaserAPDPNRatios.t3, + // input for linear corrections + conditions.LinearCorrections.p1, + conditions.LinearCorrections.p2, + conditions.LinearCorrections.p3, + conditions.LinearCorrections.t1, + conditions.LinearCorrections.t2, + conditions.LinearCorrections.t3, + // time, used for time dependent corrections + event_time, + // input + eventInputGPU.ebUncalibRecHits.did, + eventInputGPU.eeUncalibRecHits.did, + eventInputGPU.ebUncalibRecHits.amplitude, + eventInputGPU.eeUncalibRecHits.amplitude, + eventInputGPU.ebUncalibRecHits.jitter, + eventInputGPU.eeUncalibRecHits.jitter, + eventInputGPU.ebUncalibRecHits.chi2, + eventInputGPU.eeUncalibRecHits.chi2, + eventInputGPU.ebUncalibRecHits.flags, + eventInputGPU.eeUncalibRecHits.flags, + // output + eventOutputGPU.did, + eventOutputGPU.energy, + eventOutputGPU.time, + eventOutputGPU.chi2, + eventOutputGPU.flagBits, + eventOutputGPU.extra, + // other + nchannels, + nChannelsBarrel, + conditions.offsetForHashes); + } + + } // namespace rechit + +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h index 30bc589a9a5c2..f0816257eb61e 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.h @@ -10,89 +10,81 @@ #include "DataFormats/Provenance/interface/Timestamp.h" - -namespace ecal { +namespace ecal { namespace rechit { - - - __global__ - void kernel_create_ecal_rehit( - // configuration - int const* ChannelStatusToBeExcluded, - uint32_t 
ChannelStatusToBeExcludedSize, - bool killDeadChannels, - bool const recoverEBIsolatedChannels, - bool const recoverEEIsolatedChannels, - bool const recoverEBVFE, - bool const recoverEEVFE, - bool const recoverEBFE, - bool const recoverEEFE, - // for flags setting - int const* expanded_v_DB_reco_flags, - uint32_t const* expanded_Sizes_v_DB_reco_flags, - uint32_t const* expanded_flagbit_v_DB_reco_flags, - uint32_t expanded_v_DB_reco_flagsSize, - uint32_t flagmask, - // conditions - float const* adc2gev, - float const* intercalib, - uint16_t const* status, - float const* apdpnrefs, - float const* alphas, - // input for transparency corrections - float const* p1, - float const* p2, - float const* p3, - edm::TimeValue_t const* t1, - edm::TimeValue_t const* t2, - edm::TimeValue_t const* t3, - // input for linear corrections - float const* lp1, - float const* lp2, - float const* lp3, - edm::TimeValue_t const* lt1, - edm::TimeValue_t const* lt2, - edm::TimeValue_t const* lt3, - // time, used for time dependent corrections - edm::TimeValue_t const event_time, - // input - uint32_t const* did_eb, - uint32_t const* did_ee, - ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts - ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts - ::ecal::reco::StorageScalarType const* time_eb, - ::ecal::reco::StorageScalarType const* time_ee, - ::ecal::reco::StorageScalarType const* chi2_eb, - ::ecal::reco::StorageScalarType const* chi2_ee, - uint32_t const* flags_eb, - uint32_t const* flags_ee, - // output - uint32_t *did, - ::ecal::reco::StorageScalarType* energy, // in energy [GeV] - ::ecal::reco::StorageScalarType* time, - ::ecal::reco::StorageScalarType* chi2, - uint32_t* flagBits, - uint32_t* extra, - int const nchannels, - uint32_t const nChannelsBarrel, - uint32_t const offsetForHashes - ); - - + + __global__ void kernel_create_ecal_rehit( + // configuration + int const* ChannelStatusToBeExcluded, + uint32_t ChannelStatusToBeExcludedSize, + bool 
killDeadChannels, + bool const recoverEBIsolatedChannels, + bool const recoverEEIsolatedChannels, + bool const recoverEBVFE, + bool const recoverEEVFE, + bool const recoverEBFE, + bool const recoverEEFE, + // for flags setting + int const* expanded_v_DB_reco_flags, + uint32_t const* expanded_Sizes_v_DB_reco_flags, + uint32_t const* expanded_flagbit_v_DB_reco_flags, + uint32_t expanded_v_DB_reco_flagsSize, + uint32_t flagmask, + // conditions + float const* adc2gev, + float const* intercalib, + uint16_t const* status, + float const* apdpnrefs, + float const* alphas, + // input for transparency corrections + float const* p1, + float const* p2, + float const* p3, + edm::TimeValue_t const* t1, + edm::TimeValue_t const* t2, + edm::TimeValue_t const* t3, + // input for linear corrections + float const* lp1, + float const* lp2, + float const* lp3, + edm::TimeValue_t const* lt1, + edm::TimeValue_t const* lt2, + edm::TimeValue_t const* lt3, + // time, used for time dependent corrections + edm::TimeValue_t const event_time, + // input + uint32_t const* did_eb, + uint32_t const* did_ee, + ::ecal::reco::StorageScalarType const* amplitude_eb, // in adc counts + ::ecal::reco::StorageScalarType const* amplitude_ee, // in adc counts + ::ecal::reco::StorageScalarType const* time_eb, + ::ecal::reco::StorageScalarType const* time_ee, + ::ecal::reco::StorageScalarType const* chi2_eb, + ::ecal::reco::StorageScalarType const* chi2_ee, + uint32_t const* flags_eb, + uint32_t const* flags_ee, + // output + uint32_t* did, + ::ecal::reco::StorageScalarType* energy, // in energy [GeV] + ::ecal::reco::StorageScalarType* time, + ::ecal::reco::StorageScalarType* chi2, + uint32_t* flagBits, + uint32_t* extra, + int const nchannels, + uint32_t const nChannelsBarrel, + uint32_t const offsetForHashes); + // host version, to be called by the plugin - - void create_ecal_rehit( - EventInputDataGPU const& eventInputGPU, - EventOutputDataGPU& eventOutputGPU, - // eventDataForScratchGPU_, - 
ConditionsProducts const& conditions, - ConfigurationParameters const& configParameters, - uint32_t const nChannelsBarrel, - edm::TimeValue_t const event_time, - cudaStream_t cudaStream - ); - - } - -} + void create_ecal_rehit(EventInputDataGPU const& eventInputGPU, + EventOutputDataGPU& eventOutputGPU, + // eventDataForScratchGPU_, + ConditionsProducts const& conditions, + ConfigurationParameters const& configParameters, + uint32_t const nChannelsBarrel, + edm::TimeValue_t const event_time, + cudaStream_t cudaStream); + + } // namespace rechit + +} // namespace ecal diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc index 3824b0989f622..5f01068f95186 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitADCToGeVConstantGPU.cc @@ -3,36 +3,31 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalRechitADCToGeVConstantGPU::EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const& values) -: adc2gev_(2) // size is 2, one form EB and one for EE +EcalRechitADCToGeVConstantGPU::EcalRechitADCToGeVConstantGPU(EcalADCToGeVConstant const& values) + : adc2gev_(2) // size is 2, one form EB and one for EE { adc2gev_[0] = values.getEBValue(); - adc2gev_[1] = values.getEEValue(); + adc2gev_[1] = values.getEEValue(); } EcalRechitADCToGeVConstantGPU::Product::~Product() { // deallocation - cudaCheck( cudaFree(adc2gev) ); + cudaCheck(cudaFree(adc2gev)); } -EcalRechitADCToGeVConstantGPU::Product const& EcalRechitADCToGeVConstantGPU::getProduct( - cudaStream_t cudaStream) const -{ +EcalRechitADCToGeVConstantGPU::Product const& EcalRechitADCToGeVConstantGPU::getProduct(cudaStream_t cudaStream) const { auto const& product = product_.dataForCurrentDeviceAsync( - cudaStream, - [this](EcalRechitADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) { - 
// malloc - cudaCheck( cudaMalloc((void**)&product.adc2gev, - this->adc2gev_.size() * sizeof(float)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.adc2gev, - this->adc2gev_.data(), - this->adc2gev_.size() * sizeof(float), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - + cudaStream, [this](EcalRechitADCToGeVConstantGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.adc2gev, this->adc2gev_.size() * sizeof(float))); + // transfer + cudaCheck(cudaMemcpyAsync(product.adc2gev, + this->adc2gev_.data(), + this->adc2gev_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); + return product; } diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc index 7f38a23ec9168..1e6801fbd326a 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRechitChannelStatusGPU.cc @@ -3,47 +3,40 @@ #include "FWCore/Utilities/interface/typelookup.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) -: status_(values.size()) -{ +EcalRechitChannelStatusGPU::EcalRechitChannelStatusGPU(EcalChannelStatus const& values) : status_(values.size()) { // fill in eb auto const& barrelValues = values.barrelItems(); - for (unsigned int i=0; istatus_.size() * sizeof(uint16_t)) ); - // transfer - cudaCheck( cudaMemcpyAsync(product.status, - this->status_.data(), - this->status_.size() * sizeof(uint16_t), - cudaMemcpyHostToDevice, - cudaStream) ); - } - ); - - return product; - } - - TYPELOOKUP_DATA_REG(EcalRechitChannelStatusGPU); - \ No newline at end of file +EcalRechitChannelStatusGPU::Product const& EcalRechitChannelStatusGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, 
[this](EcalRechitChannelStatusGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.status, this->status_.size() * sizeof(uint16_t))); + // transfer + cudaCheck(cudaMemcpyAsync(product.status, + this->status_.data(), + this->status_.size() * sizeof(uint16_t), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; +} + +TYPELOOKUP_DATA_REG(EcalRechitChannelStatusGPU); diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu index e4e1a59565e0d..c9d023deb8824 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.cu @@ -5,19 +5,17 @@ namespace ecal { namespace reconstruction { - + namespace internal { - + namespace barrel { - + __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x10000; } - + __device__ __forceinline__ uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; } - + __device__ __forceinline__ uint32_t iphi(uint32_t id) { return id & 0x1FF; } - - - + __device__ int dccFromSm(int ism) { int iz = 1; if (ism > 18) @@ -27,9 +25,9 @@ namespace ecal { int idcc = 9 + ism; if (iz == +1) idcc += 18; - return idcc; + return idcc; } - + __device__ int sm(int ieta, int iphi) { int iz = 1; if (ieta < 0) @@ -43,36 +41,28 @@ namespace ecal { ism += 18; return ism; } - - + __device__ int dcc(int ieta, int iphi) { int ism = sm(ieta, iphi); return dccFromSm(ism); } - - - - - // + + // // ---- why on hell things are so complex and not simple ??? 
- // - - - __device__ int lm_channel (int iX, int iY) { - + // + + __device__ int lm_channel(int iX, int iY) { static const int idx_[] = { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - 1, 2, 2, 2, 2, 4, 4, 4, 4, - 6, 6, 6, 6, 8, 8, 8, 8, // 3 - 1, 2, 2, 2, 2, 4, 4, 4, 4, - 6, 6, 6, 6, 8, 8, 8, 8, // 2 - 1, 3, 3, 3, 3, 5, 5, 5, 5, - 7, 7, 7, 7, 9, 9, 9, 9, // 1 - 1, 3, 3, 3, 3, 5, 5, 5, 5, - 7, 7, 7, 7, 9, 9, 9, 9 // 0 - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + // clang-format off + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 3 + 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 2 + 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9, // 1 + 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9 // 0 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + // clang-format on }; - + int il, ic, ii; const int iym = 4; const int ixm = 17; @@ -85,15 +75,12 @@ namespace ecal { return -1; }; return idx_[ii]; - } - - - - __device__ int localCoord_x (int ieta, int iphi) { + + __device__ int localCoord_x(int ieta, int iphi) { int iz = 1; if (ieta < 0) { - iz = -1; + iz = -1; } ieta *= iz; // int iphi_ = iphi; @@ -105,15 +92,14 @@ namespace ecal { // if (iz == -1) { // iy = 19 - iy; // } - + return ix; } - - - __device__ int localCoord_y (int ieta, int iphi) { + + __device__ int localCoord_y(int ieta, int iphi) { int iz = 1; if (ieta < 0) { - iz = -1; + iz = -1; } // ieta *= iz; int iphi_ = iphi; @@ -125,94 +111,80 @@ namespace ecal { if (iz == -1) { iy = 19 - iy; } - + return iy; } - - - __device__ int lmmod (int ieta, int iphi) { - + + __device__ int lmmod(int ieta, int iphi) { int ix = localCoord_x(ieta, iphi); int iy = localCoord_y(ieta, iphi); - + return lm_channel(ix / 5, iy / 5); } - - - - __device__ int side (int ieta, int iphi) { + + __device__ int side(int ieta, int iphi) { int ilmmod = lmmod(ieta, iphi); return (ilmmod % 2 == 0) ? 
1 : 0; } - - - + } // namespace barrel - + } // namespace internal - + __device__ uint32_t hashedIndexEB(uint32_t id) { using namespace internal::barrel; return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1; } - - - - // + + // // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc // function: "lmr" - - __device__ - int laser_monitoring_region_EB(uint32_t id) { + + __device__ int laser_monitoring_region_EB(uint32_t id) { using namespace internal::barrel; - + int ieta; if (positiveZ(id)) { ieta = ietaAbs(id); + } else { + ieta = -ietaAbs(id); } - else { - ieta = - ietaAbs(id); - } - - int idcc = dcc(ieta, (int) (iphi(id)) ); + + int idcc = dcc(ieta, (int)(iphi(id))); int ism = idcc - 9; - - int iside = side(ieta, (int) (iphi(id)) ); + + int iside = side(ieta, (int)(iphi(id))); // int iside = positiveZ(id) ? 1 : 0; - - return ( 1 + 2 * (ism - 1) + iside ); + + return (1 + 2 * (ism - 1) + iside); // return ieta; // return (int) (iphi(id)); // return idcc; // return iside; - } - - - - + namespace internal { - + namespace endcap { - + __device__ __forceinline__ uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; } - + __device__ __forceinline__ uint32_t iy(uint32_t id) { return id & 0x7F; } - + __device__ __forceinline__ bool positiveZ(uint32_t id) { return id & 0x4000; } - + // these constants come from EE Det Id __constant__ const unsigned short kxf[] = { - 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, - 51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, - 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, - 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, - 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, 1, 59, 1, 58, 4, 56, 4, 51, 4, - 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 
6, 51, 6, 51, 6, 51, 6, 51, - 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21, - 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; - - __constant__ const unsigned short kdi[] = { + 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, + 51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, + 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, + 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, + 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, 1, 59, 1, 58, 4, 56, 4, 51, 4, + 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, + 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21, + 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; + + __constant__ const unsigned short kdi[] = { 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821, 858, 895, 932, 969, 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, @@ -225,137 +197,112 @@ namespace ecal { 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, 6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, 7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314}; - - - __device__ int quadrant(int iX, int iY) { - bool near = iX >= 11; - bool far = !near; - bool top = iY >= 11; - bool bot = !top; - - int iquad = 0; - if (near && top) - iquad = 1; - if (far && top) - iquad = 2; - if (far && bot) - iquad = 3; - if (near && bot) - iquad = 4; - - return iquad; - } - - __device__ int sector(int iX, 
int iY) { - // Y (towards the surface) - // T - // | - // | - // | - // o---------| X (towards center of LHC) - // - static const int idx_[] = { - // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, - 9, 9, 0, 0, 0, 0, 0, 0, 0, // 20 - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, - 9, 9, 9, 9, 9, 0, 0, 0, 0, // 19 - 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, - 9, 9, 9, 9, 9, 8, 0, 0, 0, // 18 - 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, - 9, 9, 9, 9, 8, 8, 8, 0, 0, // 17 - 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, - 9, 9, 9, 9, 8, 8, 8, 8, 0, // 16 - 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, - 9, 9, 9, 8, 8, 8, 8, 8, 0, // 15 - 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, - 9, 9, 8, 8, 8, 8, 8, 8, 0, // 14 - 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, - 9, 8, 8, 8, 8, 8, 8, 8, 8, // 13 - 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, - 8, 8, 8, 8, 8, 8, 8, 7, 7, // 12 - 3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, - 0, 8, 7, 7, 7, 7, 7, 7, 7, // 11 - 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, - 0, 7, 7, 7, 7, 7, 7, 7, 7, // 10 - 3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, - 6, 6, 7, 7, 7, 7, 7, 7, 7, // 9 - 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, - 6, 6, 6, 7, 7, 7, 7, 7, 7, // 8 - 0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, - 6, 6, 6, 6, 6, 7, 7, 7, 0, // 7 - 0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, - 5, 6, 6, 6, 6, 6, 6, 7, 0, // 6 - 0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, - 5, 6, 6, 6, 6, 6, 6, 6, 0, // 5 - 0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, - 5, 6, 6, 6, 6, 6, 6, 0, 0, // 4 - 0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, - 5, 5, 6, 6, 6, 6, 0, 0, 0, // 3 - 0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, - 5, 5, 6, 6, 6, 0, 0, 0, 0, // 2 - 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, - 5, 5, 0, 0, 0, 0, 0, 0, 0 // 1 - // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 - }; - - int iym, ixm, il, ic, ii; - iym = 20; - ixm = 20; - int iX_ = iX; - int iY_ = iY; - il = iym - iY_; - ic = iX_ - 1; - ii = il * ixm + ic; - - if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) { - return -1; - }; - return idx_[ii]; - } - - - + + __device__ int quadrant(int iX, int iY) { + bool near = iX >= 11; + bool far 
= !near; + bool top = iY >= 11; + bool bot = !top; + + int iquad = 0; + if (near && top) + iquad = 1; + if (far && top) + iquad = 2; + if (far && bot) + iquad = 3; + if (near && bot) + iquad = 4; + + return iquad; + } + + __device__ int sector(int iX, int iY) { + // Y (towards the surface) + // T + // | + // | + // | + // o---------| X (towards center of LHC) + // + static const int idx_[] = { + // clang-format off + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, // 20 + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, // 19 + 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 8, 0, 0, 0, // 18 + 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 0, 0, // 17 + 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 8, 0, // 16 + 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, 9, 9, 9, 8, 8, 8, 8, 8, 0, // 15 + 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, 9, 9, 8, 8, 8, 8, 8, 8, 0, // 14 + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, // 13 + 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 8, 8, 8, 8, 8, 8, 7, 7, // 12 + 3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 8, 7, 7, 7, 7, 7, 7, 7, // 11 + 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, // 10 + 3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, 6, 6, 7, 7, 7, 7, 7, 7, 7, // 9 + 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7, // 8 + 0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 0, // 7 + 0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 0, // 6 + 0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 0, // 5 + 0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 0, // 4 + 0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 0, 0, 0, // 3 + 0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 0, 0, // 2 + 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0 // 1 + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + // clang-format on + }; + + int iym, ixm, il, ic, ii; + iym = 20; + ixm = 20; + int iX_ = iX; + int iY_ = iY; + il = 
iym - iY_; + ic = iX_ - 1; + ii = il * ixm + ic; + + if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) { + return -1; + }; + return idx_[ii]; + } + } // namespace endcap - + } // namespace internal - + __device__ uint32_t hashedIndexEE(uint32_t id) { using namespace internal::endcap; - + const uint32_t jx(ix(id)); const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50); return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]); } - - - - - // + + // // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc - // - - __device__ - int laser_monitoring_region_EE(uint32_t id) { + // + + __device__ int laser_monitoring_region_EE(uint32_t id) { using namespace internal::endcap; - + // SuperCrysCoord uint32_t iX = (ix(id) - 1) / 5 + 1; uint32_t iY = (iy(id) - 1) / 5 + 1; - - // Correct convention + + // Correct convention // * @param iz iz/zside index: -1 for EE-, +1 for EE+ // https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71 // zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63 - // + // int iz = positiveZ(id) ? 
1 : -1; - + int iquad = quadrant(iX, iY); int isect = sector(iX, iY); if (isect < 0) return -1; - + int ilmr = 0; ilmr = isect - 6; if (ilmr <= 0) @@ -368,14 +315,9 @@ namespace ecal { ilmr += 72; else ilmr += 82; - + return ilmr; - } - - - - + } // namespace reconstruction } // namespace ecal - diff --git a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h index d83f0c1fe2674..f291e85db5a06 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h +++ b/RecoLocalCalo/EcalRecAlgos/src/KernelHelpers.h @@ -431,22 +431,18 @@ namespace ecal { } // namespace multifit } // namespace ecal - namespace ecal { namespace reconstruction { - + __device__ uint32_t hashedIndexEB(uint32_t id); - + __device__ uint32_t hashedIndexEE(uint32_t id); - - + __device__ int laser_monitoring_region_EB(uint32_t id); - + __device__ int laser_monitoring_region_EE(uint32_t id); - + } // namespace reconstruction } // namespace ecal - -#endif // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h - +#endif // RecoLocalCalo_EcalRecAlgos_src_KernelHelpers_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc index fc6ae22ff57e0..8c5e5c0c9783d 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc @@ -9,8 +9,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" - +#include "FWCore/Framework/interface/MakerMacros.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" @@ -20,166 +19,145 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" -class EcalCPURecHitProducer -: public edm::stream::EDProducer -{ +class EcalCPURecHitProducer : public edm::stream::EDProducer { public: explicit 
EcalCPURecHitProducer(edm::ParameterSet const& ps); ~EcalCPURecHitProducer() override = default; static void fillDescriptions(edm::ConfigurationDescriptions&); - + private: - void acquire(edm::Event const&, - edm::EventSetup const&, - edm::WaitingTaskWithArenaHolder) override; - void produce(edm::Event&, edm::EventSetup const&) override; - + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + private: edm::EDGetTokenT>> recHitsInEBToken_, recHitsInEEToken_; edm::EDPutTokenT> recHitsOutEBToken_, recHitsOutEEToken_; - + ecal::RecHit recHitsEB_, recHitsEE_; bool containsTimingInformation_; }; -void EcalCPURecHitProducer::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) { +void EcalCPURecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { edm::ParameterSetDescription desc; - + desc.add("recHitsInLabelEB", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEB"}); desc.add("recHitsInLabelEE", edm::InputTag{"ecalRecHitProducerGPU", "EcalRecHitsEE"}); desc.add("recHitsOutLabelEB", "EcalRecHitsEB"); desc.add("recHitsOutLabelEE", "EcalRecHitsEE"); desc.add("containsTimingInformation", false); - + std::string label = "ecalCPURecHitProducer"; confDesc.add(label, desc); - } - - EcalCPURecHitProducer::EcalCPURecHitProducer( - const edm::ParameterSet& ps) - : recHitsInEBToken_{consumes>>(ps.getParameter("recHitsInLabelEB"))} - , recHitsInEEToken_{consumes>>(ps.getParameter("recHitsInLabelEE"))} - , recHitsOutEBToken_{produces>(ps.getParameter("recHitsOutLabelEB"))} - , recHitsOutEEToken_{produces>(ps.getParameter("recHitsOutLabelEE"))} - , containsTimingInformation_{ps.getParameter("containsTimingInformation")} - {} - - - void EcalCPURecHitProducer::acquire( - edm::Event const& event, - edm::EventSetup const& setup, - edm::WaitingTaskWithArenaHolder taskHolder) - { - // retrieve data/ctx - auto const& ebRecHitsProduct = 
event.get(recHitsInEBToken_); - auto const& eeRecHitsProduct = event.get(recHitsInEEToken_); - cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)}; - auto const& ebRecHits = ctx.get(ebRecHitsProduct); - auto const& eeRecHits = ctx.get(eeRecHitsProduct); - - // resize the output buffers - recHitsEB_.resize(ebRecHits.size); - recHitsEE_.resize(eeRecHits.size); - - // std::cout << " [EcalCPURecHitProducer::acquire] ebRecHits.size = " << ebRecHits.size << std::endl; - // std::cout << " [EcalCPURecHitProducer::acquire] eeRecHits.size = " << eeRecHits.size << std::endl; - - // enqeue transfers - cudaCheck( cudaMemcpyAsync(recHitsEB_.did.data(), - ebRecHits.did, - recHitsEB_.did.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.did.data(), - eeRecHits.did, - recHitsEE_.did.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - // - // ./CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h:using StorageScalarType = float; - // - - cudaCheck( cudaMemcpyAsync(recHitsEB_.energy.data(), - ebRecHits.energy, - recHitsEB_.energy.size() * sizeof(::ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.energy.data(), - eeRecHits.energy, - recHitsEE_.energy.size() * sizeof(::ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - cudaCheck( cudaMemcpyAsync(recHitsEB_.chi2.data(), - ebRecHits.chi2, - recHitsEB_.chi2.size() * sizeof(::ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.chi2.data(), - eeRecHits.chi2, - recHitsEE_.chi2.size() * sizeof(::ecal::reco::StorageScalarType), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - cudaCheck( cudaMemcpyAsync(recHitsEB_.extra.data(), - ebRecHits.extra, - recHitsEB_.extra.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( 
cudaMemcpyAsync(recHitsEE_.extra.data(), - eeRecHits.extra, - recHitsEE_.extra.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - cudaCheck( cudaMemcpyAsync(recHitsEB_.flagBits.data(), - ebRecHits.flagBits, - recHitsEB_.flagBits.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - cudaCheck( cudaMemcpyAsync(recHitsEE_.flagBits.data(), - eeRecHits.flagBits, - recHitsEE_.flagBits.size() * sizeof(uint32_t), - cudaMemcpyDeviceToHost, - ctx.stream()) ); - - - - - // for (unsigned int ieb = 0; ieb < ebRecHits.size ; ieb++) { - // if (recHitsEB_.extra[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb extra = " << recHitsEB_.extra[ieb] << std::endl; - // } - - // - // for (unsigned int ieb = 0; ieb < ebRecHits.size ; ieb++) { - // if (recHitsEB_.energy[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb energy = " << recHitsEB_.energy[ieb] << std::endl; - // } - // - // for (unsigned int iee = 0; iee < eeRecHits.size ; iee++) { - // if (recHitsEE_.energy[iee] != 0 ) std::cout << " [ " << iee << " :: " << eeRecHits.size << " ] [ " << recHitsEE_.did[iee] << " ] ee energy = " << recHitsEE_.energy[iee] << std::endl; - // } - // - - - - - } - - void EcalCPURecHitProducer::produce( - edm::Event& event, - edm::EventSetup const& setup) - { - // tmp vectors - auto recHitsOutEB = std::make_unique>(std::move(recHitsEB_)); - auto recHitsOutEE = std::make_unique>(std::move(recHitsEE_)); - - // put into event - event.put(recHitsOutEBToken_, std::move(recHitsOutEB)); - event.put(recHitsOutEEToken_, std::move(recHitsOutEE)); - } - - DEFINE_FWK_MODULE(EcalCPURecHitProducer); - - - \ No newline at end of file +} + +EcalCPURecHitProducer::EcalCPURecHitProducer(const edm::ParameterSet& ps) + : recHitsInEBToken_{consumes>>( + ps.getParameter("recHitsInLabelEB"))}, + recHitsInEEToken_{consumes>>( + 
ps.getParameter("recHitsInLabelEE"))}, + recHitsOutEBToken_{produces>(ps.getParameter("recHitsOutLabelEB"))}, + recHitsOutEEToken_{produces>(ps.getParameter("recHitsOutLabelEE"))}, + containsTimingInformation_{ps.getParameter("containsTimingInformation")} {} + +void EcalCPURecHitProducer::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder taskHolder) { + // retrieve data/ctx + auto const& ebRecHitsProduct = event.get(recHitsInEBToken_); + auto const& eeRecHitsProduct = event.get(recHitsInEEToken_); + cms::cuda::ScopedContextAcquire ctx{ebRecHitsProduct, std::move(taskHolder)}; + auto const& ebRecHits = ctx.get(ebRecHitsProduct); + auto const& eeRecHits = ctx.get(eeRecHitsProduct); + + // resize the output buffers + recHitsEB_.resize(ebRecHits.size); + recHitsEE_.resize(eeRecHits.size); + + // std::cout << " [EcalCPURecHitProducer::acquire] ebRecHits.size = " << ebRecHits.size << std::endl; + // std::cout << " [EcalCPURecHitProducer::acquire] eeRecHits.size = " << eeRecHits.size << std::endl; + + // enqeue transfers + cudaCheck(cudaMemcpyAsync(recHitsEB_.did.data(), + ebRecHits.did, + recHitsEB_.did.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream())); + cudaCheck(cudaMemcpyAsync(recHitsEE_.did.data(), + eeRecHits.did, + recHitsEE_.did.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream())); + // + // ./CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h:using StorageScalarType = float; + // + + cudaCheck(cudaMemcpyAsync(recHitsEB_.energy.data(), + ebRecHits.energy, + recHitsEB_.energy.size() * sizeof(::ecal::reco::StorageScalarType), + cudaMemcpyDeviceToHost, + ctx.stream())); + cudaCheck(cudaMemcpyAsync(recHitsEE_.energy.data(), + eeRecHits.energy, + recHitsEE_.energy.size() * sizeof(::ecal::reco::StorageScalarType), + cudaMemcpyDeviceToHost, + ctx.stream())); + + cudaCheck(cudaMemcpyAsync(recHitsEB_.chi2.data(), + ebRecHits.chi2, + recHitsEB_.chi2.size() * 
sizeof(::ecal::reco::StorageScalarType), + cudaMemcpyDeviceToHost, + ctx.stream())); + cudaCheck(cudaMemcpyAsync(recHitsEE_.chi2.data(), + eeRecHits.chi2, + recHitsEE_.chi2.size() * sizeof(::ecal::reco::StorageScalarType), + cudaMemcpyDeviceToHost, + ctx.stream())); + + cudaCheck(cudaMemcpyAsync(recHitsEB_.extra.data(), + ebRecHits.extra, + recHitsEB_.extra.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream())); + cudaCheck(cudaMemcpyAsync(recHitsEE_.extra.data(), + eeRecHits.extra, + recHitsEE_.extra.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream())); + + cudaCheck(cudaMemcpyAsync(recHitsEB_.flagBits.data(), + ebRecHits.flagBits, + recHitsEB_.flagBits.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream())); + cudaCheck(cudaMemcpyAsync(recHitsEE_.flagBits.data(), + eeRecHits.flagBits, + recHitsEE_.flagBits.size() * sizeof(uint32_t), + cudaMemcpyDeviceToHost, + ctx.stream())); + + // for (unsigned int ieb = 0; ieb < ebRecHits.size ; ieb++) { + // if (recHitsEB_.extra[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb extra = " << recHitsEB_.extra[ieb] << std::endl; + // } + + // + // for (unsigned int ieb = 0; ieb < ebRecHits.size ; ieb++) { + // if (recHitsEB_.energy[ieb] != 0 ) std::cout << " [ " << ieb << " :: " << ebRecHits.size << " ] [ " << recHitsEB_.did[ieb] << " ] eb energy = " << recHitsEB_.energy[ieb] << std::endl; + // } + // + // for (unsigned int iee = 0; iee < eeRecHits.size ; iee++) { + // if (recHitsEE_.energy[iee] != 0 ) std::cout << " [ " << iee << " :: " << eeRecHits.size << " ] [ " << recHitsEE_.did[iee] << " ] ee energy = " << recHitsEE_.energy[iee] << std::endl; + // } + // +} + +void EcalCPURecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) { + // tmp vectors + auto recHitsOutEB = std::make_unique>(std::move(recHitsEB_)); + auto recHitsOutEE = std::make_unique>(std::move(recHitsEE_)); + + // put into event + 
event.put(recHitsOutEBToken_, std::move(recHitsOutEB)); + event.put(recHitsOutEEToken_, std::move(recHitsOutEE)); +} + +DEFINE_FWK_MODULE(EcalCPURecHitProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc index 5fded99cf3d0b..c2f6de85ef5a3 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalESProducersGPUDefs.cc @@ -17,7 +17,6 @@ #include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h" #include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h" - // for uncalibrechit #include "RecoLocalCalo/EcalRecAlgos/interface/EcalPedestalsGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalGainRatiosGPU.h" @@ -35,92 +34,49 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" +#include + +using EcalPedestalsGPUESProducer = EcalESProducerGPU; +using EcalGainRatiosGPUESProducer = EcalESProducerGPU; +using EcalPulseShapesGPUESProducer = EcalESProducerGPU; +using EcalPulseCovariancesGPUESProducer = + EcalESProducerGPU; -#include +using EcalSamplesCorrelationGPUESProducer = + EcalESProducerGPU; + +using EcalTimeBiasCorrectionsGPUESProducer = + EcalESProducerGPU; + +using EcalTimeCalibConstantsGPUESProducer = + EcalESProducerGPU; + +using EcalRechitADCToGeVConstantGPUESProducer = + EcalESProducerGPU; +using EcalIntercalibConstantsGPUESProducer = + EcalESProducerGPU; -using EcalPedestalsGPUESProducer = EcalESProducerGPU; - -using EcalGainRatiosGPUESProducer = EcalESProducerGPU; - -using EcalPulseShapesGPUESProducer = EcalESProducerGPU; - -using EcalPulseCovariancesGPUESProducer = EcalESProducerGPU; - -using EcalSamplesCorrelationGPUESProducer = EcalESProducerGPU< - EcalSamplesCorrelationGPU, - EcalSamplesCorrelation, - EcalSamplesCorrelationRcd - >; - -using EcalTimeBiasCorrectionsGPUESProducer 
= EcalESProducerGPU< - EcalTimeBiasCorrectionsGPU, - EcalTimeBiasCorrections, - EcalTimeBiasCorrectionsRcd - >; - -using EcalTimeCalibConstantsGPUESProducer = EcalESProducerGPU< - EcalTimeCalibConstantsGPU, - EcalTimeCalibConstants, - EcalTimeCalibConstantsRcd - >; - -using EcalRechitADCToGeVConstantGPUESProducer = EcalESProducerGPU< - EcalRechitADCToGeVConstantGPU, - EcalADCToGeVConstant, - EcalADCToGeVConstantRcd - >; - -using EcalIntercalibConstantsGPUESProducer = EcalESProducerGPU< - EcalIntercalibConstantsGPU, - EcalIntercalibConstants, - EcalIntercalibConstantsRcd - >; - -using EcalRechitChannelStatusGPUESProducer = EcalESProducerGPU< - EcalRechitChannelStatusGPU, - EcalChannelStatus, - EcalChannelStatusRcd - >; - -using EcalLaserAPDPNRatiosGPUESProducer = EcalESProducerGPU< - EcalLaserAPDPNRatiosGPU, - EcalLaserAPDPNRatios, - EcalLaserAPDPNRatiosRcd - >; - -using EcalLaserAPDPNRatiosRefGPUESProducer = EcalESProducerGPU< - EcalLaserAPDPNRatiosRefGPU, - EcalLaserAPDPNRatiosRef, - EcalLaserAPDPNRatiosRefRcd - >; - -using EcalLaserAlphasGPUESProducer = EcalESProducerGPU< - EcalLaserAlphasGPU, - EcalLaserAlphas, - EcalLaserAlphasRcd - >; - -using EcalLinearCorrectionsGPUESProducer = EcalESProducerGPU< - EcalLinearCorrectionsGPU, - EcalLinearCorrections, - EcalLinearCorrectionsRcd - >; - -// +using EcalRechitChannelStatusGPUESProducer = + EcalESProducerGPU; + +using EcalLaserAPDPNRatiosGPUESProducer = + EcalESProducerGPU; + +using EcalLaserAPDPNRatiosRefGPUESProducer = + EcalESProducerGPU; + +using EcalLaserAlphasGPUESProducer = EcalESProducerGPU; + +using EcalLinearCorrectionsGPUESProducer = + EcalESProducerGPU; + +// // This below also creates the .py config files, as described in "EcalESProducerGPU.h" -// +// DEFINE_FWK_EVENTSETUP_MODULE(EcalPedestalsGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalGainRatiosGPUESProducer); @@ -137,4 +93,3 @@ DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosGPUESProducer); 
DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAPDPNRatiosRefGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalLaserAlphasGPUESProducer); DEFINE_FWK_EVENTSETUP_MODULE(EcalLinearCorrectionsGPUESProducer); - diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc index 54d772efa806b..548bc812ffa2e 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc @@ -3,7 +3,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" // algorithm specific #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" @@ -14,124 +14,111 @@ #include -class EcalRecHitConvertGPU2CPUFormat -: public edm::stream::EDProducer<> -{ +class EcalRecHitConvertGPU2CPUFormat : public edm::stream::EDProducer<> { public: explicit EcalRecHitConvertGPU2CPUFormat(edm::ParameterSet const& ps); ~EcalRecHitConvertGPU2CPUFormat() override; static void fillDescriptions(edm::ConfigurationDescriptions&); - + private: using GPURecHitType = ecal::RecHit; void produce(edm::Event&, edm::EventSetup const&) override; - + private: const edm::EDGetTokenT recHitsGPUEB_; const edm::EDGetTokenT recHitsGPUEE_; - + const std::string recHitsLabelCPUEB_, recHitsLabelCPUEE_; }; -void EcalRecHitConvertGPU2CPUFormat::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) { +void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { edm::ParameterSetDescription desc; - + desc.add("recHitsLabelGPUEB", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEB")); desc.add("recHitsLabelGPUEE", edm::InputTag("ecalRecHitProducerGPU", "EcalRecHitsGPUEE")); - + 
desc.add("recHitsLabelCPUEB", "EcalRecHitsEB"); desc.add("recHitsLabelCPUEE", "EcalRecHitsEE"); - + std::string label = "ecalRecHitConvertGPU2CPUFormat"; confDesc.add(label, desc); - } - - EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps) - : recHitsGPUEB_{consumes(ps.getParameter("recHitsLabelGPUEB"))} - , recHitsGPUEE_{consumes(ps.getParameter("recHitsLabelGPUEE"))} - , recHitsLabelCPUEB_{ps.getParameter("recHitsLabelCPUEB")} - , recHitsLabelCPUEE_{ps.getParameter("recHitsLabelCPUEE")} - { - produces(recHitsLabelCPUEB_); - produces(recHitsLabelCPUEE_); - } - - EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {} - - void EcalRecHitConvertGPU2CPUFormat::produce( - edm::Event& event, - edm::EventSetup const& setup) - { - edm::Handle hRecHitsGPUEB, hRecHitsGPUEE; - event.getByToken(recHitsGPUEB_, hRecHitsGPUEB); - event.getByToken(recHitsGPUEE_, hRecHitsGPUEE); - - auto recHitsCPUEB = std::make_unique(); - auto recHitsCPUEE = std::make_unique(); - recHitsCPUEB->reserve(hRecHitsGPUEB->energy.size()); - recHitsCPUEE->reserve(hRecHitsGPUEE->energy.size()); - - // - // explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0): - // - - for (uint32_t i=0; ienergy.size(); ++i) { - - // - // Save only if energy is >= 0 ! 
- // This is extremely important because the channels that were supposed - // to be excluded get "-1" as energy - // - - if (hRecHitsGPUEB->energy[i] >=0) { - recHitsCPUEB->emplace_back( - DetId{hRecHitsGPUEB->did[i]}, - hRecHitsGPUEB->energy[i], - hRecHitsGPUEB->time[i], - hRecHitsGPUEB->extra[i], - hRecHitsGPUEB->flagBits[i] - ); - } - - // std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB->energy.size() << "] = " << hRecHitsGPUEB->extra[i] << std::endl; - - // (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->timeError[i]); - // auto const offset = i * EcalDataFrame::MAXSAMPLES; - // for (uint32_t sample=0; sampleenergysAll[offset + sample]); +} + +EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps) + : recHitsGPUEB_{consumes(ps.getParameter("recHitsLabelGPUEB"))}, + recHitsGPUEE_{consumes(ps.getParameter("recHitsLabelGPUEE"))}, + recHitsLabelCPUEB_{ps.getParameter("recHitsLabelCPUEB")}, + recHitsLabelCPUEE_{ps.getParameter("recHitsLabelCPUEE")} { + produces(recHitsLabelCPUEB_); + produces(recHitsLabelCPUEE_); +} + +EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {} + +void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) { + edm::Handle hRecHitsGPUEB, hRecHitsGPUEE; + event.getByToken(recHitsGPUEB_, hRecHitsGPUEB); + event.getByToken(recHitsGPUEE_, hRecHitsGPUEE); + + auto recHitsCPUEB = std::make_unique(); + auto recHitsCPUEE = std::make_unique(); + recHitsCPUEB->reserve(hRecHitsGPUEB->energy.size()); + recHitsCPUEE->reserve(hRecHitsGPUEE->energy.size()); + + // + // explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0): + // + + for (uint32_t i = 0; i < hRecHitsGPUEB->energy.size(); ++i) { + // + // Save only if energy is >= 0 ! 
+ // This is extremely important because the channels that were supposed + // to be excluded get "-1" as energy + // + + if (hRecHitsGPUEB->energy[i] >= 0) { + recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]}, + hRecHitsGPUEB->energy[i], + hRecHitsGPUEB->time[i], + hRecHitsGPUEB->extra[i], + hRecHitsGPUEB->flagBits[i]); } - - for (uint32_t i=0; ienergy.size(); ++i) { - // - // Save only if energy is >= 0 ! - // This is extremely important because the channels that were supposed - // to be excluded get "-1" as energy - // - - if (hRecHitsGPUEE->energy[i] >=0) { - recHitsCPUEE->emplace_back( - DetId{hRecHitsGPUEE->did[i]}, - hRecHitsGPUEE->energy[i], - hRecHitsGPUEE->time[i], - hRecHitsGPUEE->extra[i], - hRecHitsGPUEE->flagBits[i] - ); - } - - // std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE->energy.size() << "] = " << hRecHitsGPUEE->extra[i] << std::endl; - - // (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->timeError[i]); - // auto const offset = i * EcalDataFrame::MAXSAMPLES; - // for (uint32_t sample=0; sampleenergysAll[offset + sample]); + + // std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB->energy.size() << "] = " << hRecHitsGPUEB->extra[i] << std::endl; + + // (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->timeError[i]); + // auto const offset = i * EcalDataFrame::MAXSAMPLES; + // for (uint32_t sample=0; sampleenergysAll[offset + sample]); + } + + for (uint32_t i = 0; i < hRecHitsGPUEE->energy.size(); ++i) { + // + // Save only if energy is >= 0 ! 
+ // This is extremely important because the channels that were supposed + // to be excluded get "-1" as energy + // + + if (hRecHitsGPUEE->energy[i] >= 0) { + recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]}, + hRecHitsGPUEE->energy[i], + hRecHitsGPUEE->time[i], + hRecHitsGPUEE->extra[i], + hRecHitsGPUEE->flagBits[i]); } - - event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_); - event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_); + + // std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE->energy.size() << "] = " << hRecHitsGPUEE->extra[i] << std::endl; + + // (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->timeError[i]); + // auto const offset = i * EcalDataFrame::MAXSAMPLES; + // for (uint32_t sample=0; sampleenergysAll[offset + sample]); } - - DEFINE_FWK_MODULE(EcalRecHitConvertGPU2CPUFormat); - \ No newline at end of file + + event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_); + event.put(std::move(recHitsCPUEE), recHitsLabelCPUEE_); +} + +DEFINE_FWK_MODULE(EcalRecHitConvertGPU2CPUFormat); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index 7422838471ebc..795a499987a06 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -1,23 +1,21 @@ // framework #include "FWCore/Framework/interface/stream/EDProducer.h" - #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/MakerMacros.h" -// -// -// +// +// +// // format #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" #include 
"CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" - // needed for definition of flags #include "DataFormats/EcalRecHit/interface/EcalRecHit.h" @@ -34,7 +32,6 @@ #include "CondFormats/DataRecord/interface/EcalLaserAlphasRcd.h" #include "CondFormats/DataRecord/interface/EcalLinearCorrectionsRcd.h" - // conditions gpu #include "RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h" @@ -45,108 +42,89 @@ #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h" #include "RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h" - #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" // configuration #include "CommonTools/Utils/interface/StringToEnumValue.h" - -class EcalRecHitProducerGPU: public edm::stream::EDProducer { - +class EcalRecHitProducerGPU : public edm::stream::EDProducer { public: explicit EcalRecHitProducerGPU(edm::ParameterSet const& ps); ~EcalRecHitProducerGPU() override; static void fillDescriptions(edm::ConfigurationDescriptions&); - + private: - using RecHitType = ecal::RecHit; - void acquire(edm::Event const&, - edm::EventSetup const&, - edm::WaitingTaskWithArenaHolder) override; - void produce(edm::Event&, edm::EventSetup const&) override; - + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + private: - // data - uint32_t neb_, nee_; // extremely important, in particular neb_ - - - // gpu input - edm::EDGetTokenT > > uncalibRecHitsInEBToken_; - edm::EDGetTokenT > > uncalibRecHitsInEEToken_; - - - + uint32_t neb_, nee_; // extremely important, in particular neb_ + + // gpu input + edm::EDGetTokenT>> uncalibRecHitsInEBToken_; + edm::EDGetTokenT>> uncalibRecHitsInEEToken_; + // event data ecal::rechit::EventOutputDataGPU eventOutputDataGPU_; - bool shouldTransferToHost_{true}; - + cms::cuda::ContextState cudaState_; - + 
// gpu output - edm::EDPutTokenT>> recHitsTokenEB_, recHitsTokenEE_; - - + edm::EDPutTokenT>> recHitsTokenEB_, recHitsTokenEE_; + // configuration parameters ecal::rechit::ConfigurationParameters configParameters_; uint32_t maxNumberHits_; - - + // conditions handles edm::ESHandle ADCToGeVConstantHandle_; - edm::ESHandle IntercalibConstantsHandle_; - edm::ESHandle ChannelStatusHandle_; - - edm::ESHandle LaserAPDPNRatiosHandle_; + edm::ESHandle IntercalibConstantsHandle_; + edm::ESHandle ChannelStatusHandle_; + + edm::ESHandle LaserAPDPNRatiosHandle_; edm::ESHandle LaserAPDPNRatiosRefHandle_; - edm::ESHandle LaserAlphasHandle_; - edm::ESHandle LinearCorrectionsHandle_; - + edm::ESHandle LaserAlphasHandle_; + edm::ESHandle LinearCorrectionsHandle_; + // configuration std::vector v_chstatus_; - - + // // https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.h // - + // Associate reco flagbit ( outer vector) to many db status flags (inner vector) // std::vector > v_DB_reco_flags_; - std::vector expanded_v_DB_reco_flags_; // Transform a map in a vector // FIXME AM: int or uint32 to be checked - std::vector expanded_Sizes_v_DB_reco_flags_; // Saving the size for each piece - std::vector expanded_flagbit_v_DB_reco_flags_; // And the "key" for each key - - + std::vector + expanded_v_DB_reco_flags_; // Transform a map in a vector // FIXME AM: int or uint32 to be checked + std::vector expanded_Sizes_v_DB_reco_flags_; // Saving the size for each piece + std::vector expanded_flagbit_v_DB_reco_flags_; // And the "key" for each key + uint32_t flagmask_; // do not propagate channels with these flags on - - }; +void EcalRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + desc.add("uncalibrecHitsInLabelEB", + edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB")); + desc.add("uncalibrecHitsInLabelEE", + 
edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE")); -void EcalRecHitProducerGPU::fillDescriptions( - edm::ConfigurationDescriptions& confDesc) -{ - - edm::ParameterSetDescription desc; - - desc.add("uncalibrecHitsInLabelEB", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEB")); - desc.add("uncalibrecHitsInLabelEE", edm::InputTag("ecalUncalibRecHitProducerGPU", "EcalUncalibRecHitsEE")); - desc.add("recHitsLabelEB", "EcalRecHitsGPUEB"); desc.add("recHitsLabelEE", "EcalRecHitsGPUEE"); - + desc.add("killDeadChannels", true); - + desc.add("EBLaserMIN", 0.01); desc.add("EELaserMIN", 0.01); desc.add("EBLaserMAX", 30.0); desc.add("EELaserMAX", 30.0); - + desc.add("maxNumberHits", 20000); - + // ## db statuses to be exluded from reconstruction (some will be recovered) edm::ParameterSetDescription desc_ChannelStatusToBeExcluded; desc_ChannelStatusToBeExcluded.add("kDAC"); @@ -159,122 +137,106 @@ void EcalRecHitProducerGPU::fillDescriptions( desc_ChannelStatusToBeExcluded.add("kDeadVFE"); desc_ChannelStatusToBeExcluded.add("kDeadFE"); desc_ChannelStatusToBeExcluded.add("kNoDataNoTP"); - + std::vector default_ChannelStatusToBeExcluded(1); - + desc.addVPSet("ChannelStatusToBeExcluded", desc_ChannelStatusToBeExcluded, default_ChannelStatusToBeExcluded); - } - -EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { - +EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { //---- input - uncalibRecHitsInEBToken_ = consumes>>(ps.getParameter("uncalibrecHitsInLabelEB")); - uncalibRecHitsInEEToken_ = consumes>>(ps.getParameter("uncalibrecHitsInLabelEE")); - + uncalibRecHitsInEBToken_ = consumes>>( + ps.getParameter("uncalibrecHitsInLabelEB")); + uncalibRecHitsInEEToken_ = consumes>>( + ps.getParameter("uncalibrecHitsInLabelEE")); + //---- output - recHitsTokenEB_ = produces>>( ps.getParameter("recHitsLabelEB") ); - recHitsTokenEE_ = produces>>( ps.getParameter("recHitsLabelEE") ); - - + recHitsTokenEB_ = 
+ produces>>(ps.getParameter("recHitsLabelEB")); + recHitsTokenEE_ = + produces>>(ps.getParameter("recHitsLabelEE")); + //---- db statuses to be exluded from reconstruction - v_chstatus_ = StringToEnumValue( ps.getParameter >("ChannelStatusToBeExcluded")); - - + v_chstatus_ = StringToEnumValue( + ps.getParameter>("ChannelStatusToBeExcluded")); + bool killDeadChannels = ps.getParameter("killDeadChannels"); configParameters_.killDeadChannels = killDeadChannels; - - + configParameters_.EBLaserMIN = ps.getParameter("EBLaserMIN"); configParameters_.EELaserMIN = ps.getParameter("EELaserMIN"); configParameters_.EBLaserMAX = ps.getParameter("EBLaserMAX"); configParameters_.EELaserMAX = ps.getParameter("EELaserMAX"); - - + // max number of digis to allocate for maxNumberHits_ = ps.getParameter("maxNumberHits"); - + // allocate event output data eventOutputDataGPU_.allocate(configParameters_, maxNumberHits_); - + configParameters_.ChannelStatusToBeExcludedSize = v_chstatus_.size(); - - cudaCheck( cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, - sizeof(int) * v_chstatus_.size()) - ); - cudaCheck( cudaMemcpy(configParameters_.ChannelStatusToBeExcluded, - v_chstatus_.data(), - v_chstatus_.size() * sizeof(int), - cudaMemcpyHostToDevice) ); - - - + + cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size())); + cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded, + v_chstatus_.data(), + v_chstatus_.size() * sizeof(int), + cudaMemcpyHostToDevice)); + // // https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc - // - + // + // Traslate string representation of flagsMapDBReco into enum values const edm::ParameterSet& p = ps.getParameter("flagsMapDBReco"); std::vector recoflagbitsStrings = p.getParameterNames(); // v_DB_reco_flags_.resize(32); - + for (unsigned int i = 0; i != recoflagbitsStrings.size(); ++i) { 
EcalRecHit::Flags recoflagbit = (EcalRecHit::Flags)StringToEnumValue(recoflagbitsStrings[i]); - std::vector dbstatus_s = p.getParameter >(recoflagbitsStrings[i]); + std::vector dbstatus_s = p.getParameter>(recoflagbitsStrings[i]); // std::vector dbstatuses; for (unsigned int j = 0; j != dbstatus_s.size(); ++j) { EcalChannelStatusCode::Code dbstatus = - (EcalChannelStatusCode::Code)StringToEnumValue(dbstatus_s[j]); + (EcalChannelStatusCode::Code)StringToEnumValue(dbstatus_s[j]); // dbstatuses.push_back(dbstatus); expanded_v_DB_reco_flags_.push_back(dbstatus); } - - expanded_Sizes_v_DB_reco_flags_.push_back( dbstatus_s.size() ); - expanded_flagbit_v_DB_reco_flags_.push_back( recoflagbit ); - + + expanded_Sizes_v_DB_reco_flags_.push_back(dbstatus_s.size()); + expanded_flagbit_v_DB_reco_flags_.push_back(recoflagbit); + // v_DB_reco_flags_[recoflagbit] = dbstatuses; } - + // actual values - cudaCheck( cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, - sizeof(int) * expanded_v_DB_reco_flags_.size()) - ); - - cudaCheck( cudaMemcpy(configParameters_.expanded_v_DB_reco_flags, - expanded_v_DB_reco_flags_.data(), - expanded_v_DB_reco_flags_.size() * sizeof(int), - cudaMemcpyHostToDevice) - ); - - + cudaCheck( + cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size())); + + cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags, + expanded_v_DB_reco_flags_.data(), + expanded_v_DB_reco_flags_.size() * sizeof(int), + cudaMemcpyHostToDevice)); + // sizes - cudaCheck( cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags, - sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size() ) - ); - - cudaCheck( cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags, - expanded_Sizes_v_DB_reco_flags_.data(), - expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice) - ); - + cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags, + 
sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size())); + + cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags, + expanded_Sizes_v_DB_reco_flags_.data(), + expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + // keys - cudaCheck( cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags, - sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size() ) - ); - - cudaCheck( cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags, - expanded_flagbit_v_DB_reco_flags_.data(), - expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice) - ); - - + cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags, + sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size())); + + cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags, + expanded_flagbit_v_DB_reco_flags_.data(), + expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + configParameters_.expanded_v_DB_reco_flagsSize = expanded_flagbit_v_DB_reco_flags_.size(); - - - + flagmask_ = 0; flagmask_ |= 0x1 << EcalRecHit::kNeighboursRecovered; flagmask_ |= 0x1 << EcalRecHit::kTowerRecovered; @@ -282,45 +244,35 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { flagmask_ |= 0x1 << EcalRecHit::kKilled; flagmask_ |= 0x1 << EcalRecHit::kTPSaturated; flagmask_ |= 0x1 << EcalRecHit::kL1SpikeFlag; - + configParameters_.flagmask = flagmask_; - - + // for recovery and killing - - configParameters_.recoverEBIsolatedChannels = ps.getParameter("recoverEBIsolatedChannels"); - configParameters_.recoverEEIsolatedChannels = ps.getParameter("recoverEEIsolatedChannels"); - configParameters_.recoverEBVFE = ps.getParameter("recoverEBVFE"); - configParameters_.recoverEEVFE = ps.getParameter("recoverEEVFE"); - configParameters_.recoverEBFE = ps.getParameter("recoverEBFE"); - configParameters_.recoverEEFE = ps.getParameter("recoverEEFE"); - 
- - -} + configParameters_.recoverEBIsolatedChannels = ps.getParameter("recoverEBIsolatedChannels"); + configParameters_.recoverEEIsolatedChannels = ps.getParameter("recoverEEIsolatedChannels"); + configParameters_.recoverEBVFE = ps.getParameter("recoverEBVFE"); + configParameters_.recoverEEVFE = ps.getParameter("recoverEEVFE"); + configParameters_.recoverEBFE = ps.getParameter("recoverEBFE"); + configParameters_.recoverEEFE = ps.getParameter("recoverEEFE"); +} EcalRecHitProducerGPU::~EcalRecHitProducerGPU() { - - // free event ouput data + // free event ouput data eventOutputDataGPU_.deallocate(configParameters_); - + // FIXME AM: do I need to do this? // Or can I do it as part of "deallocate" ? - cudaCheck( cudaFree(configParameters_.ChannelStatusToBeExcluded) ); - - cudaCheck( cudaFree(configParameters_.expanded_v_DB_reco_flags) ); - cudaCheck( cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags) ); - cudaCheck( cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags) ); - -} + cudaCheck(cudaFree(configParameters_.ChannelStatusToBeExcluded)); + cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags)); + cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags)); + cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags)); +} -void EcalRecHitProducerGPU::acquire( - edm::Event const& event, - edm::EventSetup const& setup, - edm::WaitingTaskWithArenaHolder holder) -{ +void EcalRecHitProducerGPU::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) { // cuda products auto const& ebUncalibRecHitsProduct = event.get(uncalibRecHitsInEBToken_); auto const& eeUncalibRecHitsProduct = event.get(uncalibRecHitsInEEToken_); @@ -329,114 +281,95 @@ void EcalRecHitProducerGPU::acquire( // get actual object auto const& ebUncalibRecHits = ctx.get(ebUncalibRecHitsProduct); auto const& eeUncalibRecHits = ctx.get(eeUncalibRecHitsProduct); - + ecal::rechit::EventInputDataGPU 
inputDataGPU{ebUncalibRecHits, eeUncalibRecHits}; - + neb_ = ebUncalibRecHits.size; nee_ = eeUncalibRecHits.size; // std::cout << " [EcalRecHitProducerGPU::acquire] neb_:nee_ = " << neb_ << " : " << nee_ << std::endl; - - int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE - + + int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE + // conditions - // - laser correction + // - laser correction // - IC // - adt2gev - - // - setup.get() .get(ADCToGeVConstantHandle_); + + // + setup.get().get(ADCToGeVConstantHandle_); setup.get().get(IntercalibConstantsHandle_); - setup.get() .get(ChannelStatusHandle_); - - setup.get() .get(LaserAPDPNRatiosHandle_); - setup.get() .get(LaserAPDPNRatiosRefHandle_); - setup.get() .get(LaserAlphasHandle_); - setup.get() .get(LinearCorrectionsHandle_); - - // - - auto const& ADCToGeVConstantProduct = ADCToGeVConstantHandle_ -> getProduct(ctx.stream()); - auto const& IntercalibConstantsProduct = IntercalibConstantsHandle_ -> getProduct(ctx.stream()); - auto const& ChannelStatusProduct = ChannelStatusHandle_ -> getProduct(ctx.stream()); - - auto const& LaserAPDPNRatiosProduct = LaserAPDPNRatiosHandle_ -> getProduct(ctx.stream()); - auto const& LaserAPDPNRatiosRefProduct = LaserAPDPNRatiosRefHandle_ -> getProduct(ctx.stream()); - auto const& LaserAlphasProduct = LaserAlphasHandle_ -> getProduct(ctx.stream()); - auto const& LinearCorrectionsProduct = LinearCorrectionsHandle_ -> getProduct(ctx.stream()); - - + setup.get().get(ChannelStatusHandle_); + + setup.get().get(LaserAPDPNRatiosHandle_); + setup.get().get(LaserAPDPNRatiosRefHandle_); + setup.get().get(LaserAlphasHandle_); + setup.get().get(LinearCorrectionsHandle_); + + // + + auto const& ADCToGeVConstantProduct = ADCToGeVConstantHandle_->getProduct(ctx.stream()); + auto const& IntercalibConstantsProduct = IntercalibConstantsHandle_->getProduct(ctx.stream()); + auto const& ChannelStatusProduct = 
ChannelStatusHandle_->getProduct(ctx.stream()); + + auto const& LaserAPDPNRatiosProduct = LaserAPDPNRatiosHandle_->getProduct(ctx.stream()); + auto const& LaserAPDPNRatiosRefProduct = LaserAPDPNRatiosRefHandle_->getProduct(ctx.stream()); + auto const& LaserAlphasProduct = LaserAlphasHandle_->getProduct(ctx.stream()); + auto const& LinearCorrectionsProduct = LinearCorrectionsHandle_->getProduct(ctx.stream()); + // bundle up conditions - ecal::rechit::ConditionsProducts conditions { - ADCToGeVConstantProduct, - IntercalibConstantsProduct, - ChannelStatusProduct, - // - LaserAPDPNRatiosProduct, - LaserAPDPNRatiosRefProduct, - LaserAlphasProduct, - LinearCorrectionsProduct, - // - IntercalibConstantsHandle_->getOffset() - }; - - + ecal::rechit::ConditionsProducts conditions{ADCToGeVConstantProduct, + IntercalibConstantsProduct, + ChannelStatusProduct, + // + LaserAPDPNRatiosProduct, + LaserAPDPNRatiosRefProduct, + LaserAlphasProduct, + LinearCorrectionsProduct, + // + IntercalibConstantsHandle_->getOffset()}; + // // schedule algorithms // - + edm::TimeValue_t event_time = event.time().value(); - - - ecal::rechit::create_ecal_rehit( - inputDataGPU, - eventOutputDataGPU_, - // eventDataForScratchGPU_, - conditions, - configParameters_, - nchannelsEB, - event_time, - ctx.stream() - ); - -// cudaCheck(cudaGetLastError()); - - + + ecal::rechit::create_ecal_rehit(inputDataGPU, + eventOutputDataGPU_, + // eventDataForScratchGPU_, + conditions, + configParameters_, + nchannelsEB, + event_time, + ctx.stream()); + + // cudaCheck(cudaGetLastError()); } -void EcalRecHitProducerGPU::produce( - edm::Event& event, - edm::EventSetup const& setup) -{ +void EcalRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { //DurationMeasurer timer{std::string{"produce duration"}}; cms::cuda::ScopedContextProduce ctx{cudaState_}; - + // copy construct output collections // note, output collections do not own device memory! 
ecal::RecHit ebRecHits{eventOutputDataGPU_}; ecal::RecHit eeRecHits{eventOutputDataGPU_}; - - - + // set the size of eb and ee ebRecHits.size = neb_; eeRecHits.size = nee_; - + // shift ptrs for ee - eeRecHits.energy += neb_; - eeRecHits.chi2 += neb_; - eeRecHits.did += neb_; - eeRecHits.time += neb_; - eeRecHits.extra += neb_; + eeRecHits.energy += neb_; + eeRecHits.chi2 += neb_; + eeRecHits.did += neb_; + eeRecHits.time += neb_; + eeRecHits.extra += neb_; eeRecHits.flagBits += neb_; - + // put into the event ctx.emplace(event, recHitsTokenEB_, std::move(ebRecHits)); ctx.emplace(event, recHitsTokenEE_, std::move(eeRecHits)); - } - - - DEFINE_FWK_MODULE(EcalRecHitProducerGPU); - From 55c10ec10564c2cd9a6649e7d8697e2de4c09d9e Mon Sep 17 00:00:00 2001 From: amassiro Date: Wed, 27 May 2020 12:23:10 +0200 Subject: [PATCH 18/30] change from CUDAHostAllocator to HostAllocator --- .../EcalRecHitSoA/interface/EcalRecHit_soa.h | 2 +- .../interface/EcalIntercalibConstantsGPU.h | 2 +- .../interface/EcalLaserAPDPNRatiosGPU.h | 14 +++++++------- .../interface/EcalLaserAPDPNRatiosRefGPU.h | 2 +- .../EcalRecAlgos/interface/EcalLaserAlphasGPU.h | 2 +- .../interface/EcalLinearCorrectionsGPU.h | 14 +++++++------- .../interface/EcalRechitADCToGeVConstantGPU.h | 4 ++-- .../interface/EcalRechitChannelStatusGPU.h | 4 ++-- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h index 8379dec5c81ad..a5f73d2166f7a 100644 --- a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h @@ -5,7 +5,7 @@ #include #include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" // needed for "soa" definition #include 
"CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h index c59527a6d9f5a..3bbcdbd04e385 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalIntercalibConstantsGPU.h @@ -4,7 +4,7 @@ #include "CondFormats/EcalObjects/interface/EcalIntercalibConstants.h" #ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h index 9b87c3228e5c7..633238234e086 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosGPU.h @@ -4,7 +4,7 @@ #include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatios.h" #ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif @@ -37,13 +37,13 @@ class EcalLaserAPDPNRatiosGPU { private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee - std::vector > p1_; - std::vector > p2_; - std::vector > p3_; + std::vector > p1_; + std::vector > p2_; + std::vector > p3_; - std::vector > t1_; - std::vector > t2_; - std::vector > t3_; + std::vector > t1_; + std::vector > t2_; + std::vector > t3_; cms::cuda::ESProduct product_; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h index 6e48d50f217f3..08b2a2b5047dc 100644 --- 
a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAPDPNRatiosRefGPU.h @@ -4,7 +4,7 @@ #include "CondFormats/EcalObjects/interface/EcalLaserAPDPNRatiosRef.h" #ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h index d787c5700cd7e..71af7753933f6 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLaserAlphasGPU.h @@ -4,7 +4,7 @@ #include "CondFormats/EcalObjects/interface/EcalLaserAlphas.h" #ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h index f2b395f5660fa..62691e9c4ef8c 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalLinearCorrectionsGPU.h @@ -4,7 +4,7 @@ #include "CondFormats/EcalObjects/interface/EcalLinearCorrections.h" #ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif @@ -37,13 +37,13 @@ class EcalLinearCorrectionsGPU { private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee - std::vector> p1_; - std::vector> p2_; - std::vector> p3_; + std::vector> p1_; + std::vector> p2_; + std::vector> p3_; - 
std::vector> t1_; - std::vector> t2_; - std::vector> t3_; + std::vector> t1_; + std::vector> t2_; + std::vector> t3_; cms::cuda::ESProduct product_; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h index 3838a757cc2e1..92441ae4ae703 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitADCToGeVConstantGPU.h @@ -4,7 +4,7 @@ #include "CondFormats/EcalObjects/interface/EcalADCToGeVConstant.h" #ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif @@ -32,7 +32,7 @@ class EcalRechitADCToGeVConstantGPU { private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee - std::vector> adc2gev_; + std::vector> adc2gev_; cms::cuda::ESProduct product_; diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h index bf3f0f600224e..f425293a5488d 100644 --- a/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h +++ b/RecoLocalCalo/EcalRecAlgos/interface/EcalRechitChannelStatusGPU.h @@ -4,7 +4,7 @@ #include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" #ifndef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDACore/interface/ESProduct.h" #endif @@ -32,7 +32,7 @@ class EcalRechitChannelStatusGPU { private: // in the future, we need to arrange so to avoid this copy on the host // store eb first then ee - std::vector> status_; + std::vector> status_; cms::cuda::ESProduct product_; From 5dd85ff28c3aeec12aff4fe69cd873125f0ce745 Mon Sep 17 00:00:00 
2001 From: amassiro Date: Fri, 29 May 2020 18:05:23 +0200 Subject: [PATCH 19/30] remove message logger not needed --- RecoLocalCalo/EcalRecProducers/BuildFile.xml | 1 - RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml | 1 - 2 files changed, 2 deletions(-) diff --git a/RecoLocalCalo/EcalRecProducers/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/BuildFile.xml index abbae509cdab0..aa19516964fd9 100644 --- a/RecoLocalCalo/EcalRecProducers/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/BuildFile.xml @@ -3,7 +3,6 @@ - diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml index ca6b19c2ddd23..3b1d2c0cf159d 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml @@ -15,7 +15,6 @@ - From 34f8084f4a84a118195ed88088db930876fa8e83 Mon Sep 17 00:00:00 2001 From: amassiro Date: Thu, 4 Jun 2020 17:43:20 +0200 Subject: [PATCH 20/30] suggestions from PR implemented --- ...eEcalMultifitResultsGpuValidationPlots.cpp | 52 ++++++++---------- .../plugins/EcalCPURecHitProducer.cc | 11 ++-- .../plugins/EcalRecHitConvertGPU2CPUFormat.cc | 54 +++++++++---------- 3 files changed, 51 insertions(+), 66 deletions(-) diff --git a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp index 1cf7c9d706317..8ddc5f9c9c028 100644 --- a/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp +++ b/RecoLocalCalo/EcalRecAlgos/bin/makeEcalMultifitResultsGpuValidationPlots.cpp @@ -232,7 +232,7 @@ int main(int argc, char *argv[]) { if (chi2_cpu > 0) hChi2EBGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu); - if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) { + if (std::abs(chi2_gpu / chi2_cpu - 1) > 0.05 || std::abs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) { std::cout << " ---- EB " << std::endl; std::cout << " 
eventid = " << ie << " xtal = " << i << std::endl; std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; @@ -296,7 +296,7 @@ int main(int argc, char *argv[]) { if (chi2_cpu > 0) hChi2EEGPUCPUratio->Fill((float)chi2_gpu / chi2_cpu); - if (fabs(chi2_gpu / chi2_cpu - 1) > 0.05 || fabs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) { + if (std::abs(chi2_gpu / chi2_cpu - 1) > 0.05 || std::abs(soi_amp_gpu / soi_amp_cpu - 1) > 0.05) { std::cout << " ---- EE " << std::endl; std::cout << " eventid = " << ie << " xtal = " << i << std::endl; std::cout << " chi2_gpu = " << chi2_gpu << " chi2_cpu = " << chi2_cpu << std::endl; @@ -332,12 +332,9 @@ int main(int argc, char *argv[]) { } { - // TCanvas c("plots", "plots", 4200, 6200); TCanvas c("plots", "plots", 1750, 860); - // c.Divide(2, 3); c.Divide(3, 2); - // c.cd(1); c.cd(1); { gPad->SetLogy(); @@ -354,7 +351,7 @@ int main(int argc, char *argv[]) { stats->SetY2NDC(y1); stats->SetY1NDC(y1 - (y2 - y1)); } - // c.cd(2); + c.cd(4); { gPad->SetLogy(); @@ -371,21 +368,21 @@ int main(int argc, char *argv[]) { stats->SetY2NDC(y1); stats->SetY1NDC(y1 - (y2 - y1)); } - // c.cd(3); + c.cd(2); gPad->SetGrid(); hSOIAmplitudesEBGPUvsCPU->Draw("COLZ"); - // c.cd(4); + c.cd(5); gPad->SetGrid(); hSOIAmplitudesEEGPUvsCPU->Draw("COLZ"); - // c.cd(5); + c.cd(3); - // hSOIAmplitudesEBdeltavsCPU->Draw("COLZ"); + hSOIAmplitudesEBGPUCPUratio->Draw(""); - // c.cd(6); + c.cd(6); - // hSOIAmplitudesEEdeltavsCPU->Draw("COLZ"); + hSOIAmplitudesEEGPUCPUratio->Draw(""); c.SaveAs("ecal-amplitudes.root"); @@ -393,7 +390,6 @@ int main(int argc, char *argv[]) { // chi2 - // c.cd(1); c.cd(1); { gPad->SetLogy(); @@ -410,7 +406,7 @@ int main(int argc, char *argv[]) { stats->SetY2NDC(y1); stats->SetY1NDC(y1 - (y2 - y1)); } - // c.cd(2); + c.cd(4); { gPad->SetLogy(); @@ -427,21 +423,21 @@ int main(int argc, char *argv[]) { stats->SetY2NDC(y1); stats->SetY1NDC(y1 - (y2 - y1)); } - // c.cd(3); + c.cd(2); gPad->SetGrid(); 
hChi2EBGPUvsCPU->Draw("COLZ"); - // c.cd(4); + c.cd(5); gPad->SetGrid(); hChi2EEGPUvsCPU->Draw("COLZ"); - // c.cd(5); + c.cd(3); - // hChi2EBdeltavsCPU->Draw("COLZ"); + hChi2EBGPUCPUratio->Draw(""); - // c.cd(6); + c.cd(6); - // hChi2EEdeltavsCPU->Draw("COLZ"); + hChi2EEGPUCPUratio->Draw(""); c.SaveAs("ecal-chi2.root"); @@ -449,7 +445,7 @@ int main(int argc, char *argv[]) { // flags - // c.cd(1); + c.cd(1); { gPad->SetLogy(); @@ -466,7 +462,7 @@ int main(int argc, char *argv[]) { stats->SetY2NDC(y1); stats->SetY1NDC(y1 - (y2 - y1)); } - // c.cd(2); + c.cd(4); { gPad->SetLogy(); @@ -483,22 +479,20 @@ int main(int argc, char *argv[]) { stats->SetY2NDC(y1); stats->SetY1NDC(y1 - (y2 - y1)); } - // c.cd(3); + c.cd(2); gPad->SetGrid(); hFlagsEBGPUvsCPU->Draw("COLZ"); - // c.cd(4); + c.cd(5); gPad->SetGrid(); hFlagsEEGPUvsCPU->Draw("COLZ"); - // c.cd(5); + c.cd(3); - // hFlagsEBdeltavsCPU->Draw("COLZ"); hFlagsEBGPUCPUratio->Draw(""); - // c.cd(6); + c.cd(6); - // hFlagsEEdeltavsCPU->Draw("COLZ"); hFlagsEEGPUCPUratio->Draw(""); c.SaveAs("ecal-flags.root"); @@ -547,13 +541,11 @@ int main(int argc, char *argv[]) { cRechits.cd(3); { gPad->SetLogy(); - //hRechitsEBdeltavsCPU->Draw("COLZ"); hRechitsEBGPUCPUratio->Draw(""); } cRechits.cd(6); { gPad->SetLogy(); - //hRechitsEEdeltavsCPU->Draw("COLZ"); hRechitsEEGPUCPUratio->Draw(""); } cRechits.SaveAs("ecal-rechits.root"); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc index 8c5e5c0c9783d..8e1b4d399e0c7 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc @@ -46,8 +46,7 @@ void EcalCPURecHitProducer::fillDescriptions(edm::ConfigurationDescriptions& con desc.add("recHitsOutLabelEE", "EcalRecHitsEE"); desc.add("containsTimingInformation", false); - std::string label = "ecalCPURecHitProducer"; - confDesc.add(label, desc); + 
confDesc.addWithDefaultLabel(desc); } EcalCPURecHitProducer::EcalCPURecHitProducer(const edm::ParameterSet& ps) @@ -151,13 +150,9 @@ void EcalCPURecHitProducer::acquire(edm::Event const& event, } void EcalCPURecHitProducer::produce(edm::Event& event, edm::EventSetup const& setup) { - // tmp vectors - auto recHitsOutEB = std::make_unique>(std::move(recHitsEB_)); - auto recHitsOutEE = std::make_unique>(std::move(recHitsEE_)); - // put into event - event.put(recHitsOutEBToken_, std::move(recHitsOutEB)); - event.put(recHitsOutEEToken_, std::move(recHitsOutEE)); + event.emplace(recHitsOutEBToken_, std::move(recHitsEB_)); + event.emplace(recHitsOutEEToken_, std::move(recHitsEE_)); } DEFINE_FWK_MODULE(EcalCPURecHitProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc index 548bc812ffa2e..151762c6b63d3 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitConvertGPU2CPUFormat.cc @@ -40,8 +40,7 @@ void EcalRecHitConvertGPU2CPUFormat::fillDescriptions(edm::ConfigurationDescript desc.add("recHitsLabelCPUEB", "EcalRecHitsEB"); desc.add("recHitsLabelCPUEE", "EcalRecHitsEE"); - std::string label = "ecalRecHitConvertGPU2CPUFormat"; - confDesc.add(label, desc); + confDesc.addWithDefaultLabel(desc); } EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::ParameterSet& ps) @@ -56,65 +55,64 @@ EcalRecHitConvertGPU2CPUFormat::EcalRecHitConvertGPU2CPUFormat(const edm::Parame EcalRecHitConvertGPU2CPUFormat::~EcalRecHitConvertGPU2CPUFormat() {} void EcalRecHitConvertGPU2CPUFormat::produce(edm::Event& event, edm::EventSetup const& setup) { - edm::Handle hRecHitsGPUEB, hRecHitsGPUEE; - event.getByToken(recHitsGPUEB_, hRecHitsGPUEB); - event.getByToken(recHitsGPUEE_, hRecHitsGPUEE); - + auto const& hRecHitsGPUEB = event.get(recHitsGPUEB_); + auto const& hRecHitsGPUEE = 
event.get(recHitsGPUEE_); + auto recHitsCPUEB = std::make_unique(); auto recHitsCPUEE = std::make_unique(); - recHitsCPUEB->reserve(hRecHitsGPUEB->energy.size()); - recHitsCPUEE->reserve(hRecHitsGPUEE->energy.size()); + recHitsCPUEB->reserve(hRecHitsGPUEB.energy.size()); + recHitsCPUEE->reserve(hRecHitsGPUEE.energy.size()); // // explicit EcalRecHit(const DetId& id, float energy, float time, uint32_t extra = 0, uint32_t flagBits = 0): // - for (uint32_t i = 0; i < hRecHitsGPUEB->energy.size(); ++i) { + for (uint32_t i = 0; i < hRecHitsGPUEB.energy.size(); ++i) { // // Save only if energy is >= 0 ! // This is extremely important because the channels that were supposed // to be excluded get "-1" as energy // - if (hRecHitsGPUEB->energy[i] >= 0) { - recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB->did[i]}, - hRecHitsGPUEB->energy[i], - hRecHitsGPUEB->time[i], - hRecHitsGPUEB->extra[i], - hRecHitsGPUEB->flagBits[i]); + if (hRecHitsGPUEB.energy[i] >= 0) { + recHitsCPUEB->emplace_back(DetId{hRecHitsGPUEB.did[i]}, + hRecHitsGPUEB.energy[i], + hRecHitsGPUEB.time[i], + hRecHitsGPUEB.extra[i], + hRecHitsGPUEB.flagBits[i]); } - // std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB->energy.size() << "] = " << hRecHitsGPUEB->extra[i] << std::endl; + // std::cout << " EB :: extra [" << i << "::" << hRecHitsGPUEB.energy.size() << "] = " << hRecHitsGPUEB.extra[i] << std::endl; - // (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB->timeError[i]); + // (*recHitsCPUEB)[i].setJitterError(hRecHitsGPUEB.timeError[i]); // auto const offset = i * EcalDataFrame::MAXSAMPLES; // for (uint32_t sample=0; sampleenergysAll[offset + sample]); + // sample, hRecHitsGPUEB.energysAll[offset + sample]); } - for (uint32_t i = 0; i < hRecHitsGPUEE->energy.size(); ++i) { + for (uint32_t i = 0; i < hRecHitsGPUEE.energy.size(); ++i) { // // Save only if energy is >= 0 ! 
// This is extremely important because the channels that were supposed // to be excluded get "-1" as energy // - if (hRecHitsGPUEE->energy[i] >= 0) { - recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE->did[i]}, - hRecHitsGPUEE->energy[i], - hRecHitsGPUEE->time[i], - hRecHitsGPUEE->extra[i], - hRecHitsGPUEE->flagBits[i]); + if (hRecHitsGPUEE.energy[i] >= 0) { + recHitsCPUEE->emplace_back(DetId{hRecHitsGPUEE.did[i]}, + hRecHitsGPUEE.energy[i], + hRecHitsGPUEE.time[i], + hRecHitsGPUEE.extra[i], + hRecHitsGPUEE.flagBits[i]); } - // std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE->energy.size() << "] = " << hRecHitsGPUEE->extra[i] << std::endl; + // std::cout << " EE :: extra [" << i << "::" << hRecHitsGPUEE.energy.size() << "] = " << hRecHitsGPUEE.extra[i] << std::endl; - // (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE->timeError[i]); + // (*recHitsCPUEE)[i].setJitterError(hRecHitsGPUEE.timeError[i]); // auto const offset = i * EcalDataFrame::MAXSAMPLES; // for (uint32_t sample=0; sampleenergysAll[offset + sample]); + // sample, hRecHitsGPUEE.energysAll[offset + sample]); } event.put(std::move(recHitsCPUEB), recHitsLabelCPUEB_); From 02c4d1059dc9bc41611048a12591e0794bacf0aa Mon Sep 17 00:00:00 2001 From: amassiro Date: Thu, 4 Jun 2020 19:11:01 +0200 Subject: [PATCH 21/30] add cuda protection --- .../plugins/EcalRecHitProducerGPU.cc | 104 ++++++++++-------- 1 file changed, 60 insertions(+), 44 deletions(-) diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index 795a499987a06..1c80d648f1eed 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -44,6 +44,9 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + // configuration #include 
"CommonTools/Utils/interface/StringToEnumValue.h" @@ -176,12 +179,18 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { configParameters_.ChannelStatusToBeExcludedSize = v_chstatus_.size(); - cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size())); - cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded, - v_chstatus_.data(), - v_chstatus_.size() * sizeof(int), - cudaMemcpyHostToDevice)); - + + // call CUDA API functions only if CUDA is available + edm::Service cs; + if (cs and cs->enabled()) { + + cudaCheck(cudaMalloc((void**)&configParameters_.ChannelStatusToBeExcluded, sizeof(int) * v_chstatus_.size())); + cudaCheck(cudaMemcpy(configParameters_.ChannelStatusToBeExcluded, + v_chstatus_.data(), + v_chstatus_.size() * sizeof(int), + cudaMemcpyHostToDevice)); + } + // // https://github.com/cms-sw/cmssw/blob/266e21cfc9eb409b093e4cf064f4c0a24c6ac293/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitWorkerSimple.cc // @@ -208,33 +217,36 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { // v_DB_reco_flags_[recoflagbit] = dbstatuses; } - // actual values - cudaCheck( - cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size())); - - cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags, - expanded_v_DB_reco_flags_.data(), - expanded_v_DB_reco_flags_.size() * sizeof(int), - cudaMemcpyHostToDevice)); - - // sizes - cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags, - sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size())); - - cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags, - expanded_Sizes_v_DB_reco_flags_.data(), - expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - - // keys - cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags, - sizeof(uint32_t) * 
expanded_flagbit_v_DB_reco_flags_.size())); - - cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags, - expanded_flagbit_v_DB_reco_flags_.data(), - expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t), - cudaMemcpyHostToDevice)); - + // call CUDA API functions only if CUDA is available + if (cs and cs->enabled()) { + // actual values + cudaCheck( + cudaMalloc((void**)&configParameters_.expanded_v_DB_reco_flags, sizeof(int) * expanded_v_DB_reco_flags_.size())); + + cudaCheck(cudaMemcpy(configParameters_.expanded_v_DB_reco_flags, + expanded_v_DB_reco_flags_.data(), + expanded_v_DB_reco_flags_.size() * sizeof(int), + cudaMemcpyHostToDevice)); + + // sizes + cudaCheck(cudaMalloc((void**)&configParameters_.expanded_Sizes_v_DB_reco_flags, + sizeof(uint32_t) * expanded_Sizes_v_DB_reco_flags_.size())); + + cudaCheck(cudaMemcpy(configParameters_.expanded_Sizes_v_DB_reco_flags, + expanded_Sizes_v_DB_reco_flags_.data(), + expanded_Sizes_v_DB_reco_flags_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + + // keys + cudaCheck(cudaMalloc((void**)&configParameters_.expanded_flagbit_v_DB_reco_flags, + sizeof(uint32_t) * expanded_flagbit_v_DB_reco_flags_.size())); + + cudaCheck(cudaMemcpy(configParameters_.expanded_flagbit_v_DB_reco_flags, + expanded_flagbit_v_DB_reco_flags_.data(), + expanded_flagbit_v_DB_reco_flags_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); + } + configParameters_.expanded_v_DB_reco_flagsSize = expanded_flagbit_v_DB_reco_flags_.size(); flagmask_ = 0; @@ -258,16 +270,20 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) { } EcalRecHitProducerGPU::~EcalRecHitProducerGPU() { - // free event ouput data - eventOutputDataGPU_.deallocate(configParameters_); - - // FIXME AM: do I need to do this? - // Or can I do it as part of "deallocate" ? 
- cudaCheck(cudaFree(configParameters_.ChannelStatusToBeExcluded)); - - cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags)); - cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags)); - cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags)); + + edm::Service cs; + if (cs and cs->enabled()) { + // free event ouput data + eventOutputDataGPU_.deallocate(configParameters_); + + // FIXME AM: do I need to do this? + // Or can I do it as part of "deallocate" ? + cudaCheck(cudaFree(configParameters_.ChannelStatusToBeExcluded)); + + cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags)); + cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags)); + cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags)); + } } void EcalRecHitProducerGPU::acquire(edm::Event const& event, @@ -343,7 +359,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event, event_time, ctx.stream()); - // cudaCheck(cudaGetLastError()); + cudaCheck(cudaGetLastError()); } void EcalRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { From aed840061df8a4da6b037ff93fbbc7fff4bd608b Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 5 Jun 2020 10:58:46 +0200 Subject: [PATCH 22/30] Update sequences for ECAL local reconstruction running on GPU --- .../EcalRawToDigi/python/ecalDigis_cff.py | 10 ++++---- .../python/ecalMultiFitUncalibRecHit_cff.py | 23 +++++++++++++++---- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py index 9f79d3e0dbcb4..f6b873704dcd8 100644 --- a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py +++ b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py @@ -18,11 +18,11 @@ # copy the digi from the GPU to the CPU and convert to legacy format from EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi import ecalCPUDigisProducer as _ecalCPUDigisProducer -_gpu_ecalDigis = 
_ecalCPUDigisProducer.clone( - digisInLabelEB = 'ecalDigisGPU:ebDigisGPU', - digisInLabelEE = 'ecalDigisGPU:eeDigisGPU', - produceDummyIntegrityCollections = True, +_ecalDigis_gpu = _ecalCPUDigisProducer.clone( + digisInLabelEB = cms.InputTag('ecalDigisGPU', 'ebDigisGPU'), + digisInLabelEE = cms.InputTag('ecalDigisGPU', 'eeDigisGPU'), + produceDummyIntegrityCollections = True ) -gpu.toReplaceWith(ecalDigis, _gpu_ecalDigis) +gpu.toReplaceWith(ecalDigis, _ecalDigis_gpu) gpu.toReplaceWith(ecalDigisTask, cms.Task(ecalElectronicsMappingGPUESProducer, ecalDigisGPU, ecalDigis)) diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py index 829c1b1c9468e..cbf220323df78 100644 --- a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py +++ b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py @@ -29,12 +29,27 @@ recHitsInLabelEE = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEE'), ) -# convert the uncalibrated rechits legacy format +# convert the uncalibrated rechits from SoA to legacy format from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitConvertGPU2CPUFormat_cfi import ecalUncalibRecHitConvertGPU2CPUFormat as _ecalUncalibRecHitConvertGPU2CPUFormat -_gpu_ecalMultiFitUncalibRecHit = _ecalUncalibRecHitConvertGPU2CPUFormat.clone( +_ecalMultiFitUncalibRecHit_gpu = _ecalUncalibRecHitConvertGPU2CPUFormat.clone( recHitsLabelGPUEB = cms.InputTag('ecalMultiFitUncalibRecHitSoA', 'EcalUncalibRecHitsEB'), recHitsLabelGPUEE = cms.InputTag('ecalMultiFitUncalibRecHitSoA', 'EcalUncalibRecHitsEE'), ) -gpu.toReplaceWith(ecalMultiFitUncalibRecHit, _gpu_ecalMultiFitUncalibRecHit) +gpu.toReplaceWith(ecalMultiFitUncalibRecHit, _ecalMultiFitUncalibRecHit_gpu) -gpu.toReplaceWith(ecalMultiFitUncalibRecHitTask, cms.Task(ecalMultiFitUncalibRecHitGPU, ecalMultiFitUncalibRecHitSoA, ecalMultiFitUncalibRecHit)) 
+gpu.toReplaceWith(ecalMultiFitUncalibRecHitTask, cms.Task( + # ECAL conditions used by the multifit running on GPU + ecalPedestalsGPUESProducer, + ecalGainRatiosGPUESProducer, + ecalPulseShapesGPUESProducer, + ecalPulseCovariancesGPUESProducer, + ecalSamplesCorrelationGPUESProducer, + ecalTimeBiasCorrectionsGPUESProducer, + ecalTimeCalibConstantsGPUESProducer, + # ECAL multifit running on GPU + ecalMultiFitUncalibRecHitGPU, + # copy the uncalibrated rechits from GPU to CPU + ecalMultiFitUncalibRecHitSoA, + # convert the uncalibrated rechits from SoA to legacy format + ecalMultiFitUncalibRecHit, +)) From 92afb3e9dfc0a9977785c736604cb33b6628dd14 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 5 Jun 2020 10:59:26 +0200 Subject: [PATCH 23/30] Reconstruct ECAL rechits on GPUs --- .../python/ecalLocalRecoSequence_cff.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py b/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py index 06fecf4787baf..5895f78eccd55 100644 --- a/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py +++ b/RecoLocalCalo/Configuration/python/ecalLocalRecoSequence_cff.py @@ -1,4 +1,5 @@ import FWCore.ParameterSet.Config as cms +from Configuration.ProcessModifiers.gpu_cff import gpu # TPG condition needed by ecalRecHit producer if TT recovery is ON from RecoLocalCalo.EcalRecProducers.ecalRecHitTPGConditions_cff import * @@ -43,6 +44,57 @@ ecalOnlyLocalRecoSequence = cms.Sequence(ecalOnlyLocalRecoTask) +# ECAL rechit calibrations on GPU +from RecoLocalCalo.EcalRecProducers.ecalRechitADCToGeVConstantGPUESProducer_cfi import ecalRechitADCToGeVConstantGPUESProducer +from RecoLocalCalo.EcalRecProducers.ecalRechitChannelStatusGPUESProducer_cfi import ecalRechitChannelStatusGPUESProducer +from RecoLocalCalo.EcalRecProducers.ecalIntercalibConstantsGPUESProducer_cfi import ecalIntercalibConstantsGPUESProducer +from
RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosGPUESProducer_cfi import ecalLaserAPDPNRatiosGPUESProducer +from RecoLocalCalo.EcalRecProducers.ecalLaserAPDPNRatiosRefGPUESProducer_cfi import ecalLaserAPDPNRatiosRefGPUESProducer +from RecoLocalCalo.EcalRecProducers.ecalLaserAlphasGPUESProducer_cfi import ecalLaserAlphasGPUESProducer +from RecoLocalCalo.EcalRecProducers.ecalLinearCorrectionsGPUESProducer_cfi import ecalLinearCorrectionsGPUESProducer + +# ECAL rechits running on GPU +from RecoLocalCalo.EcalRecProducers.ecalRecHitGPU_cfi import ecalRecHitGPU as _ecalRecHitGPU +ecalRecHitGPU = _ecalRecHitGPU.clone( + uncalibrecHitsInLabelEB = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEB'), + uncalibrecHitsInLabelEE = cms.InputTag('ecalMultiFitUncalibRecHitGPU', 'EcalUncalibRecHitsEE') +) + +# copy the rechits from GPU to CPU +from RecoLocalCalo.EcalRecProducers.ecalCPURecHitProducer_cfi import ecalCPURecHitProducer as _ecalCPURecHitProducer +ecalRecHitSoA = _ecalCPURecHitProducer.clone( + recHitsInLabelEB = cms.InputTag('ecalRecHitGPU', 'EcalRecHitsEB'), + recHitsInLabelEE = cms.InputTag('ecalRecHitGPU', 'EcalRecHitsEE') +) + +# convert the rechits from SoA to legacy format +from RecoLocalCalo.EcalRecProducers.ecalRecHitConvertGPU2CPUFormat_cfi import ecalRecHitConvertGPU2CPUFormat as _ecalRecHitConvertGPU2CPUFormat +_ecalRecHit_gpu = _ecalRecHitConvertGPU2CPUFormat.clone( + recHitsLabelGPUEB = cms.InputTag('ecalRecHitSoA', 'EcalRecHitsEB'), + recHitsLabelGPUEE = cms.InputTag('ecalRecHitSoA', 'EcalRecHitsEE') +) +gpu.toReplaceWith(ecalRecHit, _ecalRecHit_gpu) + +# ECAL reconstruction on GPU +gpu.toReplaceWith(ecalRecHitNoTPTask, cms.Task( + # ECAL rechit calibrations on GPU + ecalRechitADCToGeVConstantGPUESProducer, + ecalRechitChannelStatusGPUESProducer, + ecalIntercalibConstantsGPUESProducer, + ecalLaserAPDPNRatiosGPUESProducer, + ecalLaserAPDPNRatiosRefGPUESProducer, + ecalLaserAlphasGPUESProducer, + ecalLinearCorrectionsGPUESProducer, + # 
ECAL rechits running on GPU + ecalRecHitGPU, + # copy the rechits from GPU to CPU + ecalRecHitSoA, + # convert the rechits from SoA to legacy format + ecalRecHit, + # ECAL preshower rechit legacy module + ecalPreshowerRecHit +)) + # Phase 2 modifications from RecoLocalCalo.EcalRecProducers.ecalDetailedTimeRecHit_cfi import * _phase2_timing_ecalRecHitTask = cms.Task( ecalRecHitTask.copy() , ecalDetailedTimeRecHit ) From e94bc013f7ee861c6d27e9e639363eda65501f97 Mon Sep 17 00:00:00 2001 From: amassiro Date: Mon, 8 Jun 2020 14:59:46 +0200 Subject: [PATCH 24/30] fixes of the previous PR after central validation --- .../EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc | 4 ++++ .../EcalRecAlgos/src/EcalRecHitBuilderKernels.cu | 12 +++++++++--- .../plugins/EcalRecHitProducerGPU.cc | 4 ++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc index 20946028aba90..0af2a9044ab65 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalLinearCorrectionsGPU.cc @@ -38,6 +38,10 @@ EcalLinearCorrectionsGPU::Product::~Product() { // deallocation cudaCheck(cudaFree(p1)); cudaCheck(cudaFree(p2)); + cudaCheck(cudaFree(p3)); + cudaCheck(cudaFree(t1)); + cudaCheck(cudaFree(t2)); + cudaCheck(cudaFree(t3)); } EcalLinearCorrectionsGPU::Product const& EcalLinearCorrectionsGPU::getProduct(cudaStream_t cudaStream) const { diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index 54c376214c4c6..904c751de460a 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -286,7 +286,10 @@ namespace ecal { // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat" // energy[ch] = -1; //---- AM: default, un-physical, ok - + chi2[ch] = chi2_in[inputCh]; + 
extra[ch] = 0; + + bool skip_this_channel = false; // static const int chStatusMask = 0x1F; // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same @@ -294,10 +297,12 @@ namespace ecal { if (ChannelStatusToBeExcludedSize != 0) { for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) { if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) { - return; + skip_this_channel = true; } } } + + if (skip_this_channel) continue; // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word @@ -336,7 +341,8 @@ namespace ecal { } if ((flagmask & temporary_flagBits) && killDeadChannels) { - return; + continue; + // skip this channel } // diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index 1c80d648f1eed..70c0af0e01821 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -283,7 +283,7 @@ EcalRecHitProducerGPU::~EcalRecHitProducerGPU() { cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags)); cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags)); cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags)); - } + } } void EcalRecHitProducerGPU::acquire(edm::Event const& event, @@ -302,7 +302,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event, neb_ = ebUncalibRecHits.size; nee_ = eeUncalibRecHits.size; - // std::cout << " [EcalRecHitProducerGPU::acquire] neb_:nee_ = " << neb_ << " : " << nee_ << std::endl; + std::cout << " [EcalRecHitProducerGPU::acquire] neb_:nee_ = " << neb_ << " : " << nee_ << std::endl; int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE From 907a3bb265c30d332432084a72b1748f2b51971c Mon Sep 17 00:00:00 2001 From: amassiro Date: Mon, 8 Jun 2020 15:04:15 +0200 Subject: [PATCH 25/30] ops, a cout 
slipped through --- RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index 70c0af0e01821..ac09cb484288b 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -302,7 +302,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event, neb_ = ebUncalibRecHits.size; nee_ = eeUncalibRecHits.size; - std::cout << " [EcalRecHitProducerGPU::acquire] neb_:nee_ = " << neb_ << " : " << nee_ << std::endl; + // std::cout << " [EcalRecHitProducerGPU::acquire] neb_:nee_ = " << neb_ << " : " << nee_ << std::endl; int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE From ff30a39c4e0a33a06c838d771983fe10645e4f85 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 8 Jun 2020 16:21:04 +0200 Subject: [PATCH 26/30] Remove extra whitespace --- .../EcalRecProducers/plugins/EcalRecHitProducerGPU.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index ac09cb484288b..0a1260dffefd2 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -7,10 +7,6 @@ #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" -// -// -// - // format #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit_soa.h" @@ -283,7 +279,7 @@ EcalRecHitProducerGPU::~EcalRecHitProducerGPU() { cudaCheck(cudaFree(configParameters_.expanded_v_DB_reco_flags)); 
cudaCheck(cudaFree(configParameters_.expanded_Sizes_v_DB_reco_flags)); cudaCheck(cudaFree(configParameters_.expanded_flagbit_v_DB_reco_flags)); - } + } } void EcalRecHitProducerGPU::acquire(edm::Event const& event, @@ -321,8 +317,6 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event, setup.get().get(LaserAlphasHandle_); setup.get().get(LinearCorrectionsHandle_); - // - auto const& ADCToGeVConstantProduct = ADCToGeVConstantHandle_->getProduct(ctx.stream()); auto const& IntercalibConstantsProduct = IntercalibConstantsHandle_->getProduct(ctx.stream()); auto const& ChannelStatusProduct = ChannelStatusHandle_->getProduct(ctx.stream()); From 134ab3ae2bd8d4bcd9433cdd25f7a211ea4a275e Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 8 Jun 2020 16:24:52 +0200 Subject: [PATCH 27/30] Move skip_this_channel inside the if block --- RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index 904c751de460a..31ba3c0d3b5e4 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -289,21 +289,20 @@ namespace ecal { chi2[ch] = chi2_in[inputCh]; extra[ch] = 0; - bool skip_this_channel = false; - // static const int chStatusMask = 0x1F; // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask); if (ChannelStatusToBeExcludedSize != 0) { + bool skip_this_channel = false; for (int ich_to_check = 0; ich_to_check < ChannelStatusToBeExcludedSize; ich_to_check++) { if (ChannelStatusToBeExcluded[ich_to_check] == dbstatus) { skip_this_channel = true; + break; } } + if (skip_this_channel) continue; } - if (skip_this_channel) continue; - // Take our association map of dbstatuses-> recHit flagbits and return the 
apporpriate flagbit word // From a6a074f3ece52dfdec3cd5fb4b97c92256188910 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 8 Jun 2020 16:38:08 +0200 Subject: [PATCH 28/30] Avoid some repeated assignments --- .../src/EcalRecHitBuilderKernels.cu | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index 31ba3c0d3b5e4..5ab7f6226d1b2 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -286,10 +286,17 @@ namespace ecal { // Exploited later by the module "EcalRecHitConvertGPU2CPUFormat" // energy[ch] = -1; //---- AM: default, un-physical, ok - chi2[ch] = chi2_in[inputCh]; + + // truncate the chi2 + if (chi2_in[inputCh] > 64) + chi2[ch] = 64; + else + chi2[ch] = chi2_in[inputCh]; + + // default value for the "extra flags" extra[ch] = 0; - - static const int chStatusMask = 0x1F; + + static const int chStatusMask = 0x1f; // ChannelStatusToBeExcluded is a "int" then I put "dbstatus" to be the same int dbstatus = EcalChannelStatusCode_Code((status[hashedId]) & chStatusMask); if (ChannelStatusToBeExcludedSize != 0) { @@ -300,9 +307,12 @@ namespace ecal { break; } } - if (skip_this_channel) continue; + if (skip_this_channel) { + // skip this channel + continue; + } } - + // Take our association map of dbstatuses-> recHit flagbits and return the apporpriate flagbit word // @@ -340,8 +350,8 @@ namespace ecal { } if ((flagmask & temporary_flagBits) && killDeadChannels) { - continue; // skip this channel + continue; } // @@ -357,13 +367,7 @@ namespace ecal { // Time is not saved so far, FIXME // time[ch] = time_in[inputCh]; - if (chi2_in[inputCh] > 64) - chi2[ch] = 64; - else - chi2[ch] = chi2_in[inputCh]; - // NB: calculate the "flagBits extra" --> not really "flags", but actually an encoded version of energy uncertainty, time unc., ... 
- extra[ch] = 0; // // extra packing ... From 9da15f2d5a0720d6a0e65cf8780ed51d303d8414 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 10 Jun 2020 10:09:17 +0200 Subject: [PATCH 29/30] Silence warning about unused variable --- RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index 5ab7f6226d1b2..cc44b27ab224a 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -157,7 +157,7 @@ namespace ecal { ::ecal::reco::StorageScalarType const* amplitude = isEndcap ? amplitude_ee : amplitude_eb; - ::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb; + //::ecal::reco::StorageScalarType const* time_in = isEndcap ? time_ee : time_eb; ::ecal::reco::StorageScalarType const* chi2_in = isEndcap ? 
chi2_ee : chi2_eb; From dacaf2ead322bfe3b7c38aaea18a16b0ed79a07f Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 10 Jun 2020 10:14:52 +0200 Subject: [PATCH 30/30] Set flagBits for all channels --- .../EcalRecAlgos/src/EcalRecHitBuilderKernels.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu index cc44b27ab224a..114c56e8907f2 100644 --- a/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu +++ b/RecoLocalCalo/EcalRecAlgos/src/EcalRecHitBuilderKernels.cu @@ -293,7 +293,8 @@ namespace ecal { else chi2[ch] = chi2_in[inputCh]; - // default value for the "extra flags" + // default values for the flags + flagBits[ch] = 0; extra[ch] = 0; static const int chStatusMask = 0x1f; @@ -349,14 +350,13 @@ namespace ecal { flagbit_counter += 1; } + flagBits[ch] = temporary_flagBits; + if ((flagmask & temporary_flagBits) && killDeadChannels) { // skip this channel continue; } - // - flagBits[ch] = temporary_flagBits; - // // multiply the adc counts with factors to get GeV //