diff --git a/CUDADataFormats/HcalDigi/BuildFile.xml b/CUDADataFormats/HcalDigi/BuildFile.xml new file mode 100644 index 0000000000000..fb871f16b69f0 --- /dev/null +++ b/CUDADataFormats/HcalDigi/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h new file mode 100644 index 0000000000000..e2f4bf0848e94 --- /dev/null +++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h @@ -0,0 +1,160 @@ +#ifndef CUDADataFormats_HcalDigi_interface_DigiCollection_h +#define CUDADataFormats_HcalDigi_interface_DigiCollection_h + +#include "CUDADataFormats/CaloCommon/interface/Common.h" + +namespace hcal { + + // FLAVOR_HE_QIE11 = 1; Phase1 upgrade + struct Flavor1 { + static constexpr int WORDS_PER_SAMPLE = 1; + static constexpr int SAMPLES_PER_WORD = 1; + static constexpr int HEADER_WORDS = 1; + + static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); } + static constexpr uint8_t tdc(uint16_t const* const sample_start) { return (*sample_start >> 8) & 0x3f; } + static constexpr uint8_t soibit(uint16_t const* const sample_start) { return (*sample_start >> 14) & 0x1; } + }; + + // FLAVOR_HB_QIE11 = 3; Phase1 upgrade + struct Flavor3 { + static constexpr int WORDS_PER_SAMPLE = 1; + static constexpr int SAMPLES_PER_WORD = 1; + static constexpr int HEADER_WORDS = 1; + + static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); } + static constexpr uint8_t tdc(uint16_t const* const sample_start) { return ((*sample_start >> 8) & 0x3); } + static constexpr uint8_t soibit(uint16_t const* const sample_start) { return ((*sample_start >> 14) & 0x1); } + static constexpr uint8_t capid(uint16_t const* const sample_start) { return ((*sample_start >> 10) & 0x3); } + }; + + // FLAVOR_HB_QIE10 = 5; Phase0 + struct Flavor5 { + static constexpr float WORDS_PER_SAMPLE = 0.5; + static constexpr int SAMPLES_PER_WORD = 2; + static constexpr int HEADER_WORDS = 1; + + static constexpr uint8_t adc(uint16_t const* const sample_start, uint8_t const shifter) { + return ((*sample_start >> shifter * 8) & 0x7f); + } + }; + + template + constexpr uint8_t capid_for_sample(uint16_t const* const dfstart, uint32_t const sample) { + auto const capid_first = (*dfstart >> 8) & 0x3; + return (capid_first + sample) & 0x3; // same as % 4 + } + + template <> + constexpr uint8_t capid_for_sample(uint16_t const* const dfstart, uint32_t const sample) { + return Flavor3::capid(dfstart + Flavor3::HEADER_WORDS + sample * Flavor3::WORDS_PER_SAMPLE); + } + + template + constexpr uint8_t soibit_for_sample(uint16_t const* const dfstart, uint32_t const sample) { + return Flavor::soibit(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE); + } + + template + constexpr uint8_t adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) { + return Flavor::adc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE); + } + + template + constexpr uint8_t tdc_for_sample(uint16_t const* const dfstart, uint32_t const sample) { + return Flavor::tdc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE); + } + + template <> + constexpr uint8_t adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) { + // avoid using WORDS_PER_SAMPLE and simply shift + return Flavor5::adc(dfstart + Flavor5::HEADER_WORDS + (sample >> 1), sample % 2); + } + + template + constexpr uint32_t compute_stride(uint32_t const nsamples) { + return static_cast(nsamples * Flavor::WORDS_PER_SAMPLE) + Flavor::HEADER_WORDS; + } + + template + constexpr uint32_t compute_nsamples(uint32_t const nwords) { + if constexpr (Flavor::SAMPLES_PER_WORD >= 1) + return (nwords - Flavor::HEADER_WORDS) * Flavor::SAMPLES_PER_WORD; + else + return (nwords - Flavor::HEADER_WORDS) / Flavor::WORDS_PER_SAMPLE; + } + + // + template + struct DigiCollectionBase : public ::calo::common::AddSize { + DigiCollectionBase() = default; + DigiCollectionBase(DigiCollectionBase const&) = default; + DigiCollectionBase& operator=(DigiCollectionBase const&) = default; + + DigiCollectionBase(DigiCollectionBase&&) = default; + DigiCollectionBase& operator=(DigiCollectionBase&&) = default; + + template + typename std::enable_if::value, void>::type resize(std::size_t size) { + ids.resize(size); + data.resize(size * stride); + } + + template + typename std::enable_if::value, void>::type reserve(std::size_t size) { + ids.reserve(size); + data.reserve(size * stride); + } + + template + typename std::enable_if::value, void>::type clear() { + ids.clear(); + data.clear(); + } + + typename StoragePolicy::template StorageSelector::type ids; + typename StoragePolicy::template StorageSelector::type data; + uint32_t stride{0}; + }; + + template + struct DigiCollection : public DigiCollectionBase { + using DigiCollectionBase::DigiCollectionBase; + }; + + // NOTE: base ctors will not be available + template + struct DigiCollection : public DigiCollectionBase { + DigiCollection() = default; + + DigiCollection(DigiCollection const&) = default; + DigiCollection& operator=(DigiCollection const&) = default; + + DigiCollection(DigiCollection&&) = default; + DigiCollection& operator=(DigiCollection&&) = default; + + template + typename std::enable_if::value, void>::type resize(std::size_t size) { + DigiCollectionBase::resize(size); + npresamples.resize(size); + } + + template + typename std::enable_if::value, void>::type reserve(std::size_t size) { + DigiCollectionBase::reserve(size); + npresamples.reserve(size); + } + + template + typename std::enable_if::value, void>::type clear() { + DigiCollectionBase::clear(); + npresamples.clear(); + } + + // add npresamples member + typename StoragePolicy::template StorageSelector::type npresamples; + }; + +} // namespace hcal + +#endif // CUDADataFormats_HcalDigi_interface_DigiCollection_h diff --git a/CUDADataFormats/HcalDigi/src/classes.h b/CUDADataFormats/HcalDigi/src/classes.h new file mode 100644 index 0000000000000..8c4a20318928e --- /dev/null +++ b/CUDADataFormats/HcalDigi/src/classes.h @@ -0,0 +1,3 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h" +#include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/HcalDigi/src/classes_def.xml b/CUDADataFormats/HcalDigi/src/classes_def.xml new file mode 100644 index 0000000000000..71997eb59ba61 --- /dev/null +++ b/CUDADataFormats/HcalDigi/src/classes_def.xml @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/CUDADataFormats/HcalRecHitSoA/BuildFile.xml b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml new file mode 100644 index 0000000000000..245701de5fdb0 --- /dev/null +++ b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h new file mode 100644 index 0000000000000..424b2c0813b4c --- /dev/null +++ b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h @@ -0,0 +1,38 @@ +#ifndef CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h +#define CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h + +#include + +#include "CUDADataFormats/CaloCommon/interface/Common.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" + +namespace hcal { + + template + struct RecHitCollection : public ::calo::common::AddSize { + RecHitCollection() = default; + RecHitCollection(const RecHitCollection&) = default; + RecHitCollection& operator=(const RecHitCollection&) = default; + + RecHitCollection(RecHitCollection&&) = default; + RecHitCollection& operator=(RecHitCollection&&) = default; + + typename StoragePolicy::template StorageSelector::type energy; + typename StoragePolicy::template StorageSelector::type chi2; + typename StoragePolicy::template StorageSelector::type energyM0; + typename StoragePolicy::template StorageSelector::type timeM0; + typename StoragePolicy::template StorageSelector::type did; + + template + typename std::enable_if::value, void>::type resize(size_t size) { + energy.resize(size); + chi2.resize(size); + energyM0.resize(size); + timeM0.resize(size); + did.resize(size); + } + }; + +} // namespace hcal + +#endif // RecoLocalCalo_HcalRecAlgos_interface_RecHitCollection_h diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes.h b/CUDADataFormats/HcalRecHitSoA/src/classes.h new file mode 100644 index 0000000000000..a13782165c413 --- /dev/null +++ b/CUDADataFormats/HcalRecHitSoA/src/classes.h @@ -0,0 +1,3 @@ +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h" +#include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml new file mode 100644 index 0000000000000..71dd18a7daddb --- /dev/null +++ b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/EventFilter/HcalRawToDigi/bin/BuildFile.xml b/EventFilter/HcalRawToDigi/bin/BuildFile.xml new file mode 100644 index 0000000000000..7a24968df89c8 --- /dev/null +++ b/EventFilter/HcalRawToDigi/bin/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp new file mode 100644 index 0000000000000..039c38dd9df16 --- /dev/null +++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp @@ -0,0 +1,386 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h" +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h" + +#define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last) + +#define CREATE_HIST_2D(varname, nbins, first, last) \ + auto varname = new TH2D(#varname, #varname, nbins, first, last, nbins, first, last) + +QIE11DigiCollection filterQIE11(QIE11DigiCollection const& coll) { + QIE11DigiCollection out; + out.reserve(coll.size()); + + for (uint32_t i = 0; i < coll.size(); i++) { + auto const df = coll[i]; + auto const id = HcalDetId{df.id()}; + if (id.subdetId() != HcalEndcap) + continue; + + out.push_back(QIE11DataFrame{df}); + } + + return out; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "run with: ./ \n"; + exit(0); + } + + auto filterf01HE = [](QIE11DigiCollection const& coll) { + QIE11DigiCollection out{coll.samples(), coll.subdetId()}; + out.reserve(coll.size()); + + for (uint32_t i = 0; i < coll.size(); i++) { + auto const df = QIE11DataFrame{coll[i]}; + auto const id = HcalDetId{df.id()}; + if ((df.flavor() == 0 or df.flavor() == 1) and id.subdetId() == HcalEndcap) + out.push_back(df); + } + + return out; + }; + + auto filterf3HB = [](QIE11DigiCollection const& coll) { + QIE11DigiCollection out{coll.samples(), coll.subdetId()}; + out.reserve(coll.size()); + + for (uint32_t i = 0; i < coll.size(); i++) { + auto const df = QIE11DataFrame{coll[i]}; + auto const did = HcalDetId{df.id()}; + if (df.flavor() == 3 and did.subdetId() == HcalBarrel) + out.push_back(df); + } + + return out; + }; + + // branches to use + using Collectionf01 = + hcal::DigiCollection>; + using Collectionf5 = + hcal::DigiCollection>; + using Collectionf3 = + hcal::DigiCollection>; + edm::Wrapper* wgpuf01he = nullptr; + edm::Wrapper* wgpuf5hb = nullptr; + edm::Wrapper* wgpuf3hb = nullptr; + edm::Wrapper* wcpuf01he = nullptr; + edm::Wrapper* wcpuf5hb = nullptr; + + std::string inFileName{argv[1]}; + std::string outFileName{argv[2]}; + + // prep output + TFile rfout{outFileName.c_str(), "recreate"}; + + CREATE_HIST_1D(hADCf01HEGPU, 256, 0, 256); + CREATE_HIST_1D(hADCf01HECPU, 256, 0, 256); + CREATE_HIST_1D(hADCf5HBGPU, 128, 0, 128); + CREATE_HIST_1D(hADCf5HBCPU, 128, 0, 128); + CREATE_HIST_1D(hADCf3HBGPU, 256, 0, 256); + CREATE_HIST_1D(hADCf3HBCPU, 256, 0, 256); + CREATE_HIST_1D(hTDCf01HEGPU, 64, 0, 64); + CREATE_HIST_1D(hTDCf01HECPU, 64, 0, 64); + + CREATE_HIST_2D(hADCf01HEGPUvsCPU, 256, 0, 256); + CREATE_HIST_2D(hADCf3HBGPUvsCPU, 256, 0, 256); + CREATE_HIST_2D(hADCf5HBGPUvsCPU, 128, 0, 128); + CREATE_HIST_2D(hTDCf01HEGPUvsCPU, 64, 0, 64); + CREATE_HIST_2D(hTDCf3HBGPUvsCPU, 4, 0, 4); + + // prep input + TFile rfin{inFileName.c_str()}; + TTree* rt = (TTree*)rfin.Get("Events"); + rt->SetBranchAddress("QIE11DataFrameHcalDataFrameContainer_hcalDigis__RECO.", &wcpuf01he); + rt->SetBranchAddress("HBHEDataFramesSorted_hcalDigis__RECO.", &wcpuf5hb); + rt->SetBranchAddress( + "hcalFlavor5calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" + "f5HBDigis_RECO.", + &wgpuf5hb); + rt->SetBranchAddress( + "hcalFlavor1calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" + "f01HEDigis_RECO.", + &wgpuf01he); + rt->SetBranchAddress( + "hcalFlavor3calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" + "f3HBDigis_RECO.", + &wgpuf3hb); + + // accumulate + auto const nentries = rt->GetEntries(); + std::cout << ">>> nentries = " << nentries << std::endl; + for (int ie = 0; ie < nentries; ++ie) { + rt->GetEntry(ie); + + auto const& f01HEProduct = wgpuf01he->bareProduct(); + auto const& f5HBProduct = wgpuf5hb->bareProduct(); + auto const& f3HBProduct = wgpuf3hb->bareProduct(); + auto const& qie11Product = wcpuf01he->bareProduct(); + auto const qie11Filteredf01 = filterf01HE(qie11Product); + auto const qie11Filteredf3 = filterf3HB(qie11Product); + auto const& qie8Product = wcpuf5hb->bareProduct(); + + auto const ngpuf01he = f01HEProduct.ids.size(); + auto const ngpuf5hb = f5HBProduct.ids.size(); + auto const ngpuf3hb = f3HBProduct.ids.size(); + auto const ncpuf01he = qie11Filteredf01.size(); + auto const ncpuf5hb = qie8Product.size(); + auto const ncpuf3hb = qie11Filteredf3.size(); + + /* + printf("ngpuf01he = %u nqie11 = %u ncpuf01he = %u ngpuf5hb = %u ncpuf5hb = %u\n", + f01HEProduct.size(), qie11Product.size(), qie11Filtered.size(), + f5HBProduct.size(), + static_cast(qie8Product.size())); + */ + + if (ngpuf01he != ncpuf01he) { + std::cerr << "*** mismatch in number of flavor 01 digis for event " << ie << std::endl + << ">>> ngpuf01he = " << ngpuf01he << std::endl + << ">>> ncpuf01he = " << ncpuf01he << std::endl; + } + + { + auto const& idsgpu = f01HEProduct.ids; + auto const& datagpu = f01HEProduct.data; + + for (uint32_t ich = 0; ich < ncpuf01he; ich++) { + auto const cpudf = QIE11DataFrame{qie11Filteredf01[ich]}; + auto const cpuid = cpudf.id(); + auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid); + + if (iter2idgpu == idsgpu.end()) { + std::cerr << "missing " << HcalDetId{cpuid} << std::endl; + continue; + } + + // FIXME: cna fail... + assert(*iter2idgpu == cpuid); + + auto const ptrdiff = iter2idgpu - idsgpu.begin(); + auto const nsamples_gpu = hcal::compute_nsamples(f01HEProduct.stride); + auto const nsamples_cpu = qie11Filteredf01.samples(); + assert(static_cast(nsamples_cpu) == nsamples_gpu); + + uint32_t ichgpu = ptrdiff; + uint32_t offset = ichgpu * f01HEProduct.stride; + uint16_t const* df_start = datagpu.data() + offset; + for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) { + auto const cpuadc = cpudf[sample].adc(); + auto const gpuadc = hcal::adc_for_sample(df_start, sample); + auto const cputdc = cpudf[sample].tdc(); + auto const gputdc = hcal::tdc_for_sample(df_start, sample); + auto const cpucapid = cpudf[sample].capid(); + auto const gpucapid = hcal::capid_for_sample(df_start, sample); + + hADCf01HEGPU->Fill(gpuadc); + hADCf01HECPU->Fill(cpuadc); + hTDCf01HEGPU->Fill(gputdc); + hTDCf01HECPU->Fill(cputdc); + hADCf01HEGPUvsCPU->Fill(cpuadc, gpuadc); + hTDCf01HEGPUvsCPU->Fill(cputdc, gputdc); + + // At RAW Decoding level there must not be any mistmatches + // in the adc values at all! + assert(static_cast(cpuadc) == gpuadc); + assert(static_cast(cputdc) == gputdc); + assert(static_cast(cpucapid) == gpucapid); + } + } + } + + if (ngpuf3hb != ncpuf3hb) { + std::cerr << "*** mismatch in number of flavor 3 digis for event " << ie << std::endl + << ">>> ngpuf01he = " << ngpuf3hb << std::endl + << ">>> ncpuf01he = " << ncpuf3hb << std::endl; + } + + { + auto const& idsgpu = f3HBProduct.ids; + auto const& datagpu = f3HBProduct.data; + + for (uint32_t ich = 0; ich < ncpuf3hb; ich++) { + auto const cpudf = QIE11DataFrame{qie11Filteredf3[ich]}; + auto const cpuid = cpudf.id(); + auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid); + + if (iter2idgpu == idsgpu.end()) { + std::cerr << "missing " << HcalDetId{cpuid} << std::endl; + continue; + } + + // FIXME: cna fail... + assert(*iter2idgpu == cpuid); + + auto const ptrdiff = iter2idgpu - idsgpu.begin(); + auto const nsamples_gpu = hcal::compute_nsamples(f3HBProduct.stride); + auto const nsamples_cpu = qie11Filteredf3.samples(); + assert(static_cast(nsamples_cpu) == nsamples_gpu); + + uint32_t ichgpu = ptrdiff; + uint32_t offset = ichgpu * f3HBProduct.stride; + uint16_t const* df_start = datagpu.data() + offset; + for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) { + auto const cpuadc = cpudf[sample].adc(); + auto const gpuadc = hcal::adc_for_sample(df_start, sample); + auto const cputdc = cpudf[sample].tdc(); + auto const gputdc = hcal::tdc_for_sample(df_start, sample); + + hADCf3HBGPU->Fill(gpuadc); + hADCf3HBCPU->Fill(cpuadc); + hADCf3HBGPUvsCPU->Fill(cpuadc, gpuadc); + hTDCf3HBGPUvsCPU->Fill(cputdc, gputdc); + + // At RAW Decoding level there must not be any mistmatches + // in the adc values at all! + assert(static_cast(cpuadc) == gpuadc); + assert(static_cast(cputdc) == gputdc); + } + } + } + + if (ngpuf5hb != ncpuf5hb) { + std::cerr << "*** mismatch in number of flavor 5 digis for event " << ie << std::endl + << ">>> ngpuf5hb = " << ngpuf5hb << std::endl + << ">>> ncpuf5hb = " << ncpuf5hb << std::endl; + } + + { + auto const& idsgpu = f5HBProduct.ids; + auto const& datagpu = f5HBProduct.data; + for (uint32_t i = 0; i < ncpuf5hb; i++) { + auto const cpudf = qie8Product[i]; + auto const cpuid = cpudf.id().rawId(); + auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid); + if (iter2idgpu == idsgpu.end()) { + std::cerr << "missing " << HcalDetId{cpuid} << std::endl; + continue; + } + + assert(*iter2idgpu == cpuid); + + auto const ptrdiff = iter2idgpu - idsgpu.begin(); + auto const nsamples_gpu = hcal::compute_nsamples(f5HBProduct.stride); + auto const nsamples_cpu = qie8Product[0].size(); + assert(static_cast(nsamples_cpu) == nsamples_gpu); + + uint32_t offset = ptrdiff * f5HBProduct.stride; + uint16_t const* df_start = datagpu.data() + offset; + for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) { + auto const cpuadc = cpudf.sample(sample).adc(); + auto const gpuadc = hcal::adc_for_sample(df_start, sample); + auto const cpucapid = cpudf.sample(sample).capid(); + auto const gpucapid = hcal::capid_for_sample(df_start, sample); + + hADCf5HBGPU->Fill(gpuadc); + hADCf5HBCPU->Fill(cpuadc); + hADCf5HBGPUvsCPU->Fill(cpuadc, gpuadc); + + // the must for us at RAW Decoding stage + assert(static_cast(cpuadc) == gpuadc); + assert(static_cast(cpucapid) == gpucapid); + } + } + } + } + + { + TCanvas c{"plots", "plots", 4200, 6200}; + c.Divide(3, 3); + c.cd(1); + { + gPad->SetLogy(); + hADCf01HECPU->SetLineColor(kBlack); + hADCf01HECPU->SetLineWidth(1.); + hADCf01HECPU->Draw(""); + hADCf01HEGPU->SetLineColor(kBlue); + hADCf01HEGPU->SetLineWidth(1.); + hADCf01HEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hADCf01HEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(2); + { + gPad->SetLogy(); + hADCf5HBCPU->SetLineColor(kBlack); + hADCf5HBCPU->SetLineWidth(1.); + hADCf5HBCPU->Draw(""); + hADCf5HBGPU->SetLineColor(kBlue); + hADCf5HBGPU->SetLineWidth(1.); + hADCf5HBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hADCf5HBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(3); + { + gPad->SetLogy(); + hADCf3HBCPU->SetLineColor(kBlack); + hADCf3HBCPU->SetLineWidth(1.); + hADCf3HBCPU->Draw(""); + hADCf3HBGPU->SetLineColor(kBlue); + hADCf3HBGPU->SetLineWidth(1.); + hADCf3HBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hADCf3HBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(4); + hADCf01HEGPUvsCPU->Draw("colz"); + c.cd(5); + hADCf5HBGPUvsCPU->Draw("colz"); + c.cd(6); + hADCf3HBGPUvsCPU->Draw("colz"); + c.cd(7); + { + gPad->SetLogy(); + hTDCf01HECPU->SetLineColor(kBlack); + hTDCf01HECPU->SetLineWidth(1.); + hTDCf01HECPU->Draw(""); + hTDCf01HEGPU->SetLineColor(kBlue); + hTDCf01HEGPU->SetLineWidth(1.); + hTDCf01HEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hTDCf01HEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(8); + hTDCf01HEGPUvsCPU->Draw("colz"); + c.cd(9); + hTDCf3HBGPUvsCPU->Draw("colz"); + + c.SaveAs("plots.pdf"); + } + + rfin.Close(); + rfout.Write(); + rfout.Close(); +} diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml index ccf6a061119c2..3077a68a665e4 100644 --- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml @@ -1,16 +1,27 @@ + + + + + - - - + - - - + + + + + + + + + + + diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h new file mode 100644 index 0000000000000..9903b77efb341 --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h @@ -0,0 +1,86 @@ +#ifndef EventFilter_HcalRawToDigi_interface_DeclsForKernels_h +#define EventFilter_HcalRawToDigi_interface_DeclsForKernels_h + +#include + +#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +#include "ElectronicsMappingGPU.h" + +namespace hcal { + namespace raw { + + constexpr int32_t empty_event_size = 32; + constexpr uint32_t utca_nfeds_max = 50; + constexpr uint32_t nbytes_per_fed_max = 10 * 1024; + + // each collection corresponds to a particular flavor with a certain number of + // samples per digi + constexpr uint32_t numOutputCollections = 3; + constexpr uint8_t OutputF01HE = 0; + constexpr uint8_t OutputF5HB = 1; + constexpr uint8_t OutputF3HB = 2; + + struct ConfigurationParameters { + uint32_t maxChannelsF01HE; + uint32_t maxChannelsF5HB; + uint32_t maxChannelsF3HB; + uint32_t nsamplesF01HE; + uint32_t nsamplesF5HB; + uint32_t nsamplesF3HB; + }; + + struct InputDataCPU { + cms::cuda::host::unique_ptr data; + cms::cuda::host::unique_ptr offsets; + cms::cuda::host::unique_ptr feds; + }; + + struct OutputDataCPU { + cms::cuda::host::unique_ptr nchannels; + }; + + struct ScratchDataGPU { + // depends on the number of output collections + // that is a statically known predefined number + cms::cuda::device::unique_ptr pChannelsCounters; + }; + + struct OutputDataGPU { + DigiCollection digisF01HE; + DigiCollection digisF5HB; + DigiCollection digisF3HB; + + void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) { + digisF01HE.data = cms::cuda::make_device_unique( + config.maxChannelsF01HE * compute_stride(config.nsamplesF01HE), cudaStream); + digisF01HE.ids = cms::cuda::make_device_unique(config.maxChannelsF01HE, cudaStream); + + digisF5HB.data = cms::cuda::make_device_unique( + config.maxChannelsF5HB * compute_stride(config.nsamplesF5HB), cudaStream); + digisF5HB.ids = cms::cuda::make_device_unique(config.maxChannelsF5HB, cudaStream); + digisF5HB.npresamples = cms::cuda::make_device_unique(config.maxChannelsF5HB, cudaStream); + + digisF3HB.data = cms::cuda::make_device_unique( + config.maxChannelsF3HB * compute_stride(config.nsamplesF3HB), cudaStream); + digisF3HB.ids = cms::cuda::make_device_unique(config.maxChannelsF3HB, cudaStream); + } + }; + + struct InputDataGPU { + cms::cuda::device::unique_ptr data; + cms::cuda::device::unique_ptr offsets; + cms::cuda::device::unique_ptr feds; + }; + + struct ConditionsProducts { + ElectronicsMappingGPU::Product const &eMappingProduct; + }; + + } // namespace raw +} // namespace hcal + +#endif // EventFilter_HcalRawToDigi_interface_DeclsForKernels_h diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu new file mode 100644 index 0000000000000..4f2ca85861b30 --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu @@ -0,0 +1,593 @@ +#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h" +#include "DataFormats/HcalDetId/interface/HcalSubdetector.h" +#include "DataFormats/HcalDetId/interface/HcalDetId.h" + +#include "EventFilter/HcalRawToDigi/plugins/DecodeGPU.h" + +#include +using namespace cooperative_groups; + +namespace hcal { + namespace raw { + + __forceinline__ __device__ char const* get_subdet_str(DetId const& did) { + switch (did.subdetId()) { + case HcalEmpty: + return "HcalEmpty"; + break; + case HcalBarrel: + return "HcalBarrel"; + break; + case HcalEndcap: + return "HcalEndcap"; + break; + case HcalOuter: + return "HcalOuter"; + break; + case HcalForward: + return "HcalForward"; + break; + case HcalTriggerTower: + return "HcalTriggerTower"; + break; + case HcalOther: + return "HcalOther"; + break; + default: + return "Unknown"; + break; + } + + return "Unknown"; + } + + __forceinline__ __device__ bool is_channel_header_word(uint16_t const* ptr) { + uint8_t bit = (*ptr >> 15) & 0x1; + return bit == 1; + } + + template + constexpr bool is_power_of_two(T x) { + return (x != 0) && ((x & (x - 1)) == 0); + } + + template + __global__ void kernel_rawdecode_test(unsigned char const* data, + uint32_t const* offsets, + int const* feds, + uint32_t const* eid2did, + uint32_t const* eid2tid, + uint16_t* digisF01HE, + uint32_t* idsF01HE, + uint16_t* digisF5HB, + uint32_t* idsF5HB, + uint8_t* npresamplesF5HB, + uint16_t* digisF3HB, + uint32_t* idsF3HB, + uint32_t* pChannelsCounters, + uint32_t const nsamplesF01HE, + uint32_t const nsamplesF5HB, + uint32_t const nsamplesF3HB, + uint32_t const nBytesTotal) { + // in order to properly use cooperative groups + static_assert(is_power_of_two(NTHREADS) == true && NTHREADS <= 32); + + thread_block_tile thread_group = tiled_partition(this_thread_block()); + + auto const iamc = threadIdx.x / NTHREADS; + auto const ifed = blockIdx.x; + auto const offset = offsets[ifed]; + +#ifdef HCAL_RAWDECODE_GPUDEBUG_CG + if (ifed > 0 || iamc > 0) + return; + printf("threadIdx.x = %d rank = %d iamc = %d\n", threadIdx.x, thread_group.thread_rank(), iamc); +#endif + +#ifdef HCAL_RAWDECODE_GPUDEBUG + auto const fed = feds[ifed]; + auto const size = ifed == gridDim.x - 1 ? nBytesTotal - offset : offsets[ifed + 1] - offset; + printf("ifed = %d fed = %d offset = %u size = %u\n", ifed, fed, offset, size); +#endif + + // offset to the right raw buffer + uint64_t const* buffer = reinterpret_cast(data + offset); + +#ifdef HCAL_RAWDECODE_GPUDEBUG + // + // fed header + // + auto const fed_header = buffer[0]; + uint32_t const fed_id = (fed_header >> 8) & 0xfff; + uint32_t const bx = (fed_header >> 20) & 0xfff; + uint32_t const lv1 = (fed_header >> 32) & 0xffffff; + uint8_t const trigger_type = (fed_header >> 56) & 0xf; + uint8_t const bid_fed_header = (fed_header >> 60) & 0xf; + + printf("fed = %d fed_id = %u bx = %u lv1 = %u trigger_type = %u bid = %u\n", + fed, + fed_id, + bx, + lv1, + trigger_type, + bid_fed_header); +#endif + + // amc 13 header + auto const amc13word = buffer[1]; + uint8_t const namc = (amc13word >> 52) & 0xf; + if (iamc >= namc) + return; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + uint8_t const amc13version = (amc13word >> 60) & 0xf; + uint32_t const amc13OrbitNumber = (amc13word >> 4) & 0xffffffffu; + printf("fed = %d namc = %u amc13version = %u amc13OrbitNumber = %u\n", fed, namc, amc13version, amc13OrbitNumber); +#endif + + // compute the offset int to the right buffer + uint32_t amcoffset = 0; + for (uint8_t ii = 0u; ii < iamc; ii++) { + auto const word = buffer[2 + ii]; + int const amcSize = (word >> 32) & 0xffffff; + amcoffset += amcSize; + } + + auto const word = buffer[2 + iamc]; + int const amcSize = (word >> 32) & 0xffffff; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + uint16_t const amcid = word & 0xffff; + int const slot = (word >> 16) & 0xf; + int const amcBlockNumber = (word >> 20) & 0xff; + printf("fed = %d amcid = %u slot = %d amcBlockNumber = %d\n", fed, amcid, slot, amcBlockNumber); + + bool const amcmore = ((word >> 61) & 0x1) != 0; + bool const amcSegmented = ((word >> 60) & 0x1) != 0; + bool const amcLengthOk = ((word >> 62) & 0x1) != 0; + bool const amcCROk = ((word >> 56) & 0x1) != 0; + bool const amcDataPresent = ((word >> 58) & 0x1) != 0; + bool const amcDataValid = ((word >> 56) & 0x1) != 0; + bool const amcEnabled = ((word >> 59) & 0x1) != 0; + printf( + "fed = %d amcmore = %d amcSegmented = %d, amcLengthOk = %d amcCROk = %d\n>> amcDataPresent = %d amcDataValid " + "= %d amcEnabled = %d\n", + fed, + static_cast(amcmore), + static_cast(amcSegmented), + static_cast(amcLengthOk), + static_cast(amcCROk), + static_cast(amcDataPresent), + static_cast(amcDataValid), + static_cast(amcEnabled)); +#endif + + // get to the payload + auto const* payload64 = buffer + 2 + namc + amcoffset; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + // uhtr header v1 1st 64 bits + auto const payload64_w0 = payload64[0]; +#endif + // uhtr n bytes comes from amcSize, according to the cpu version! + uint32_t const data_length64 = amcSize; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + uint16_t bcn = (payload64_w0 >> 20) & 0xfff; + uint32_t evn = (payload64_w0 >> 32) & 0xffffff; + printf("fed = %d data_length64 = %u bcn = %u evn = %u\n", fed, data_length64, bcn, evn); +#endif + + // uhtr header v1 2nd 64 bits + auto const payload64_w1 = payload64[1]; + uint8_t const uhtrcrate = payload64_w1 & 0xff; + uint8_t const uhtrslot = (payload64_w1 >> 8) & 0xf; + uint8_t const presamples = (payload64_w1 >> 12) & 0xf; + uint8_t const payloadFormat = (payload64_w1 >> 44) & 0xf; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + uint16_t const orbitN = (payload64_w1 >> 16) & 0xffff; + uint8_t const firmFlavor = (payload64_w1 >> 32) & 0xff; + uint8_t const eventType = (payload64_w1 >> 40) & 0xf; + printf( + "fed = %d crate = %u slot = %u presamples = %u\n>>> orbitN = %u firmFlavor = %u eventType = %u payloadFormat " + "= %u\n", + fed, + uhtrcrate, + uhtrslot, + presamples, + orbitN, + firmFlavor, + eventType, + payloadFormat); +#endif + + // this should be filtering out uMNio... + if (payloadFormat != 1) + return; + + // skip uhtr header words + auto const channelDataSize = data_length64 - 2; // 2 uhtr header v1 words + auto const* channelDataBuffer64Start = payload64 + 2; // 2 uhtr header v2 wds + auto const* ptr = reinterpret_cast(channelDataBuffer64Start); + auto const* end = ptr + sizeof(uint64_t) / sizeof(uint16_t) * (channelDataSize - 1); + auto const t_rank = thread_group.thread_rank(); + + // iterate through the channel data + while (ptr != end) { + // this is the starting point for this thread group for this iteration + // with respect to this pointer every thread will move forward afterwards + auto const* const start_ptr = ptr; + +#ifdef HCAL_RAWDECODE_GPUDEBUG_CG + thread_group.sync(); +#endif + + // skip to the header word of the right channel for this thread + int counter = 0; + while (counter < thread_group.thread_rank()) { + // just a check for threads that land beyond the end + if (ptr == end) + break; + + // move ptr one forward past header + if (is_channel_header_word(ptr)) + ++ptr; + else { + // go to the next channel and do not consider this guy as a channel + while (ptr != end) + if (!is_channel_header_word(ptr)) + ++ptr; + else + break; + continue; + } + + // skip + while (ptr != end) + if (!is_channel_header_word(ptr)) + ++ptr; + else + break; + counter++; + } + +#ifdef HCAL_RAWDECODE_GPUDEBUG_CG + thread_group.sync(); + printf("ptr - start_ptr = %d counter = %d rank = %d\n", static_cast(ptr - start_ptr), counter, t_rank); +#endif + + // when the end is near, channels will land outside of the [start_ptr, end) region + if (ptr != end) { + // for all of the flavors, these 2 guys have the same bit layout + uint8_t const flavor = (ptr[0] >> 12) & 0x7; + uint8_t const channelid = ptr[0] & 0xff; + auto const* const new_channel_start = ptr; + + // flavor dependent stuff + switch (flavor) { + case 0: + case 1: { + // treat eid and did + uint8_t fiber = (channelid >> 3) & 0x1f; + uint8_t fchannel = channelid & 0x7; + HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false}; + auto const did = HcalDetId{eid2did[eid.linearIndex()]}; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n", + eid.rawId(), + eid.linearIndex(), + did.rawId(), + get_subdet_str(did)); + printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n", + flavor, + uhtrcrate, + uhtrslot, + channelid, + fiber, + fchannel); +#endif + + // remove digis not for HE + if (did.subdetId() != HcalEndcap) + break; + + // count words + auto const* channel_header_word = ptr++; + while (!is_channel_header_word(ptr) && ptr != end) + ++ptr; + auto const* channel_end = ptr; // set ptr + uint32_t const nwords = channel_end - channel_header_word; + + // filter out this digi if nwords does not equal expected + auto const expected_words = compute_stride(nsamplesF01HE); + if (nwords != expected_words) + break; + + // inc the number of digis of this type + auto const pos = atomicAdd(&pChannelsCounters[OutputF01HE], 1); +#ifdef HCAL_RAWDECODE_GPUDEBUG_CG + printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos); +#endif + + // store to global mem words for this digi + idsF01HE[pos] = did.rawId(); + + for (uint32_t iword = 0; iword < expected_words; iword++) + digisF01HE[pos * expected_words + iword] = channel_header_word[iword]; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("nwords = %u\n", nwords); +#endif + + break; + } + case 3: { + // treat eid and did + uint8_t fiber = (channelid >> 3) & 0x1f; + uint8_t fchannel = channelid & 0x7; + HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false}; + auto const did = HcalDetId{eid2did[eid.linearIndex()]}; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n", + eid.rawId(), + eid.linearIndex(), + did.rawId(), + get_subdet_str(did)); + printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n", + flavor, + uhtrcrate, + uhtrslot, + channelid, + fiber, + fchannel); +#endif + + // remove digis not for HE + if (did.subdetId() != HcalBarrel) + break; + + // count words + auto const* channel_header_word = ptr++; + while (!is_channel_header_word(ptr) && ptr != end) + ++ptr; + auto const* channel_end = ptr; // set ptr + uint32_t const nwords = channel_end - channel_header_word; + + // filter out this digi if nwords does not equal expected + auto const expected_words = compute_stride(nsamplesF3HB); + if (nwords != expected_words) + break; + + // inc the number of digis of this type + auto const pos = atomicAdd(&pChannelsCounters[OutputF3HB], 1); + + // store to global mem words for this digi + idsF3HB[pos] = did.rawId(); + for (uint32_t iword = 0; iword < expected_words; iword++) + digisF3HB[pos * expected_words + iword] = channel_header_word[iword]; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("nwords = %u\n", nwords); +#endif + + break; + } + case 2: { + uint8_t fiber = (channelid >> 3) & 0x1f; + uint8_t fchannel = channelid & 0x7; + HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false}; + auto const did = DetId{eid2did[eid.linearIndex()]}; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n", + eid.rawId(), + eid.linearIndex(), + did.rawId(), + get_subdet_str(did)); + printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n", + flavor, + uhtrcrate, + uhtrslot, + channelid, + fiber, + fchannel); +#endif + + break; + } + case 4: { + uint8_t link = (channelid >> 4) & 0xf; + uint8_t tower = channelid & 0xf; + HcalElectronicsId eid{uhtrcrate, uhtrslot, link, tower, true}; + auto const did = DetId{eid2tid[eid.linearIndex()]}; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n", + eid.rawId(), + eid.linearIndex(), + did.rawId(), + get_subdet_str(did)); + printf("flavor = %u crate = %u slot = %u channelid = %u link = %u tower = %u\n", + flavor, + uhtrcrate, + uhtrslot, + channelid, + link, + tower); +#endif + + break; + } + case 5: { + uint8_t fiber = (channelid >> 2) & 0x3f; + uint8_t fchannel = channelid & 0x3; + HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false}; + auto const did = HcalDetId{eid2did[eid.linearIndex()]}; + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n", + eid.rawId(), + eid.linearIndex(), + did.rawId(), + get_subdet_str(did)); + printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n", + flavor, + uhtrcrate, + uhtrslot, + channelid, + fiber, + fchannel); +#endif + + // remove digis not for HB + if (did.subdetId() != HcalBarrel) + break; + + // count words + auto const* channel_header_word = ptr++; + while (!is_channel_header_word(ptr) && ptr != end) + ++ptr; + auto const* channel_end = ptr; // set ptr + uint32_t const nwords = channel_end - channel_header_word; + + // filter out this digi if nwords does not equal expected + auto const expected_words = compute_stride(nsamplesF5HB); + if (nwords != expected_words) + break; + + // inc the number of digis of this type + auto const pos = atomicAdd(&pChannelsCounters[OutputF5HB], 1); + +#ifdef HCAL_RAWDECODE_GPUDEBUG_CG + printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos); +#endif + + // store to global mem words for this digi + idsF5HB[pos] = did.rawId(); + npresamplesF5HB[pos] = presamples; + for (uint32_t iword = 0; iword < expected_words; iword++) + digisF5HB[pos * expected_words + iword] = channel_header_word[iword]; + + break; + } + case 7: { + uint8_t const fiber = (channelid >> 2) & 0x3f; + uint8_t const fchannel = channelid & 0x3; + HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false}; + auto const did = DetId{eid2did[eid.linearIndex()]}; + + /* uncomment to check the linear index validity + if (eid.rawId() >= HcalElectronicsId::maxLinearIndex) { +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("*** rawid = %u has no known det id***\n", eid.rawId()); +#endif + break; + } + */ + +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("erawId = %u linearIndex = %u drawid = %u\n", eid.rawId(), eid.linearIndex(), did.rawId()); + printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n", + flavor, + uhtrcrate, + uhtrslot, + channelid, + fiber, + fchannel); +#endif + + break; + } + default: +#ifdef HCAL_RAWDECODE_GPUDEBUG + printf("flavor = %u crate = %u slot = %u channelid = %u\n", flavor, uhtrcrate, uhtrslot, channelid); +#endif + ; + } + + // skip to the next word in case + // 1) current channel was not treated + // 2) we are in the middle of the digi words and not at the end + while (new_channel_start == ptr || !is_channel_header_word(ptr) && ptr != end) + ++ptr; + } + + // thread with rank 31 of the group will have the ptr pointing to the + // header word of the next channel or the end + int const offset_to_shuffle = ptr - start_ptr; + + // always receive from the last guy in the group + auto const offset_for_rank31 = thread_group.shfl(offset_to_shuffle, NTHREADS - 1); + +#ifdef HCAL_RAWDECODE_GPUDEBUG_CG + printf("rank = %d offset_to_shuffle = %d offset_for_rank32 = %d\n", + thread_group.thread_rank(), + offset_to_shuffle, + offset_for_rank31); +#endif + + // update the ptr for all threads of this group + // NOTE: relative to the start_ptr that is the same for all threads of + // this group + ptr = start_ptr + offset_for_rank31; + } + } + + void entryPoint(InputDataCPU const& inputCPU, + InputDataGPU& inputGPU, + OutputDataGPU& outputGPU, + ScratchDataGPU& scratchGPU, + OutputDataCPU& outputCPU, + ConditionsProducts const& conditions, + ConfigurationParameters const& config, + cudaStream_t cudaStream, + uint32_t const nfedsWithData, + uint32_t const nbytesTotal) { + // transfer + cudaCheck(cudaMemcpyAsync(inputGPU.data.get(), + inputCPU.data.get(), + nbytesTotal * sizeof(unsigned char), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(inputGPU.offsets.get(), + inputCPU.offsets.get(), + nfedsWithData * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck( + cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream)); + cudaCheck(cudaMemcpyAsync( + inputGPU.feds.get(), inputCPU.feds.get(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); + + // 12 is the max number of modules per crate + kernel_rawdecode_test<32><<>>(inputGPU.data.get(), + inputGPU.offsets.get(), + inputGPU.feds.get(), + conditions.eMappingProduct.eid2did, + conditions.eMappingProduct.eid2tid, + outputGPU.digisF01HE.data.get(), + outputGPU.digisF01HE.ids.get(), + outputGPU.digisF5HB.data.get(), + outputGPU.digisF5HB.ids.get(), + outputGPU.digisF5HB.npresamples.get(), + outputGPU.digisF3HB.data.get(), + outputGPU.digisF3HB.ids.get(), + scratchGPU.pChannelsCounters.get(), + config.nsamplesF01HE, + config.nsamplesF5HB, + config.nsamplesF3HB, + nbytesTotal); + cudaCheck(cudaGetLastError()); + + cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.get(), + scratchGPU.pChannelsCounters.get(), + sizeof(uint32_t) * numOutputCollections, + cudaMemcpyDeviceToHost, + cudaStream)); + } + + } // namespace raw +} // namespace hcal diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h new file mode 100644 index 0000000000000..3d5e4eec32269 --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h @@ -0,0 +1,23 @@ +#ifndef EventFilter_HcalRawToDigi_interface_DecodeGPU_h +#define EventFilter_HcalRawToDigi_interface_DecodeGPU_h + +#include "DeclsForKernels.h" + +namespace hcal { + namespace raw { + + void entryPoint(InputDataCPU const&, + InputDataGPU&, + OutputDataGPU&, + ScratchDataGPU&, + OutputDataCPU&, + ConditionsProducts const&, + ConfigurationParameters const&, + cudaStream_t cudaStream, + uint32_t const, + uint32_t const); + + } +} // namespace hcal + +#endif // EventFilter_HcalRawToDigi_interface_DecodeGPU_h diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc new file mode 100644 index 0000000000000..6b7b89cc6ea77 --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc @@ -0,0 +1,63 @@ +#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h" +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "ElectronicsMappingGPU.h" + +namespace hcal { + namespace raw { + + // TODO: 0x3FFFFF * 4B ~= 16MB + // tmp solution for linear mapping of eid -> did + ElectronicsMappingGPU::ElectronicsMappingGPU(HcalElectronicsMap const& mapping) + : eid2tid_(HcalElectronicsId::maxLinearIndex, 0u), eid2did_(HcalElectronicsId::maxLinearIndex, 0u) { + auto const& eidsPrecision = mapping.allElectronicsIdPrecision(); + for (uint32_t i = 0; i < eidsPrecision.size(); ++i) { + auto const& eid = eidsPrecision[i]; + + // assign + eid2did_[eid.linearIndex()] = eid.isTriggerChainId() ? 0u : mapping.lookup(eid).rawId(); + } + + auto const& eidsTrigger = mapping.allElectronicsIdTrigger(); + for (uint32_t i = 0; i < eidsTrigger.size(); i++) { + auto const& eid = eidsTrigger[i]; + + // assign + eid2tid_[eid.linearIndex()] = eid.isTriggerChainId() ? mapping.lookupTrigger(eid).rawId() : 0u; + } + } + + ElectronicsMappingGPU::Product::~Product() { + // deallocation + cudaCheck(cudaFree(eid2did)); + cudaCheck(cudaFree(eid2tid)); + } + + ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](ElectronicsMappingGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.eid2did, this->eid2did_.size() * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&product.eid2tid, this->eid2tid_.size() * sizeof(uint32_t))); + + // transfer + cudaCheck(cudaMemcpyAsync(product.eid2did, + this->eid2did_.data(), + this->eid2did_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.eid2tid, + this->eid2tid_.data(), + this->eid2tid_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; + } + + } // namespace raw +} // namespace hcal + +TYPELOOKUP_DATA_REG(hcal::raw::ElectronicsMappingGPU); diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h new file mode 100644 index 0000000000000..0f4c12f02a92d --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h @@ -0,0 +1,48 @@ +#ifndef EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h +#define EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h + +#include "CondFormats/HcalObjects/interface/HcalElectronicsMap.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#endif + +namespace hcal { + namespace raw { + + class ElectronicsMappingGPU { + public: + struct Product { + ~Product(); + // trigger + uint32_t *eid2tid; + // detector + uint32_t *eid2did; + }; + +#ifndef __CUDACC__ + + // rearrange pedestals + ElectronicsMappingGPU(HcalElectronicsMap const &); + + // will call dealloation for Product thru ~Product + ~ElectronicsMappingGPU() = default; + + // get device pointers + Product const &getProduct(cudaStream_t) const; + + private: + // in the future, we need to arrange so to avoid this copy on the host + // if possible + std::vector> eid2tid_; + std::vector> eid2did_; + + cms::cuda::ESProduct product_; +#endif + }; + + } // namespace raw +} // namespace hcal + +#endif // EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h diff --git a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc new file mode 100644 index 0000000000000..c2b67a10afaff --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc @@ -0,0 +1,117 @@ +#include + +#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h" +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class HcalCPUDigisProducer : public edm::stream::EDProducer { +public: + explicit HcalCPUDigisProducer(edm::ParameterSet const& ps); + ~HcalCPUDigisProducer() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + +private: + using IProductTypef01 = cms::cuda::Product>; + edm::EDGetTokenT digisF01HETokenIn_; + using IProductTypef5 = cms::cuda::Product>; + edm::EDGetTokenT digisF5HBTokenIn_; + using IProductTypef3 = cms::cuda::Product>; + edm::EDGetTokenT digisF3HBTokenIn_; + + using OProductTypef01 = + hcal::DigiCollection>; + edm::EDPutTokenT digisF01HETokenOut_; + using OProductTypef5 = + hcal::DigiCollection>; + edm::EDPutTokenT digisF5HBTokenOut_; + using OProductTypef3 = + hcal::DigiCollection>; + edm::EDPutTokenT digisF3HBTokenOut_; + + // needed to pass data from acquire to produce + OProductTypef01 digisf01HE_; + OProductTypef5 digisf5HB_; + OProductTypef3 digisf3HB_; +}; + +void HcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("digisLabelF01HEIn", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"}); + desc.add("digisLabelF5HBIn", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"}); + desc.add("digisLabelF3HBIn", edm::InputTag{"hcalRawToDigiGPU", "f3HBDigisGPU"}); + desc.add("digisLabelF01HEOut", "f01HEDigis"); + desc.add("digisLabelF5HBOut", "f5HBDigis"); + desc.add("digisLabelF3HBOut", "f3HBDigis"); + + confDesc.addWithDefaultLabel(desc); +} + +HcalCPUDigisProducer::HcalCPUDigisProducer(const edm::ParameterSet& ps) + : digisF01HETokenIn_{consumes(ps.getParameter("digisLabelF01HEIn"))}, + digisF5HBTokenIn_{consumes(ps.getParameter("digisLabelF5HBIn"))}, + digisF3HBTokenIn_{consumes(ps.getParameter("digisLabelF3HBIn"))}, + digisF01HETokenOut_{produces(ps.getParameter("digisLabelF01HEOut"))}, + digisF5HBTokenOut_{produces(ps.getParameter("digisLabelF5HBOut"))}, + digisF3HBTokenOut_{produces(ps.getParameter("digisLabelF3HBOut"))} {} + +HcalCPUDigisProducer::~HcalCPUDigisProducer() {} + +void HcalCPUDigisProducer::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder taskHolder) { + // retrieve data/ctx + auto const& f01HEProduct = event.get(digisF01HETokenIn_); + auto const& f5HBProduct = event.get(digisF5HBTokenIn_); + auto const& f3HBProduct = event.get(digisF3HBTokenIn_); + cms::cuda::ScopedContextAcquire ctx{f01HEProduct, std::move(taskHolder)}; + auto const& f01HEDigis = ctx.get(f01HEProduct); + auto const& f5HBDigis = ctx.get(f5HBProduct); + auto const& f3HBDigis = ctx.get(f3HBProduct); + + // resize out tmp buffers + digisf01HE_.stride = f01HEDigis.stride; + digisf5HB_.stride = f5HBDigis.stride; + digisf3HB_.stride = f3HBDigis.stride; + digisf01HE_.resize(f01HEDigis.size); + digisf5HB_.resize(f5HBDigis.size); + digisf3HB_.resize(f3HBDigis.size); + + auto lambdaToTransfer = [&ctx](auto& dest, auto* src) { + using vector_type = typename std::remove_reference::type; + using type = typename vector_type::value_type; + using src_data_type = typename std::remove_pointer::type; + static_assert(std::is_same::value && "Dest and Src data types do not match"); + cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream())); + }; + + lambdaToTransfer(digisf01HE_.data, f01HEDigis.data.get()); + lambdaToTransfer(digisf01HE_.ids, f01HEDigis.ids.get()); + + lambdaToTransfer(digisf5HB_.data, f5HBDigis.data.get()); + lambdaToTransfer(digisf5HB_.ids, f5HBDigis.ids.get()); + lambdaToTransfer(digisf5HB_.npresamples, f5HBDigis.npresamples.get()); + + lambdaToTransfer(digisf3HB_.data, f3HBDigis.data.get()); + lambdaToTransfer(digisf3HB_.ids, f3HBDigis.ids.get()); +} + +void HcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) { + event.emplace(digisF01HETokenOut_, std::move(digisf01HE_)); + event.emplace(digisF5HBTokenOut_, std::move(digisf5HB_)); + event.emplace(digisF3HBTokenOut_, std::move(digisf3HB_)); +} + +DEFINE_FWK_MODULE(HcalCPUDigisProducer); diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc new file mode 100644 index 0000000000000..9ca33340f7036 --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc @@ -0,0 +1,235 @@ +#include + +#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h" +#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class HcalDigisProducerGPU : public edm::stream::EDProducer { +public: + explicit HcalDigisProducerGPU(edm::ParameterSet const& ps); + ~HcalDigisProducerGPU() override = default; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + +private: + // input product tokens + edm::EDGetTokenT hbheDigiToken_; + edm::EDGetTokenT qie11DigiToken_; + + // type aliases + using HostCollectionf01 = + hcal::DigiCollection>; + using DeviceCollectionf01 = hcal::DigiCollection; + using HostCollectionf5 = + hcal::DigiCollection>; + using DeviceCollectionf5 = hcal::DigiCollection; + using HostCollectionf3 = + hcal::DigiCollection>; + using DeviceCollectionf3 = hcal::DigiCollection; + + // output product tokens + using ProductTypef01 = cms::cuda::Product; + edm::EDPutTokenT digisF01HEToken_; + using ProductTypef5 = cms::cuda::Product; + edm::EDPutTokenT digisF5HBToken_; + using ProductTypef3 = cms::cuda::Product; + edm::EDPutTokenT digisF3HBToken_; + + cms::cuda::ContextState cudaState_; + + struct ConfigParameters { + uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB; + }; + ConfigParameters config_; + + // per event host buffers + HostCollectionf01 hf01_; + HostCollectionf5 hf5_; + HostCollectionf3 hf3_; + + // device products: product owns memory (i.e. not the module) + DeviceCollectionf01 df01_; + DeviceCollectionf5 df5_; + DeviceCollectionf3 df3_; +}; + +void HcalDigisProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + // FIXME + desc.add("hbheDigisLabel", edm::InputTag("hcalDigis")); + desc.add("qie11DigiLabel", edm::InputTag("hcalDigis")); + desc.add("digisLabelF01HE", std::string{"f01HEDigisGPU"}); + desc.add("digisLabelF5HB", std::string{"f5HBDigisGPU"}); + desc.add("digisLabelF3HB", std::string{"f3HBDigisGPU"}); + desc.add("maxChannelsF01HE", 10000u); + desc.add("maxChannelsF5HB", 10000u); + desc.add("maxChannelsF3HB", 10000u); + + confDesc.addWithDefaultLabel(desc); +} + +HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps) + : hbheDigiToken_{consumes(ps.getParameter("hbheDigisLabel"))}, + qie11DigiToken_{consumes(ps.getParameter("qie11DigiLabel"))}, + digisF01HEToken_{produces(ps.getParameter("digisLabelF01HE"))}, + digisF5HBToken_{produces(ps.getParameter("digisLabelF5HB"))}, + digisF3HBToken_{produces(ps.getParameter("digisLabelF3HB"))} { + config_.maxChannelsF01HE = ps.getParameter("maxChannelsF01HE"); + config_.maxChannelsF5HB = ps.getParameter("maxChannelsF5HB"); + config_.maxChannelsF3HB = ps.getParameter("maxChannelsF3HB"); + + // this is a preallocation for the max statically known number of time samples + // actual stride/nsamples will be inferred from data + hf01_.stride = hcal::compute_stride(QIE11DigiCollection::MAXSAMPLES); + hf5_.stride = hcal::compute_stride(HBHEDataFrame::MAXSAMPLES); + hf3_.stride = hcal::compute_stride(QIE11DigiCollection::MAXSAMPLES); + + // preallocate pinned host memory only if CUDA is available + edm::Service cs; + if (cs and cs->enabled()) { + hf01_.reserve(config_.maxChannelsF01HE); + hf5_.reserve(config_.maxChannelsF5HB); + hf3_.reserve(config_.maxChannelsF3HB); + } +} + +void HcalDigisProducerGPU::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) { + // raii + cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_}; + + // clear host buffers + hf01_.clear(); + hf5_.clear(); + hf3_.clear(); + + // event data + edm::Handle hbheDigis; + edm::Handle qie11Digis; + event.getByToken(hbheDigiToken_, hbheDigis); + event.getByToken(qie11DigiToken_, qie11Digis); + + // init f5 collection + if (not hbheDigis->empty()) { + auto const nsamples = (*hbheDigis)[0].size(); + auto const stride = hcal::compute_stride(nsamples); + hf5_.stride = stride; + + // flavor5 get device blobs + df5_.stride = stride; + df5_.data = cms::cuda::make_device_unique(config_.maxChannelsF5HB * stride, ctx.stream()); + df5_.ids = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream()); + df5_.npresamples = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream()); + } + + if (not qie11Digis->empty()) { + auto const nsamples = qie11Digis->samples(); + auto const stride01 = hcal::compute_stride(nsamples); + auto const stride3 = hcal::compute_stride(nsamples); + + hf01_.stride = stride01; + hf3_.stride = stride3; + + // flavor 0/1 get devie blobs + df01_.stride = stride01; + df01_.data = cms::cuda::make_device_unique(config_.maxChannelsF01HE * stride01, ctx.stream()); + df01_.ids = cms::cuda::make_device_unique(config_.maxChannelsF01HE, ctx.stream()); + + // flavor3 get device blobs + df3_.stride = stride3; + df3_.data = cms::cuda::make_device_unique(config_.maxChannelsF3HB * stride3, ctx.stream()); + df3_.ids = cms::cuda::make_device_unique(config_.maxChannelsF3HB, ctx.stream()); + } + + for (auto const& hbhe : *hbheDigis) { + auto const id = hbhe.id().rawId(); + auto const presamples = hbhe.presamples(); + hf5_.ids.push_back(id); + hf5_.npresamples.push_back(presamples); + auto const stride = hcal::compute_stride(hbhe.size()); + assert(stride == hf5_.stride && "strides must be the same for every single digi of the collection"); + // simple for now... + static_assert(hcal::Flavor5::HEADER_WORDS == 1); + uint16_t header_word = (1 << 15) | (0x5 << 12) | (0 << 10) | ((hbhe.sample(0).capid() & 0x3) << 8); + hf5_.data.push_back(header_word); + for (unsigned int i = 0; i < stride - hcal::Flavor5::HEADER_WORDS; i++) { + uint16_t s0 = (0 << 7) | (static_cast(hbhe.sample(2 * i).adc()) & 0x7f); + uint16_t s1 = (0 << 7) | (static_cast(hbhe.sample(2 * i + 1).adc()) & 0x7f); + uint16_t sample = (s1 << 8) | s0; + hf5_.data.push_back(sample); + } + } + + for (unsigned int i = 0; i < qie11Digis->size(); i++) { + auto const& digi = QIE11DataFrame{(*qie11Digis)[i]}; + assert(digi.samples() == qie11Digis->samples() && "collection nsamples must equal per digi samples"); + if (digi.flavor() == 0 or digi.flavor() == 1) { + if (digi.detid().subdetId() != HcalEndcap) + continue; + auto const id = digi.detid().rawId(); + hf01_.ids.push_back(id); + for (int hw = 0; hw < hcal::Flavor1::HEADER_WORDS; hw++) + hf01_.data.push_back((*qie11Digis)[i][hw]); + for (int sample = 0; sample < digi.samples(); sample++) { + hf01_.data.push_back((*qie11Digis)[i][hcal::Flavor1::HEADER_WORDS + sample]); + } + } else if (digi.flavor() == 3) { + if (digi.detid().subdetId() != HcalBarrel) + continue; + auto const id = digi.detid().rawId(); + hf3_.ids.push_back(id); + for (int hw = 0; hw < hcal::Flavor3::HEADER_WORDS; hw++) + hf3_.data.push_back((*qie11Digis)[i][hw]); + for (int sample = 0; sample < digi.samples(); sample++) { + hf3_.data.push_back((*qie11Digis)[i][hcal::Flavor3::HEADER_WORDS + sample]); + } + } + } + + auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) { + if (src.empty()) + return; + using vector_type = typename std::remove_reference::type; + using type = typename vector_type::value_type; + using dest_data_type = typename std::remove_pointer::type; + static_assert(std::is_same::value && "Dest and Src data typesdo not match"); + cudaCheck(cudaMemcpyAsync(dest, src.data(), src.size() * sizeof(type), cudaMemcpyHostToDevice, ctx.stream())); + }; + + lambdaToTransfer(df01_.data.get(), hf01_.data); + lambdaToTransfer(df01_.ids.get(), hf01_.ids); + + lambdaToTransfer(df5_.data.get(), hf5_.data); + lambdaToTransfer(df5_.ids.get(), hf5_.ids); + lambdaToTransfer(df5_.npresamples.get(), hf5_.npresamples); + + lambdaToTransfer(df3_.data.get(), hf3_.data); + lambdaToTransfer(df3_.ids.get(), hf3_.ids); + + df01_.size = hf01_.ids.size(); + df5_.size = hf5_.ids.size(); + df3_.size = hf3_.ids.size(); +} + +void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { + cms::cuda::ScopedContextProduce ctx{cudaState_}; + + ctx.emplace(event, digisF01HEToken_, std::move(df01_)); + ctx.emplace(event, digisF5HBToken_, std::move(df5_)); + ctx.emplace(event, digisF3HBToken_, std::move(df3_)); +} + +DEFINE_FWK_MODULE(HcalDigisProducerGPU); diff --git a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc new file mode 100644 index 0000000000000..749a98e990755 --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc @@ -0,0 +1,10 @@ +#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h" + +#include "ElectronicsMappingGPU.h" + +using HcalElectronicsMappingGPUESProducer = + ConvertingESProducerT; + +DEFINE_FWK_EVENTSETUP_MODULE(HcalElectronicsMappingGPUESProducer); diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc new file mode 100644 index 0000000000000..7e8388a5f4d2f --- /dev/null +++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc @@ -0,0 +1,200 @@ +#include + +#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h" +#include "DataFormats/FEDRawData/interface/FEDNumbering.h" +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +#include "DeclsForKernels.h" +#include "DecodeGPU.h" +#include "ElectronicsMappingGPU.h" + +class HcalRawToDigiGPU : public edm::stream::EDProducer { +public: + explicit HcalRawToDigiGPU(edm::ParameterSet const& ps); + ~HcalRawToDigiGPU() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + +private: + edm::EDGetTokenT rawDataToken_; + using ProductTypef01 = cms::cuda::Product>; + edm::EDPutTokenT digisF01HEToken_; + using ProductTypef5 = cms::cuda::Product>; + edm::EDPutTokenT digisF5HBToken_; + using ProductTypef3 = cms::cuda::Product>; + edm::EDPutTokenT digisF3HBToken_; + + cms::cuda::ContextState cudaState_; + + std::vector fedsToUnpack_; + + hcal::raw::ConfigurationParameters config_; + hcal::raw::OutputDataGPU outputGPU_; + hcal::raw::OutputDataCPU outputCPU_; +}; + +void HcalRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("InputLabel", edm::InputTag("rawDataCollector")); + auto nFeds = FEDNumbering::MAXHCALuTCAFEDID - FEDNumbering::MINHCALuTCAFEDID + 1; + std::vector feds(nFeds); + for (int i = 0; i < nFeds; ++i) + feds[i] = i + FEDNumbering::MINHCALuTCAFEDID; + desc.add>("FEDs", feds); + desc.add("maxChannelsF01HE", 10000u); + desc.add("maxChannelsF5HB", 10000u); + desc.add("maxChannelsF3HB", 10000u); + desc.add("nsamplesF01HE", 8); + desc.add("nsamplesF5HB", 8); + desc.add("nsamplesF3HB", 8); + desc.add("digisLabelF5HB", "f5HBDigisGPU"); + desc.add("digisLabelF01HE", "f01HEDigisGPU"); + desc.add("digisLabelF3HB", "f3HBDigisGPU"); + + std::string label = "hcalRawToDigiGPU"; + confDesc.add(label, desc); +} + +HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps) + : rawDataToken_{consumes(ps.getParameter("InputLabel"))}, + digisF01HEToken_{produces(ps.getParameter("digisLabelF01HE"))}, + digisF5HBToken_{produces(ps.getParameter("digisLabelF5HB"))}, + digisF3HBToken_{produces(ps.getParameter("digisLabelF3HB"))}, + fedsToUnpack_{ps.getParameter>("FEDs")} { + config_.maxChannelsF01HE = ps.getParameter("maxChannelsF01HE"); + config_.maxChannelsF5HB = ps.getParameter("maxChannelsF5HB"); + config_.maxChannelsF3HB = ps.getParameter("maxChannelsF3HB"); + config_.nsamplesF01HE = ps.getParameter("nsamplesF01HE"); + config_.nsamplesF5HB = ps.getParameter("nsamplesF5HB"); + config_.nsamplesF3HB = ps.getParameter("nsamplesF3HB"); +} + +HcalRawToDigiGPU::~HcalRawToDigiGPU() {} + +void HcalRawToDigiGPU::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) { + // raii + cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_}; + + // conditions + edm::ESHandle eMappingHandle; + setup.get().get(eMappingHandle); + auto const& eMappingProduct = eMappingHandle->getProduct(ctx.stream()); + + // bundle up conditions + hcal::raw::ConditionsProducts conditions{eMappingProduct}; + + // event data + edm::Handle rawDataHandle; + event.getByToken(rawDataToken_, rawDataHandle); + + // scratch + hcal::raw::ScratchDataGPU scratchGPU = { + cms::cuda::make_device_unique(hcal::raw::numOutputCollections, ctx.stream())}; + + // input cpu data + hcal::raw::InputDataCPU inputCPU = {cms::cuda::make_host_unique( + hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max, ctx.stream()), + cms::cuda::make_host_unique(hcal::raw::utca_nfeds_max, ctx.stream()), + cms::cuda::make_host_unique(hcal::raw::utca_nfeds_max, ctx.stream())}; + + // input data gpu + hcal::raw::InputDataGPU inputGPU = { + cms::cuda::make_device_unique(hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max, + ctx.stream()), + cms::cuda::make_device_unique(hcal::raw::utca_nfeds_max, ctx.stream()), + cms::cuda::make_device_unique(hcal::raw::utca_nfeds_max, ctx.stream())}; + + // output cpu + outputCPU_ = {cms::cuda::make_host_unique(hcal::raw::numOutputCollections, ctx.stream())}; + + // output gpu + outputGPU_.allocate(config_, ctx.stream()); + + // iterate over feds + // TODO: another idea + // - loop over all feds to unpack and enqueue cuda memcpy + // - accumulate the sizes + // - after the loop launch cuda memcpy for sizes + // - enqueue the kernel + uint32_t currentCummOffset = 0; + uint32_t counter = 0; + for (auto const& fed : fedsToUnpack_) { + auto const& data = rawDataHandle->FEDData(fed); + auto const nbytes = data.size(); + + // skip empty feds + if (nbytes < hcal::raw::empty_event_size) + continue; + +#ifdef HCAL_RAWDECODE_CPUDEBUG + printf("fed = %d nbytes = %lu\n", fed, nbytes); +#endif + + // copy raw data into plain buffer + std::memcpy(inputCPU.data.get() + currentCummOffset, data.data(), nbytes); + // set the offset in bytes from the start + inputCPU.offsets[counter] = currentCummOffset; + inputCPU.feds[counter] = fed; + + // this is the current offset into the vector + currentCummOffset += nbytes; + ++counter; + } + + hcal::raw::entryPoint(inputCPU, + inputGPU, + outputGPU_, + scratchGPU, + outputCPU_, + conditions, + config_, + ctx.stream(), + counter, + currentCummOffset); +} + +void HcalRawToDigiGPU::produce(edm::Event& event, edm::EventSetup const& setup) { + cms::cuda::ScopedContextProduce ctx{cudaState_}; + +#ifdef HCAL_RAWDECODE_CPUDEBUG + printf("f01he channels = %u f5hb channesl = %u\n", + outputCPU_.nchannels[hcal::raw::OutputF01HE], + outputCPU_.nchannels[hcal::raw::OutputF5HB]); +#endif + + // FIXME: use sizes of views directly for cuda mem cpy? + auto const nchannelsF01HE = outputCPU_.nchannels[hcal::raw::OutputF01HE]; + auto const nchannelsF5HB = outputCPU_.nchannels[hcal::raw::OutputF5HB]; + auto const nchannelsF3HB = outputCPU_.nchannels[hcal::raw::OutputF3HB]; + outputGPU_.digisF01HE.size = nchannelsF01HE; + outputGPU_.digisF5HB.size = nchannelsF5HB; + outputGPU_.digisF3HB.size = nchannelsF3HB; + outputGPU_.digisF01HE.stride = hcal::compute_stride(config_.nsamplesF01HE); + outputGPU_.digisF5HB.stride = hcal::compute_stride(config_.nsamplesF5HB); + outputGPU_.digisF3HB.stride = hcal::compute_stride(config_.nsamplesF3HB); + + ctx.emplace(event, digisF01HEToken_, std::move(outputGPU_.digisF01HE)); + ctx.emplace(event, digisF5HBToken_, std::move(outputGPU_.digisF5HB)); + ctx.emplace(event, digisF3HBToken_, std::move(outputGPU_.digisF3HB)); + + // reset ptrs that are carried as members + outputCPU_.nchannels.reset(); +} + +DEFINE_FWK_MODULE(HcalRawToDigiGPU); diff --git a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py new file mode 100644 index 0000000000000..b3a2548791ae5 --- /dev/null +++ b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py @@ -0,0 +1,60 @@ +import FWCore.ParameterSet.Config as cms + +# Customise the HCAL-only reconstruction to run on GPU +# +# Currently, this means: +# - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU; +# - running the HBHE local reconstruction, including MAHI, on GPU. +def customizeHcalOnlyForProfilingGPUOnly(process): + + process.consumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring('hbheRecHitProducerGPU') + ) + + process.consume_step = cms.EndPath(process.consumer) + + process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + + return process + + +# Customise the HCAL-only reconstruction to run on GPU, and copy the data to the host +# +# Currently, this means: +# - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU; +# - running the HBHE local reconstruction, including MAHI, on GPU; +# - copying the rechits to CPU and converting them to legacy format. +# +# (this is equivalent to customizeHcalOnlyForProfiling, as the copy and conversion is done by the same module) +def customizeHcalOnlyForProfilingGPUWithHostCopy(process): + + process.consumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring('hbheprereco') + ) + + process.consume_step = cms.EndPath(process.consumer) + + process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + + return process + + +# Customise the HCAL-only reconstruction to run on GPU, copy the data to the host, and convert to legacy format +# +# Currently, this means: +# - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU; +# - running the HBHE local reconstruction, including MAHI, on GPU; +# - copying the rechits to CPU and converting them to legacy format. +# +# The same customisation can be also used on the CPU workflow, running up to the rechits on CPU. +def customizeHcalOnlyForProfiling(process): + + process.consumer = cms.EDAnalyzer("GenericConsumer", + eventProducts = cms.untracked.vstring('hbheprereco') + ) + + process.consume_step = cms.EndPath(process.consumer) + + process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + + return process diff --git a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py index 70207e36ba654..fbb4c53f9f28b 100644 --- a/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py +++ b/RecoLocalCalo/Configuration/python/hcalGlobalReco_cff.py @@ -4,7 +4,18 @@ hcalGlobalRecoTask = cms.Task(hbhereco) hcalGlobalRecoSequence = cms.Sequence(hcalGlobalRecoTask) +#--- for Run 3 and later +from Configuration.Eras.Modifier_run3_HB_cff import run3_HB + from RecoLocalCalo.HcalRecProducers.HBHEPhase1Reconstructor_cfi import hbheprereco as _phase1_hbheprereco +run3_HB.toReplaceWith(hbhereco, _phase1_hbheprereco) -from Configuration.Eras.Modifier_run3_HB_cff import run3_HB -run3_HB.toReplaceWith( hbhereco, _phase1_hbheprereco ) # >=Run3 +#--- for Run 3 on GPU +from Configuration.ProcessModifiers.gpu_cff import gpu + +from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer +gpu.toReplaceWith(hbhereco, _hcalCPURecHitsProducer.clone( + recHitsM0LabelIn = "hbheRecHitProducerGPU", + recHitsM0LabelOut = "", + recHitsLegacyLabelOut = "" +)) diff --git a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py index 84e7498b7eb2d..a7bdce3b916af 100644 --- a/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py +++ b/RecoLocalCalo/Configuration/python/hcalLocalReco_cff.py @@ -16,20 +16,17 @@ from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_ho_cfi import * from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_hf_cfi import * from RecoLocalCalo.HcalRecProducers.HcalHitReconstructor_zdc_cfi import * -hcalLocalRecoTask = cms.Task(hbheprereco,hfreco,horeco,zdcreco) +hcalLocalRecoTask = cms.Task(hbheprereco, hfreco, horeco, zdcreco) hcalLocalRecoSequence = cms.Sequence(hcalLocalRecoTask) from RecoLocalCalo.HcalRecProducers.hfprereco_cfi import hfprereco from RecoLocalCalo.HcalRecProducers.HFPhase1Reconstructor_cfi import hfreco as _phase1_hfreco from RecoLocalCalo.HcalRecProducers.hbheplan1_cfi import hbheplan1 -#--- for HCALonly wf -hcalOnlyLocalRecoTask = cms.Task(hbheprereco,hfprereco,hfreco,horeco) - -# copy for cosmics +#--- for cosmics _default_hfreco = hfreco.clone() -#--- Phase1 +#--- for Phase 1 _phase1_hcalLocalRecoTask = hcalLocalRecoTask.copy() _phase1_hcalLocalRecoTask.add(hfprereco) @@ -37,7 +34,7 @@ run2_HF_2017.toReplaceWith( hcalLocalRecoTask, _phase1_hcalLocalRecoTask ) run2_HF_2017.toReplaceWith( hfreco, _phase1_hfreco ) from Configuration.Eras.Modifier_run2_HCAL_2017_cff import run2_HCAL_2017 -run2_HCAL_2017.toReplaceWith( hbheprereco, _phase1_hbheprereco ) +run2_HCAL_2017.toReplaceWith(hbheprereco, _phase1_hbheprereco) _plan1_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy() _plan1_hcalLocalRecoTask.add(hbheplan1) @@ -50,12 +47,39 @@ from Configuration.ProcessModifiers.run2_HECollapse_2018_cff import run2_HECollapse_2018 run2_HECollapse_2018.toReplaceWith(hcalLocalRecoTask, _collapse_hcalLocalRecoTask) -#--- from >=Run3 +#--- for Run 3 and later _run3_hcalLocalRecoTask = _phase1_hcalLocalRecoTask.copy() _run3_hcalLocalRecoTask.remove(hbheprereco) from Configuration.Eras.Modifier_run3_HB_cff import run3_HB -run3_HB.toReplaceWith( hcalLocalRecoTask, _run3_hcalLocalRecoTask ) +run3_HB.toReplaceWith(hcalLocalRecoTask, _run3_hcalLocalRecoTask) + +#--- for Run 3 on GPU +from Configuration.ProcessModifiers.gpu_cff import gpu + +from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPUTask_cff import * +_run3_hcalLocalRecoGPUTask = _run3_hcalLocalRecoTask.copy() +_run3_hcalLocalRecoGPUTask.add(hbheRecHitProducerGPUTask) +gpu.toReplaceWith(hcalLocalRecoTask, _run3_hcalLocalRecoGPUTask) + +#--- HCAL-only workflow for Run 3 +# FIXME rename `hbheprereco` to `hbhereco` and use it from hcalGlobalRecoTask +hcalOnlyLocalRecoTask = cms.Task(hbheprereco, hfprereco, hfreco, horeco) + +#--- HCAL-only workflow for Run 3 on GPU +from Configuration.ProcessModifiers.gpu_cff import gpu + +_hcalOnlyLocalRecoGPUTask = hcalOnlyLocalRecoTask.copy() +_hcalOnlyLocalRecoGPUTask.add(hbheRecHitProducerGPUTask) +gpu.toReplaceWith(hcalOnlyLocalRecoTask, _hcalOnlyLocalRecoGPUTask) + +from RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi import hcalCPURecHitsProducer as _hcalCPURecHitsProducer +gpu.toReplaceWith(hbheprereco, _hcalCPURecHitsProducer.clone( + recHitsM0LabelIn = "hbheRecHitProducerGPU", + recHitsM0LabelOut = "", + recHitsLegacyLabelOut = "" +)) +#--- for FastSim _fastSim_hcalLocalRecoTask = hcalLocalRecoTask.copyAndExclude([zdcreco]) from Configuration.Eras.Modifier_fastSim_cff import fastSim fastSim.toReplaceWith( hcalLocalRecoTask, _fastSim_hcalLocalRecoTask ) diff --git a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml index ac94d61e12494..2c2ee20aff7d5 100644 --- a/RecoLocalCalo/HcalRecAlgos/BuildFile.xml +++ b/RecoLocalCalo/HcalRecAlgos/BuildFile.xml @@ -1,19 +1,26 @@ - + + + + + + + + + + - - - - - + + + + + - - diff --git a/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml b/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml new file mode 100644 index 0000000000000..30f76131622bd --- /dev/null +++ b/RecoLocalCalo/HcalRecAlgos/bin/BuildFile.xml @@ -0,0 +1,2 @@ + + diff --git a/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc b/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc new file mode 100644 index 0000000000000..39d5195e7438d --- /dev/null +++ b/RecoLocalCalo/HcalRecAlgos/bin/generateQIEShapes.cc @@ -0,0 +1,82 @@ +#include +#include +#include + +// +// Pregenerate QIE Shapes using hardcoded arrays +// This is taken directly from CondFormats/HcalObjects/srcHcalQIEData.cc +// This generation is running upon conditions retrieval typically for the cpu workload +// +// For the GPU workload, it is better to put generated values into constant memory. +// Either this or just use global memory (for global mem, we need getters...). +// Choosign constant memory as thsese +// values are statically known and never change. Any change in any case requires +// recompilation! +// + +const float binMin[32] = {-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 16, 18, 20, 22, 24, 26, 28, 31, 34, 37, 40, 44, 48, 52, 57, 62}; + +const float binMin2[64] = {-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, + 10.5, 11.5, 12.5, 13.5, 14.5, // 16 bins with width 1x + 15.5, 17.5, 19.5, 21.5, 23.5, 25.5, 27.5, 29.5, 31.5, 33.5, 35.5, + 37.5, 39.5, 41.5, 43.5, 45.5, 47.5, 49.5, 51.5, 53.5, // 20 bins with width 2x + 55.5, 59.5, 63.5, 67.5, 71.5, 75.5, 79.5, 83.5, 87.5, 91.5, 95.5, + 99.5, 103.5, 107.5, 111.5, 115.5, 119.5, 123.5, 127.5, 131.5, 135.5, // 21 bins with width 4x + 139.5, 147.5, 155.5, 163.5, 171.5, 179.5, 187.5}; // 7 bins with width 8x + +constexpr uint32_t nbins_qie8 = 32; +constexpr uint32_t nbins_qie11 = 64; + +void dump(std::vector const& vec, std::string const& name) { + std::stringstream str; + str << "float const " << name << "[" << vec.size() << "] = {"; + uint32_t counter = 0; + for (auto const& value : vec) { + if (counter % 8 == 0) + str << std::endl; + if (counter == vec.size() - 1) + str << value; + else + str << value << ", "; + counter++; + } + str << "};"; + std::cout << str.str() << std::endl; +} + +void generate(uint32_t const nbins, float const* initValues, std::vector& values) { + // preset the first range + for (uint32_t adc = 0; adc < nbins; adc++) + values[adc] = initValues[adc]; + + // do the rest + int scale = 1; + for (uint32_t range = 1; range < 4; range++) { + int factor = nbins == 32 ? 5 : 8; + scale *= factor; + + auto const index_offset = range * nbins; + uint32_t const overlap = nbins == 32 ? 2 : 3; + values[index_offset] = values[index_offset - overlap]; + + for (uint32_t i = 1; i < nbins; i++) + values[index_offset + i] = values[index_offset + i - 1] + scale * (values[i] - values[i - 1]); + } + + values[nbins * 4] = 2 * values[nbins * 4 - 1] - values[nbins * 4 - 2]; +} + +int main(int argc, char* argv[]) { + // + // run 128 bins + // + std::vector valuesqie8(nbins_qie8 * 4 + 1), valuesqie11(nbins_qie11 * 4 + 1); + generate(nbins_qie8, binMin, valuesqie8); + generate(nbins_qie11, binMin2, valuesqie11); + + dump(valuesqie8, std::string{"qie8shape"}); + dump(valuesqie11, std::string{"qie11shape"}); + + return 0; +} diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h new file mode 100644 index 0000000000000..98ce9c0b660f1 --- /dev/null +++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h @@ -0,0 +1,37 @@ +#ifndef RecoLocalCalo_HcalRecAlgos_interface_HcalMahiPulseOffsetsGPU_h +#define RecoLocalCalo_HcalRecAlgos_interface_HcalMahiPulseOffsetsGPU_h + +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class HcalMahiPulseOffsetsGPU { +public: + struct Product { + ~Product(); + int* values; + }; + +#ifndef __CUDACC__ + // rearrange reco params + HcalMahiPulseOffsetsGPU(edm::ParameterSet const&); + + // will trigger deallocation of Product thru ~Product + ~HcalMahiPulseOffsetsGPU() = default; + + std::vector> const& getValues() const { return values_; } + + // get device pointers + Product const& getProduct(cudaStream_t) const; + +private: + std::vector> values_; + + cms::cuda::ESProduct product_; +#endif +}; + +#endif diff --git a/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h new file mode 100644 index 0000000000000..965fb873bcf88 --- /dev/null +++ b/RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h @@ -0,0 +1,54 @@ +#ifndef RecoLocalCalo_HcalRecAlgos_interface_HcalRecoParamsWithPulseShapesGPU_h +#define RecoLocalCalo_HcalRecAlgos_interface_HcalRecoParamsWithPulseShapesGPU_h + +#ifndef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" +#endif + +class HcalRecoParams; + +// +// TODO: HcalPulseShapes will need to be used via ESSource +// This is a workaround: precompute/store/transfer what's needed only +// +class HcalRecoParamsWithPulseShapesGPU { +public: + struct Product { + ~Product(); + uint32_t *param1 = nullptr, *param2 = nullptr; + uint32_t *ids = nullptr; + + // These guys come directly from PulseShapeFunctor class + float *acc25nsVec = nullptr, *diff25nsItvlVec = nullptr, *accVarLenIdxMinusOneVec = nullptr, + *diffVarItvlIdxMinusOneVec = nullptr, *accVarLenIdxZEROVec = nullptr, *diffVarItvlIdxZEROVec = nullptr; + }; + +#ifndef __CUDACC__ + // rearrange reco params + HcalRecoParamsWithPulseShapesGPU(HcalRecoParams const &); + + // will trigger deallocation of Product thru ~Product + ~HcalRecoParamsWithPulseShapesGPU() = default; + + // get device pointers + Product const &getProduct(cudaStream_t) const; + +private: + uint64_t totalChannels_; // hb + he + std::vector> param1_; + std::vector> param2_; + std::vector> ids_; + + std::vector> acc25nsVec_; // 256 + std::vector> diff25nsItvlVec_; // 256 + std::vector> accVarLenIdxMinusOneVec_; // 25 + std::vector> diffVarItvlIdxMinusOneVec_; // 25 + std::vector> accVarLenIdxZEROVec_; // 25 + std::vector> diffVarItvlIdxZEROVec_; // 25 + + cms::cuda::ESProduct product_; +#endif +}; + +#endif diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc new file mode 100644 index 0000000000000..005a77932f6df --- /dev/null +++ b/RecoLocalCalo/HcalRecAlgos/src/HcalMahiPulseOffsetsGPU.cc @@ -0,0 +1,35 @@ +#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +// FIXME: add proper getters to conditions +HcalMahiPulseOffsetsGPU::HcalMahiPulseOffsetsGPU(edm::ParameterSet const& ps) { + auto const& values = ps.getParameter>("pulseOffsets"); + values_.resize(values.size()); + std::copy(values.begin(), values.end(), values_.begin()); +} + +HcalMahiPulseOffsetsGPU::Product::~Product() { + // deallocation + cudaCheck(cudaFree(values)); +} + +HcalMahiPulseOffsetsGPU::Product const& HcalMahiPulseOffsetsGPU::getProduct(cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](HcalMahiPulseOffsetsGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.values, this->values_.size() * sizeof(int))); + + // transfer + cudaCheck(cudaMemcpyAsync(product.values, + this->values_.data(), + this->values_.size() * sizeof(int), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; +} + +TYPELOOKUP_DATA_REG(HcalMahiPulseOffsetsGPU); diff --git a/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc new file mode 100644 index 0000000000000..b42621b98908e --- /dev/null +++ b/RecoLocalCalo/HcalRecAlgos/src/HcalRecoParamsWithPulseShapesGPU.cc @@ -0,0 +1,222 @@ +#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h" + +#include "CondFormats/HcalObjects/interface/HcalRecoParams.h" +#include "CalibCalorimetry/HcalAlgos/interface/HcalPulseShapes.h" +#include "RecoLocalCalo/HcalRecAlgos/interface/PulseShapeFunctor.h" + +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include + +// FIXME: add proper getters to conditions +HcalRecoParamsWithPulseShapesGPU::HcalRecoParamsWithPulseShapesGPU(HcalRecoParams const& recoParams) + : totalChannels_{recoParams.getAllContainers()[0].second.size() + recoParams.getAllContainers()[1].second.size()}, + param1_(totalChannels_), + param2_(totalChannels_), + ids_(totalChannels_) { +#ifdef HCAL_MAHI_CPUDEBUG + printf("hello from a reco params with pulse shapes\n"); +#endif + + auto const containers = recoParams.getAllContainers(); + + HcalPulseShapes pulseShapes; + std::unordered_map idCache; + + // fill in eb + auto const& barrelValues = containers[0].second; + for (uint64_t i = 0; i < barrelValues.size(); ++i) { + param1_[i] = barrelValues[i].param1(); + param2_[i] = barrelValues[i].param2(); + + auto const pulseShapeId = barrelValues[i].pulseShapeID(); + // FIXME: 0 throws upon look up to HcalPulseShapes + // although comments state that 0 is reserved, + // HcalPulseShapes::getShape throws on 0! + if (pulseShapeId == 0) { + ids_[i] = 0; + continue; + } + if (auto const iter = idCache.find(pulseShapeId); iter == idCache.end()) { + // new guy + auto const newId = idCache.size(); + idCache[pulseShapeId] = newId; + // this will be the id + ids_[i] = newId; + + // resize value arrays + acc25nsVec_.resize(acc25nsVec_.size() + hcal::constants::maxPSshapeBin); + diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + hcal::constants::maxPSshapeBin); + accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + hcal::constants::nsPerBX); + diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + hcal::constants::nsPerBX); + accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + hcal::constants::nsPerBX); + diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + hcal::constants::nsPerBX); + + // precompute and get values from the functor + auto const& pulseShape = pulseShapes.getShape(pulseShapeId); + FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, hcal::constants::maxSamples}; + auto const offset256 = newId * hcal::constants::maxPSshapeBin; + auto const offset25 = newId * hcal::constants::nsPerBX; + auto const numShapes = newId; + for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) { + acc25nsVec_[offset256 * numShapes + i] = functor.acc25nsVec()[i]; + diff25nsItvlVec_[offset256 * numShapes + i] = functor.diff25nsItvlVec()[i]; + } + + for (int i = 0; i < hcal::constants::nsPerBX; i++) { + accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.accVarLenIdxMinusOneVec()[i]; + diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxMinusOneVec()[i]; + accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.accVarLenIdxZEROVec()[i]; + diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxZEROVec()[i]; + } + } else { + // already recorded this pulse shape, just set id + ids_[i] = iter->second; + } +#ifdef HCAL_MAHI_CPUDEBUG + if (barrelValues[i].rawId() == DETID_TO_DEBUG) { + printf("recoShapeId = %u myid = %u\n", pulseShapeId, ids_[i]); + } +#endif + } + + // fill in ee + auto const& endcapValues = containers[1].second; + auto const offset = barrelValues.size(); + for (uint64_t i = 0; i < endcapValues.size(); ++i) { + param1_[i + offset] = endcapValues[i].param1(); + param2_[i + offset] = endcapValues[i].param2(); + + auto const pulseShapeId = endcapValues[i].pulseShapeID(); + // FIXME: 0 throws upon look up to HcalPulseShapes + // although comments state that 0 is reserved, + // HcalPulseShapes::getShape throws on 0! + if (pulseShapeId == 0) { + ids_[i + offset] = 0; + continue; + } + if (auto const iter = idCache.find(pulseShapeId); iter == idCache.end()) { + // new guy + auto const newId = idCache.size(); + idCache[pulseShapeId] = newId; + // this will be the id + ids_[i + offset] = newId; + + // resize value arrays + acc25nsVec_.resize(acc25nsVec_.size() + hcal::constants::maxPSshapeBin); + diff25nsItvlVec_.resize(diff25nsItvlVec_.size() + hcal::constants::maxPSshapeBin); + accVarLenIdxMinusOneVec_.resize(accVarLenIdxMinusOneVec_.size() + hcal::constants::nsPerBX); + diffVarItvlIdxMinusOneVec_.resize(diffVarItvlIdxMinusOneVec_.size() + hcal::constants::nsPerBX); + accVarLenIdxZEROVec_.resize(accVarLenIdxZEROVec_.size() + hcal::constants::nsPerBX); + diffVarItvlIdxZEROVec_.resize(diffVarItvlIdxZEROVec_.size() + hcal::constants::nsPerBX); + + // precompute and get values from the functor + auto const& pulseShape = pulseShapes.getShape(pulseShapeId); + FitterFuncs::PulseShapeFunctor functor{pulseShape, false, false, false, 1, 0, 0, hcal::constants::maxSamples}; + auto const offset256 = newId * hcal::constants::maxPSshapeBin; + auto const offset25 = newId * hcal::constants::nsPerBX; + auto const numShapes = newId; + for (int i = 0; i < hcal::constants::maxPSshapeBin; i++) { + acc25nsVec_[offset256 * numShapes + i] = functor.acc25nsVec()[i]; + diff25nsItvlVec_[offset256 * numShapes + i] = functor.diff25nsItvlVec()[i]; + } + + for (int i = 0; i < hcal::constants::nsPerBX; i++) { + accVarLenIdxMinusOneVec_[offset25 * numShapes + i] = functor.accVarLenIdxMinusOneVec()[i]; + diffVarItvlIdxMinusOneVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxMinusOneVec()[i]; + accVarLenIdxZEROVec_[offset25 * numShapes + i] = functor.accVarLenIdxZEROVec()[i]; + diffVarItvlIdxZEROVec_[offset25 * numShapes + i] = functor.diffVarItvlIdxZEROVec()[i]; + } + } else { + // already recorded this pulse shape, just set id + ids_[i + offset] = iter->second; + } + } + +#ifdef HCAL_MAHI_CPUDEBUG + for (auto const& p : idCache) + printf("recoPulseShapeId = %u id = %u\n", p.first, p.second); +#endif +} + +HcalRecoParamsWithPulseShapesGPU::Product::~Product() { + // deallocation + cudaCheck(cudaFree(param1)); + cudaCheck(cudaFree(param2)); + cudaCheck(cudaFree(ids)); + cudaCheck(cudaFree(acc25nsVec)); + cudaCheck(cudaFree(diff25nsItvlVec)); + cudaCheck(cudaFree(accVarLenIdxMinusOneVec)); + cudaCheck(cudaFree(diffVarItvlIdxMinusOneVec)); + cudaCheck(cudaFree(accVarLenIdxZEROVec)); + cudaCheck(cudaFree(diffVarItvlIdxZEROVec)); +} + +HcalRecoParamsWithPulseShapesGPU::Product const& HcalRecoParamsWithPulseShapesGPU::getProduct( + cudaStream_t cudaStream) const { + auto const& product = product_.dataForCurrentDeviceAsync( + cudaStream, [this](HcalRecoParamsWithPulseShapesGPU::Product& product, cudaStream_t cudaStream) { + // malloc + cudaCheck(cudaMalloc((void**)&product.param1, this->param1_.size() * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&product.param2, this->param2_.size() * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&product.ids, this->ids_.size() * sizeof(uint32_t))); + cudaCheck(cudaMalloc((void**)&product.acc25nsVec, this->acc25nsVec_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.diff25nsItvlVec, this->diff25nsItvlVec_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.accVarLenIdxMinusOneVec, + this->accVarLenIdxMinusOneVec_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.diffVarItvlIdxMinusOneVec, + this->diffVarItvlIdxMinusOneVec_.size() * sizeof(float))); + cudaCheck(cudaMalloc((void**)&product.accVarLenIdxZEROVec, this->accVarLenIdxZEROVec_.size() * sizeof(float))); + cudaCheck( + cudaMalloc((void**)&product.diffVarItvlIdxZEROVec, this->diffVarItvlIdxZEROVec_.size() * sizeof(float))); + + // transfer + cudaCheck(cudaMemcpyAsync(product.param1, + this->param1_.data(), + this->param1_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.param2, + this->param2_.data(), + this->param2_.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync( + product.ids, this->ids_.data(), this->ids_.size() * sizeof(uint32_t), cudaMemcpyHostToDevice, cudaStream)); + cudaCheck(cudaMemcpyAsync(product.acc25nsVec, + this->acc25nsVec_.data(), + this->acc25nsVec_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.diff25nsItvlVec, + this->diff25nsItvlVec_.data(), + this->diff25nsItvlVec_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.accVarLenIdxMinusOneVec, + this->accVarLenIdxMinusOneVec_.data(), + this->accVarLenIdxMinusOneVec_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.diffVarItvlIdxMinusOneVec, + this->diffVarItvlIdxMinusOneVec_.data(), + this->diffVarItvlIdxMinusOneVec_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.accVarLenIdxZEROVec, + this->accVarLenIdxZEROVec_.data(), + this->accVarLenIdxZEROVec_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + cudaCheck(cudaMemcpyAsync(product.diffVarItvlIdxZEROVec, + this->diffVarItvlIdxZEROVec_.data(), + this->diffVarItvlIdxZEROVec_.size() * sizeof(float), + cudaMemcpyHostToDevice, + cudaStream)); + }); + + return product; +} + +TYPELOOKUP_DATA_REG(HcalRecoParamsWithPulseShapesGPU); diff --git a/RecoLocalCalo/HcalRecProducers/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/BuildFile.xml index c3ae589a0c0a7..e34037ecbcbb5 100644 --- a/RecoLocalCalo/HcalRecProducers/BuildFile.xml +++ b/RecoLocalCalo/HcalRecProducers/BuildFile.xml @@ -1,8 +1,25 @@ - + + + + + + - - + + + + + + + + + - + + + + + + diff --git a/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml b/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml new file mode 100644 index 0000000000000..a804a7fe4b70e --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/bin/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp new file mode 100644 index 0000000000000..fe6aabf928aca --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/bin/makeHcalRecHitGpuValidationPlots.cpp @@ -0,0 +1,282 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h" +//#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h" + +#define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last) + +#define CREATE_HIST_2D(varname, nbins, first, last) \ + auto varname = new TH2D(#varname, #varname, nbins, first, last, nbins, first, last) + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "run with: ./ \n"; + exit(0); + } + + std::string inFileName{argv[1]}; + std::string outFileName{argv[2]}; + + // branches to use + edm::Wrapper* wcpu = nullptr; + edm::Wrapper* wgpu = nullptr; + // edm::Wrapper>> *wgpu=nullptr; + + // prep output + TFile rfout{outFileName.c_str(), "recreate"}; + + CREATE_HIST_1D(hEnergyM0HBGPU, 1000, 0, 100); + CREATE_HIST_1D(hEnergyM0HEGPU, 1000, 0, 100); + CREATE_HIST_1D(hEnergyM0HBCPU, 1000, 0, 100); + CREATE_HIST_1D(hEnergyM0HECPU, 1000, 0, 100); + + CREATE_HIST_1D(hEnergyHBGPU, 1000, 0, 100); + CREATE_HIST_1D(hEnergyHBCPU, 1000, 0, 100); + CREATE_HIST_1D(hEnergyHEGPU, 1000, 0, 100); + CREATE_HIST_1D(hEnergyHECPU, 1000, 0, 100); + + CREATE_HIST_1D(hChi2HBGPU, 1000, 0, 100); + CREATE_HIST_1D(hChi2HBCPU, 1000, 0, 100); + CREATE_HIST_1D(hChi2HEGPU, 1000, 0, 100); + CREATE_HIST_1D(hChi2HECPU, 1000, 0, 100); + + CREATE_HIST_2D(hEnergyHBGPUvsCPU, 1000, 0, 100); + CREATE_HIST_2D(hEnergyHEGPUvsCPU, 1000, 0, 100); + CREATE_HIST_2D(hChi2HBGPUvsCPU, 1000, 0, 100); + CREATE_HIST_2D(hChi2HEGPUvsCPU, 1000, 0, 100); + + CREATE_HIST_2D(hEnergyM0HBGPUvsCPU, 1000, 0, 100); + CREATE_HIST_2D(hEnergyM0HEGPUvsCPU, 1000, 0, 100); + + // prep input + TFile rfin{inFileName.c_str()}; + TTree* rt = (TTree*)rfin.Get("Events"); + rt->SetBranchAddress("HBHERecHitsSorted_hcalCPURecHitsProducer_recHitsLegacyHBHE_RECO.", &wgpu); + // rt->SetBranchAddress("hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalRecHitCollection_hcalCPURecHitsProducer_recHitsM0LabelOut_RECO.", &wgpu); + rt->SetBranchAddress("HBHERecHitsSorted_hbheprereco__RECO.", &wcpu); + + // accumulate + auto const nentries = rt->GetEntries(); + std::cout << ">>> nentries = " << nentries << std::endl; + for (int ie = 0; ie < nentries; ++ie) { + rt->GetEntry(ie); + + auto const& gpuProduct = wgpu->bareProduct(); + auto const& cpuProduct = wcpu->bareProduct(); + + auto const ncpu = cpuProduct.size(); + auto const ngpu = gpuProduct.size(); + // auto const ngpu = gpuProduct.energy.size(); + + if (ngpu != ncpu) { + std::cerr << "*** mismatch in number of rec hits for event " << ie << std::endl + << ">>> ngpu = " << ngpu << std::endl + << ">>> ncpu = " << ncpu << std::endl; + } + + for (uint32_t ich = 0; ich < ncpu; ich++) { + auto const& cpurh = cpuProduct[ich]; + auto const& did = cpurh.id(); + auto iter2gpu = gpuProduct.find(did); + // auto iter2idgpu = std::find( + // gpuProduct.did.begin(), gpuProduct.did.end(), did.rawId()); + + if (iter2gpu == gpuProduct.end()) { + std::cerr << "missing " << did << std::endl; + continue; + } + + assert(iter2gpu->id().rawId() == did.rawId()); + + auto const gpu_energy_m0 = iter2gpu->eraw(); + auto const cpu_energy_m0 = cpurh.eraw(); + auto const gpu_energy = iter2gpu->energy(); + auto const cpu_energy = cpurh.energy(); + auto const gpu_chi2 = iter2gpu->chi2(); + auto const cpu_chi2 = cpurh.chi2(); + + if (did.subdetId() == HcalBarrel) { + hEnergyM0HBGPU->Fill(gpu_energy_m0); + hEnergyM0HBCPU->Fill(cpu_energy_m0); + hEnergyM0HBGPUvsCPU->Fill(cpu_energy_m0, gpu_energy_m0); + + hEnergyHBGPU->Fill(gpu_energy); + hEnergyHBCPU->Fill(cpu_energy); + hEnergyHBGPUvsCPU->Fill(cpu_energy, gpu_energy); + hChi2HBGPU->Fill(gpu_chi2); + hChi2HBCPU->Fill(cpu_chi2); + hChi2HBGPUvsCPU->Fill(cpu_chi2, gpu_chi2); + } else if (did.subdetId() == HcalEndcap) { + hEnergyM0HEGPU->Fill(gpu_energy_m0); + hEnergyM0HECPU->Fill(cpu_energy_m0); + hEnergyM0HEGPUvsCPU->Fill(cpu_energy_m0, gpu_energy_m0); + + hEnergyHEGPU->Fill(gpu_energy); + hEnergyHECPU->Fill(cpu_energy); + hEnergyHEGPUvsCPU->Fill(cpu_energy, gpu_energy); + + hChi2HEGPU->Fill(gpu_chi2); + hChi2HECPU->Fill(cpu_chi2); + hChi2HEGPUvsCPU->Fill(cpu_chi2, gpu_chi2); + } + } + } + + { + TCanvas c{"plots", "plots", 4200, 6200}; + c.Divide(4, 3); + c.cd(1); + { + gPad->SetLogy(); + hEnergyM0HBCPU->SetLineColor(kBlack); + hEnergyM0HBCPU->SetLineWidth(1.); + hEnergyM0HBCPU->Draw(""); + hEnergyM0HBGPU->SetLineColor(kBlue); + hEnergyM0HBGPU->SetLineWidth(1.); + hEnergyM0HBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hEnergyM0HBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(2); + { + gPad->SetLogz(); + hEnergyM0HBGPUvsCPU->GetXaxis()->SetTitle("cpu"); + hEnergyM0HBGPUvsCPU->GetYaxis()->SetTitle("gpu"); + hEnergyM0HBGPUvsCPU->Draw("colz"); + } + c.cd(3); + { + gPad->SetLogy(); + hEnergyM0HECPU->SetLineColor(kBlack); + hEnergyM0HECPU->SetLineWidth(1.); + hEnergyM0HECPU->Draw(""); + hEnergyM0HEGPU->SetLineColor(kBlue); + hEnergyM0HEGPU->SetLineWidth(1.); + hEnergyM0HEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hEnergyM0HEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(4); + { + gPad->SetLogz(); + hEnergyM0HEGPUvsCPU->GetXaxis()->SetTitle("cpu"); + hEnergyM0HEGPUvsCPU->GetYaxis()->SetTitle("gpu"); + hEnergyM0HEGPUvsCPU->Draw("colz"); + } + c.cd(5); + { + gPad->SetLogy(); + hEnergyHBCPU->SetLineColor(kBlack); + hEnergyHBCPU->SetLineWidth(1.); + hEnergyHBCPU->Draw(""); + hEnergyHBGPU->SetLineColor(kBlue); + hEnergyHBGPU->SetLineWidth(1.); + hEnergyHBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hEnergyHBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(6); + { + gPad->SetLogz(); + hEnergyHBGPUvsCPU->GetXaxis()->SetTitle("cpu"); + hEnergyHBGPUvsCPU->GetYaxis()->SetTitle("gpu"); + hEnergyHBGPUvsCPU->Draw("colz"); + } + c.cd(7); + { + gPad->SetLogy(); + hEnergyHECPU->SetLineColor(kBlack); + hEnergyHECPU->SetLineWidth(1.); + hEnergyHECPU->Draw(""); + hEnergyHEGPU->SetLineColor(kBlue); + hEnergyHEGPU->SetLineWidth(1.); + hEnergyHEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hEnergyHEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(8); + { + gPad->SetLogz(); + hEnergyHEGPUvsCPU->GetXaxis()->SetTitle("cpu"); + hEnergyHEGPUvsCPU->GetYaxis()->SetTitle("gpu"); + hEnergyHEGPUvsCPU->Draw("colz"); + } + c.cd(9); + { + gPad->SetLogy(); + hChi2HBCPU->SetLineColor(kBlack); + hChi2HBCPU->SetLineWidth(1.); + hChi2HBCPU->Draw(""); + hChi2HBGPU->SetLineColor(kBlue); + hChi2HBGPU->SetLineWidth(1.); + hChi2HBGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hChi2HBGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(10); + { + gPad->SetLogz(); + hChi2HBGPUvsCPU->GetXaxis()->SetTitle("cpu"); + hChi2HBGPUvsCPU->GetYaxis()->SetTitle("gpu"); + hChi2HBGPUvsCPU->Draw("colz"); + } + c.cd(11); + { + gPad->SetLogy(); + hChi2HECPU->SetLineColor(kBlack); + hChi2HECPU->SetLineWidth(1.); + hChi2HECPU->Draw(""); + hChi2HEGPU->SetLineColor(kBlue); + hChi2HEGPU->SetLineWidth(1.); + hChi2HEGPU->Draw("sames"); + gPad->Update(); + auto stats = (TPaveStats*)hChi2HEGPU->FindObject("stats"); + auto y2 = stats->GetY2NDC(); + auto y1 = stats->GetY1NDC(); + stats->SetY2NDC(y1); + stats->SetY1NDC(y1 - (y2 - y1)); + } + c.cd(12); + { + gPad->SetLogz(); + hChi2HEGPUvsCPU->GetXaxis()->SetTitle("cpu"); + hChi2HEGPUvsCPU->GetYaxis()->SetTitle("gpu"); + hChi2HEGPUvsCPU->Draw("colz"); + } + c.SaveAs("plots.pdf"); + } + + rfin.Close(); + rfout.Write(); + rfout.Close(); +} diff --git a/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py new file mode 100644 index 0000000000000..d938653d5a15e --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/python/hbheRecHitProducerGPUTask_cff.py @@ -0,0 +1,65 @@ +import FWCore.ParameterSet.Config as cms + +# Run 3 HCAL workflow on GPU + +# EventSetup modules used by HBHERecHitProducerGPU +from RecoLocalCalo.HcalRecProducers.hcalGainsGPUESProducer_cfi import hcalGainsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalGainWidthsGPUESProducer_cfi import hcalGainWidthsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalLUTCorrsGPUESProducer_cfi import hcalLUTCorrsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalsGPUESProducer_cfi import hcalConvertedPedestalsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalsGPUESProducer_cfi import hcalConvertedEffectivePedestalsGPUESProducer +hcalConvertedEffectivePedestalsGPUESProducer.label0 = "withTopoEff" + +from RecoLocalCalo.HcalRecProducers.hcalConvertedPedestalWidthsGPUESProducer_cfi import hcalConvertedPedestalWidthsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalConvertedEffectivePedestalWidthsGPUESProducer_cfi import hcalConvertedEffectivePedestalWidthsGPUESProducer +hcalConvertedEffectivePedestalWidthsGPUESProducer.label0 = "withTopoEff" +hcalConvertedEffectivePedestalWidthsGPUESProducer.label1 = "withTopoEff" + +from RecoLocalCalo.HcalRecProducers.hcalQIECodersGPUESProducer_cfi import hcalQIECodersGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalRecoParamsWithPulseShapesGPUESProducer_cfi import hcalRecoParamsWithPulseShapesGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalRespCorrsGPUESProducer_cfi import hcalRespCorrsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalTimeCorrsGPUESProducer_cfi import hcalTimeCorrsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalQIETypesGPUESProducer_cfi import hcalQIETypesGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalSiPMParametersGPUESProducer_cfi import hcalSiPMParametersGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalSiPMCharacteristicsGPUESProducer_cfi import hcalSiPMCharacteristicsGPUESProducer +from RecoLocalCalo.HcalRecProducers.hcalMahiPulseOffsetsGPUESProducer_cfi import hcalMahiPulseOffsetsGPUESProducer + +# convert the HBHE digis into SoA format, and copy them from CPU to GPU +from EventFilter.HcalRawToDigi.hcalDigisProducerGPU_cfi import hcalDigisProducerGPU as _hcalDigisProducerGPU +hcalDigisGPU = _hcalDigisProducerGPU.clone( + digisLabelF01HE = "", + digisLabelF5HB = "", + digisLabelF3HB = "" +) + +# run the HCAL local reconstruction (MAHI) on GPU +from RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPU_cfi import hbheRecHitProducerGPU as _hbheRecHitProducerGPU +hbheRecHitProducerGPU = _hbheRecHitProducerGPU.clone( + digisLabelF01HE = "hcalDigisGPU", + digisLabelF5HB = "hcalDigisGPU", + digisLabelF3HB = "hcalDigisGPU", + recHitsLabelM0HBHE = "" +) + +# Tasks and Sequences +hbheRecHitProducerGPUTask = cms.Task( + hcalGainsGPUESProducer, + hcalGainWidthsGPUESProducer, + hcalLUTCorrsGPUESProducer, + hcalConvertedPedestalsGPUESProducer, + hcalConvertedEffectivePedestalsGPUESProducer, + hcalConvertedPedestalWidthsGPUESProducer, + hcalConvertedEffectivePedestalWidthsGPUESProducer, + hcalQIECodersGPUESProducer, + hcalRecoParamsWithPulseShapesGPUESProducer, + hcalRespCorrsGPUESProducer, + hcalTimeCorrsGPUESProducer, + hcalQIETypesGPUESProducer, + hcalSiPMParametersGPUESProducer, + hcalSiPMCharacteristicsGPUESProducer, + hcalMahiPulseOffsetsGPUESProducer, + hcalDigisGPU, + hbheRecHitProducerGPU +) + +hbheRecHitProducerGPUSequence = cms.Sequence(hbheRecHitProducerGPUTask) diff --git a/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h new file mode 100644 index 0000000000000..807c42b057fa3 --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h @@ -0,0 +1,115 @@ +#ifndef RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h +#define RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h + +#include +#include + +#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h" +#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h" +#include "CalibCalorimetry/HcalAlgos/interface/HcalTimeSlew.h" +#include "CondFormats/DataRecord/interface/HcalCombinedRecordsGPU.h" +#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h" +#include "CondFormats/DataRecord/interface/HcalGainsRcd.h" +#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h" +#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h" +#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h" +#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h" +#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h" +#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h" +#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h" +#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h" +#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h" +#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h" +#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h" +#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h" +#include "Geometry/CaloTopology/interface/HcalTopology.h" +#include "Geometry/HcalCommonData/interface/HcalDDDRecConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h" +#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h" + +namespace hcal { + namespace reconstruction { + + struct ConditionsProducts { + HcalGainWidthsGPU::Product const& gainWidths; + HcalGainsGPU::Product const& gains; + HcalLUTCorrsGPU::Product const& lutCorrs; + HcalConvertedPedestalWidthsGPU::Product const& pedestalWidths; + HcalConvertedEffectivePedestalWidthsGPU::Product const& effectivePedestalWidths; + HcalConvertedPedestalsGPU::Product const& pedestals; + HcalQIECodersGPU::Product const& qieCoders; + HcalRecoParamsWithPulseShapesGPU::Product const& recoParams; + HcalRespCorrsGPU::Product const& respCorrs; + HcalTimeCorrsGPU::Product const& timeCorrs; + HcalQIETypesGPU::Product const& qieTypes; + HcalSiPMParametersGPU::Product const& sipmParameters; + HcalSiPMCharacteristicsGPU::Product const& sipmCharacteristics; + HcalConvertedPedestalsGPU::Product const* convertedEffectivePedestals; + HcalTopology const* topology; + HcalDDDRecConstants const* recConstants; + uint32_t offsetForHashes; + HcalMahiPulseOffsetsGPU::Product const& pulseOffsets; + std::vector> const& pulseOffsetsHost; + }; + + struct ConfigParameters { + uint32_t maxChannels; + uint32_t maxTimeSamples; + uint32_t kprep1dChannelsPerBlock; + int sipmQTSShift; + int sipmQNTStoSum; + int firstSampleShift; + bool useEffectivePedestals; + + float meanTime; + float timeSigmaSiPM, timeSigmaHPD; + float ts4Thresh; + + std::array kernelMinimizeThreads; + + // FIXME: + // - add "getters" to HcalTimeSlew calib formats + // - add ES Producer to consume what is produced above not to replicate. + // which ones to use is hardcoded, therefore no need to send those to the device + bool applyTimeSlew; + float tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew; + }; + + struct OutputDataGPU { + RecHitCollection<::calo::common::DevStoragePolicy> recHits; + + void allocate(ConfigParameters const& config, cudaStream_t cudaStream) { + recHits.energy = cms::cuda::make_device_unique(config.maxChannels, cudaStream); + recHits.chi2 = cms::cuda::make_device_unique(config.maxChannels, cudaStream); + recHits.energyM0 = cms::cuda::make_device_unique(config.maxChannels, cudaStream); + recHits.timeM0 = cms::cuda::make_device_unique(config.maxChannels, cudaStream); + recHits.did = cms::cuda::make_device_unique(config.maxChannels, cudaStream); + } + }; + + struct ScratchDataGPU { + cms::cuda::device::unique_ptr amplitudes, noiseTerms, pulseMatrices, pulseMatricesM, pulseMatricesP; + cms::cuda::device::unique_ptr soiSamples; + }; + + struct InputDataGPU { + DigiCollection const& f01HEDigis; + DigiCollection const& f5HBDigis; + DigiCollection const& f3HBDigis; + }; + + } // namespace reconstruction +} // namespace hcal + +#endif // RecoLocalCalo_HcalRecProducers_src_DeclsForKernels_h diff --git a/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc new file mode 100644 index 0000000000000..af5398e49fa8f --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc @@ -0,0 +1,246 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "SimpleAlgoGPU.h" + +class HBHERecHitProducerGPU : public edm::stream::EDProducer { +public: + explicit HBHERecHitProducerGPU(edm::ParameterSet const&); + ~HBHERecHitProducerGPU() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + + using IProductTypef01 = cms::cuda::Product>; + edm::EDGetTokenT digisTokenF01HE_; + + using IProductTypef5 = cms::cuda::Product>; + edm::EDGetTokenT digisTokenF5HB_; + + using IProductTypef3 = cms::cuda::Product>; + edm::EDGetTokenT digisTokenF3HB_; + + using RecHitType = hcal::RecHitCollection; + using OProductType = cms::cuda::Product; + edm::EDPutTokenT rechitsM0Token_; + + hcal::reconstruction::ConfigParameters configParameters_; + hcal::reconstruction::OutputDataGPU outputGPU_; + cms::cuda::ContextState cudaState_; +}; + +HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps) + : digisTokenF01HE_{consumes(ps.getParameter("digisLabelF01HE"))}, + digisTokenF5HB_{consumes(ps.getParameter("digisLabelF5HB"))}, + digisTokenF3HB_{consumes(ps.getParameter("digisLabelF3HB"))}, + rechitsM0Token_{produces(ps.getParameter("recHitsLabelM0HBHE"))} { + configParameters_.maxChannels = ps.getParameter("maxChannels"); + configParameters_.maxTimeSamples = ps.getParameter("maxTimeSamples"); + configParameters_.kprep1dChannelsPerBlock = ps.getParameter("kprep1dChannelsPerBlock"); + configParameters_.sipmQTSShift = ps.getParameter("sipmQTSShift"); + configParameters_.sipmQNTStoSum = ps.getParameter("sipmQNTStoSum"); + configParameters_.firstSampleShift = ps.getParameter("firstSampleShift"); + configParameters_.useEffectivePedestals = ps.getParameter("useEffectivePedestals"); + + configParameters_.meanTime = ps.getParameter("meanTime"); + configParameters_.timeSigmaSiPM = ps.getParameter("timeSigmaSiPM"); + configParameters_.timeSigmaHPD = ps.getParameter("timeSigmaHPD"); + configParameters_.ts4Thresh = ps.getParameter("ts4Thresh"); + + configParameters_.applyTimeSlew = ps.getParameter("applyTimeSlew"); + auto const tzeroValues = ps.getParameter>("tzeroTimeSlewParameters"); + auto const slopeValues = ps.getParameter>("slopeTimeSlewParameters"); + auto const tmaxValues = ps.getParameter>("tmaxTimeSlewParameters"); + + configParameters_.tzeroTimeSlew = tzeroValues[HcalTimeSlew::Medium]; + configParameters_.slopeTimeSlew = slopeValues[HcalTimeSlew::Medium]; + configParameters_.tmaxTimeSlew = tmaxValues[HcalTimeSlew::Medium]; + + auto threadsMinimize = ps.getParameter>("kernelMinimizeThreads"); + configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0]; + configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1]; + configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2]; +} + +HBHERecHitProducerGPU::~HBHERecHitProducerGPU() {} + +void HBHERecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& cdesc) { + edm::ParameterSetDescription desc; + desc.add("maxChannels", 10000u); + desc.add("maxTimeSamples", 10); + desc.add("kprep1dChannelsPerBlock", 32); + desc.add("digisLabelF01HE", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"}); + desc.add("digisLabelF5HB", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"}); + desc.add("digisLabelF3HB", edm::InputTag{"hcalRawToDigiGPU", "f3HBDigisGPU"}); + desc.add("recHitsLabelM0HBHE", "recHitsM0HBHE"); + desc.add("sipmQTSShift", 0); + desc.add("sipmQNTStoSum", 3); + desc.add("firstSampleShift", 0); + desc.add("useEffectivePedestals", true); + + desc.add("meanTime", 0.f); + desc.add("timeSigmaSiPM", 2.5f); + desc.add("timeSigmaHPD", 5.0f); + desc.add("ts4Thresh", 0.0); + + desc.add("applyTimeSlew", true); + desc.add>("tzeroTimeSlewParameters", {23.960177, 11.977461, 9.109694}); + desc.add>("slopeTimeSlewParameters", {-3.178648, -1.5610227, -1.075824}); + desc.add>("tmaxTimeSlewParameters", {16.00, 10.00, 6.25}); + desc.add>("kernelMinimizeThreads", {16, 1, 1}); + + cdesc.addWithDefaultLabel(desc); +} + +void HBHERecHitProducerGPU::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder holder) { +#ifdef HCAL_MAHI_CPUDEBUG + auto start = std::chrono::high_resolution_clock::now(); +#endif + + // input + raii + auto const& f01HEProduct = event.get(digisTokenF01HE_); + auto const& f5HBProduct = event.get(digisTokenF5HB_); + auto const& f3HBProduct = event.get(digisTokenF3HB_); + cms::cuda::ScopedContextAcquire ctx{f01HEProduct, std::move(holder), cudaState_}; + auto const& f01HEDigis = ctx.get(f01HEProduct); + auto const& f5HBDigis = ctx.get(f5HBProduct); + auto const& f3HBDigis = ctx.get(f3HBProduct); + + hcal::reconstruction::InputDataGPU inputGPU{f01HEDigis, f5HBDigis, f3HBDigis}; + + // conditions + edm::ESHandle recoParamsHandle; + setup.get().get(recoParamsHandle); + auto const& recoParamsProduct = recoParamsHandle->getProduct(ctx.stream()); + + edm::ESHandle gainWidthsHandle; + setup.get().get(gainWidthsHandle); + auto const& gainWidthsProduct = gainWidthsHandle->getProduct(ctx.stream()); + + edm::ESHandle gainsHandle; + setup.get().get(gainsHandle); + auto const& gainsProduct = gainsHandle->getProduct(ctx.stream()); + + edm::ESHandle lutCorrsHandle; + setup.get().get(lutCorrsHandle); + auto const& lutCorrsProduct = lutCorrsHandle->getProduct(ctx.stream()); + + // use only 1 depending on useEffectivePedestals + edm::ESHandle pedestalWidthsHandle; + edm::ESHandle effectivePedestalWidthsHandle; + setup.get().get(effectivePedestalWidthsHandle); + setup.get().get(pedestalWidthsHandle); + auto const& pedestalWidthsProduct = pedestalWidthsHandle->getProduct(ctx.stream()); + auto const& effectivePedestalWidthsProduct = effectivePedestalWidthsHandle->getProduct(ctx.stream()); + + edm::ESHandle pedestalsHandle; + setup.get().get(pedestalsHandle); + auto const& pedestalsProduct = pedestalsHandle->getProduct(ctx.stream()); + + edm::ESHandle effectivePedestalsHandle; + if (configParameters_.useEffectivePedestals) + setup.get().get(effectivePedestalsHandle); + auto const* effectivePedestalsProduct = + configParameters_.useEffectivePedestals ? &effectivePedestalsHandle->getProduct(ctx.stream()) : nullptr; + + edm::ESHandle qieCodersHandle; + setup.get().get(qieCodersHandle); + auto const& qieCodersProduct = qieCodersHandle->getProduct(ctx.stream()); + + edm::ESHandle respCorrsHandle; + setup.get().get(respCorrsHandle); + auto const& respCorrsProduct = respCorrsHandle->getProduct(ctx.stream()); + + edm::ESHandle timeCorrsHandle; + setup.get().get(timeCorrsHandle); + auto const& timeCorrsProduct = timeCorrsHandle->getProduct(ctx.stream()); + + edm::ESHandle qieTypesHandle; + setup.get().get(qieTypesHandle); + auto const& qieTypesProduct = qieTypesHandle->getProduct(ctx.stream()); + + edm::ESHandle topologyHandle; + setup.get().get(topologyHandle); + edm::ESHandle recConstantsHandle; + setup.get().get(recConstantsHandle); + + edm::ESHandle sipmParametersHandle; + setup.get().get(sipmParametersHandle); + auto const& sipmParametersProduct = sipmParametersHandle->getProduct(ctx.stream()); + + edm::ESHandle sipmCharacteristicsHandle; + setup.get().get(sipmCharacteristicsHandle); + auto const& sipmCharacteristicsProduct = sipmCharacteristicsHandle->getProduct(ctx.stream()); + + edm::ESHandle pulseOffsetsHandle; + setup.get().get(pulseOffsetsHandle); + auto const& pulseOffsetsProduct = pulseOffsetsHandle->getProduct(ctx.stream()); + + // bundle up conditions + hcal::reconstruction::ConditionsProducts conditions{gainWidthsProduct, + gainsProduct, + lutCorrsProduct, + pedestalWidthsProduct, + effectivePedestalWidthsProduct, + pedestalsProduct, + qieCodersProduct, + recoParamsProduct, + respCorrsProduct, + timeCorrsProduct, + qieTypesProduct, + sipmParametersProduct, + sipmCharacteristicsProduct, + effectivePedestalsProduct, + topologyHandle.product(), + recConstantsHandle.product(), + pedestalsHandle->offsetForHashes(), + pulseOffsetsProduct, + pulseOffsetsHandle->getValues()}; + + // scratch mem on device + hcal::reconstruction::ScratchDataGPU scratchGPU = { + cms::cuda::make_device_unique(configParameters_.maxChannels * configParameters_.maxTimeSamples, + ctx.stream()), + cms::cuda::make_device_unique(configParameters_.maxChannels * configParameters_.maxTimeSamples, + ctx.stream()), + cms::cuda::make_device_unique( + configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples, + ctx.stream()), + cms::cuda::make_device_unique( + configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples, + ctx.stream()), + cms::cuda::make_device_unique( + configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples, + ctx.stream()), + cms::cuda::make_device_unique(configParameters_.maxChannels, ctx.stream()), + }; + + // output dev mem + outputGPU_.allocate(configParameters_, ctx.stream()); + + hcal::reconstruction::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU, configParameters_, ctx.stream()); + +#ifdef HCAL_MAHI_CPUDEBUG + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start).count(); + std::cout << "acquire duration = " << duration << std::endl; +#endif +} + +void HBHERecHitProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { + cms::cuda::ScopedContextProduce ctx{cudaState_}; + ctx.emplace(event, rechitsM0Token_, std::move(outputGPU_.recHits)); +} + +DEFINE_FWK_MODULE(HBHERecHitProducerGPU); diff --git a/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc new file mode 100644 index 0000000000000..ba3c9de696c47 --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/HCALGPUAnalyzer.cc @@ -0,0 +1,307 @@ +// -*- C++ -*- +// +// Package: ComparisonPlots/HCALGPUAnalyzer +// Class: HCALGPUAnalyzer +// +/**\class HCALGPUAnalyzer HCALGPUAnalyzer.cc ComparisonPlots/HCALGPUAnalyzer/plugins/HCALGPUAnalyzer.cc + + Description: [one line class summary] + + Implementation: + [Notes on implementation] +*/ +// +// Original Author: Mariarosaria D'Alfonso +// Created: Mon, 17 Dec 2018 16:22:58 GMT +// +// + +// system include files +#include +#include +#include +#include +using namespace std; + +// user include files +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/one/EDAnalyzer.h" + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" + +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "CommonTools/UtilAlgos/interface/TFileService.h" + +#include "DataFormats/HcalRecHit/interface/HBHERecHit.h" +#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h" +#include "DataFormats/HcalDetId/interface/HcalDetId.h" + +#include "SimDataFormats/CaloHit/interface/PCaloHit.h" +#include "SimDataFormats/CaloHit/interface/PCaloHitContainer.h" + +#include "SimCalorimetry/HcalSimAlgos/interface/HcalSimParameterMap.h" + +#include "TH2F.h" + +// +// class declaration +// + +class HCALGPUAnalyzer : public edm::one::EDAnalyzer { +public: + explicit HCALGPUAnalyzer(const edm::ParameterSet &); + ~HCALGPUAnalyzer() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void beginJob() override; + void analyze(const edm::Event &, const edm::EventSetup &) override; + void endJob() override; + + // ----------member data --------------------------- + // void ClearVariables(); + + // some variables for storing information + double Method0Energy, Method0EnergyGPU; + double RecHitEnergy, RecHitEnergyGPU; + double RecHitTime, RecHitTimeGPU; + double iEta, iEtaGPU; + double iPhi, iPhiGPU; + int depth, depthGPU; + + TH2F *hEnergy_2dMahi; + TH2F *hEnergy_2dM0; + TH2F *hTime_2dMahi; + + TH2F *Unmatched; + TH2F *Matched; + TH1F *hEnergy_cpu; + TH1F *hEnergy_gpu; + TH1F *hEnergy_cpugpu; + TH1F *hEnergy_cpugpu_rel; + TH1F *hEnergyM0_cpu; + TH1F *hEnergyM0_gpu; + TH1F *hTime_cpu; + TH1F *hTime_gpu; + + // create the output file + edm::Service FileService; + // create the token to retrieve hit information + edm::EDGetTokenT hRhToken; + edm::EDGetTokenT hRhTokenGPU; +}; + +// +// constants, enums and typedefs +// + +// +// static data member definitions +// + +// +// constructors and destructor +// +HCALGPUAnalyzer::HCALGPUAnalyzer(const edm::ParameterSet &iConfig) { + usesResource("TFileService"); + + hRhToken = consumes(iConfig.getUntrackedParameter("HBHERecHits", "hbheprereco")); + hRhTokenGPU = consumes( + iConfig.getUntrackedParameter("HBHERecHits", "hcalCPURecHitsProducer:recHitsLegacyHBHE")); + + // + + hEnergy_2dM0 = FileService->make("hEnergy_2dM0", "hEnergy_2dM0", 1000, 0., 100., 1000, 0., 100.); + hEnergy_2dM0->GetXaxis()->SetTitle("Cpu M0 Energy"); + hEnergy_2dM0->GetYaxis()->SetTitle("GPU M0 Energy"); + + hEnergy_2dMahi = FileService->make("hEnergy_2dMahi", "hEnergy_2dMahi", 1000, 0., 100., 1000, 0., 100.); + hEnergy_2dMahi->GetXaxis()->SetTitle("CPU Energy"); + hEnergy_2dMahi->GetYaxis()->SetTitle("GPU Energy"); + + hTime_2dMahi = FileService->make("hTime_2dMahi", "hTime_2dMahi", 250, -12.5, 12.5, 250, -12.5, 12.5); + hTime_2dMahi->GetXaxis()->SetTitle("Mahi Time CPU"); + hTime_2dMahi->GetYaxis()->SetTitle("Mahi Time GPU"); + + // + + hEnergyM0_cpu = FileService->make("hEnergyM0_cpu", "hEnergyM0_cpu", 100, 0., 100.); + hEnergyM0_cpu->GetXaxis()->SetTitle("CPU Energy"); + + hEnergy_cpu = FileService->make("hEnergy_cpu", "hEnergy_cpu", 50, 0., 50.); + hEnergy_cpu->GetXaxis()->SetTitle("CPU Energy"); + + hEnergy_gpu = FileService->make("hEnergy_gpu", "hEnergy_gpu", 50, 0., 50.); + hEnergy_gpu->GetXaxis()->SetTitle("GPU Energy"); + + // + + hEnergy_cpugpu = FileService->make("hEnergy_cpugpu", "hEnergy_cpugpu", 500, -2.5, 2.5); + hEnergy_cpugpu->GetXaxis()->SetTitle("GPU Energy - CPU Energy [GeV]"); + hEnergy_cpugpu->GetYaxis()->SetTitle("# RecHits"); + + hEnergy_cpugpu_rel = + FileService->make("hEnergy_cpugpu_rel", "hEnergy_cpugpu_rel ( E > 0.005 GeV)", 500, -2.5, 2.5); + hEnergy_cpugpu_rel->GetXaxis()->SetTitle("(GPU Energy - CPU Energy) / CPU energy"); + hEnergy_cpugpu_rel->GetYaxis()->SetTitle("# RecHits"); + + // + + hTime_cpu = FileService->make("hTime_cpu", "hTime_cpu", 50, -25., 25.); + hTime_cpu->GetXaxis()->SetTitle("CPU Time"); + + hTime_gpu = FileService->make("hTime_gpu", "hTime_gpu", 50, -25., 25.); + hTime_gpu->GetXaxis()->SetTitle("GPU Time"); + + Unmatched = FileService->make("Unmatched", "Unmatched (eta,phi)", 100, -50., 50., 85, 0., 85.); + Matched = FileService->make("Matched", "Matched (eta,phi)", 100, -50., 50., 85, 0., 85.); + + //now do what ever initialization is needed +} + +// +// member functions +// + +// ------------ method called for each event ------------ +void HCALGPUAnalyzer::analyze(const edm::Event &iEvent, const edm::EventSetup &iSetup) { + using namespace edm; + + // Read events + Handle hRecHits; + iEvent.getByToken(hRhToken, hRecHits); + + Handle hRecHitsGPU; + iEvent.getByToken(hRhTokenGPU, hRecHitsGPU); + + // Loop over all rechits in one event + for (int i = 0; i < (int)hRecHits->size(); i++) { + // get ID information for the reconstructed hit + HcalDetId detID_rh = (*hRecHits)[i].id().rawId(); + + // ID information can get us detector coordinates + depth = (*hRecHits)[i].id().depth(); + iEta = detID_rh.ieta(); + iPhi = detID_rh.iphi(); + + // get some variables + Method0Energy = (*hRecHits)[i].eraw(); + RecHitEnergy = (*hRecHits)[i].energy(); + RecHitTime = (*hRecHits)[i].time(); + + hEnergy_cpu->Fill(RecHitEnergy); + hTime_cpu->Fill(RecHitTime); + + /* + cout << "Run " << i << ": "; + cout << "Method0Energy: " << Method0Energy; + cout << "RecHitEnergy: " << RecHitEnergy; + cout << "depth: " << depth; + cout << "iEta: " << iEta; + cout << "iPhi: " << iPhi; + cout << "RecHitTime" << RecHitTime; + */ + } + + for (int i = 0; i < (int)hRecHitsGPU->size(); i++) { + // get ID information for the reconstructed hit + HcalDetId detID_rh = (*hRecHitsGPU)[i].id().rawId(); + + // ID information can get us detector coordinates + depthGPU = (*hRecHitsGPU)[i].id().depth(); + iEtaGPU = detID_rh.ieta(); + iPhiGPU = detID_rh.iphi(); + + // get some variables + Method0EnergyGPU = (*hRecHitsGPU)[i].eraw(); + RecHitEnergyGPU = (*hRecHitsGPU)[i].energy(); + RecHitTimeGPU = (*hRecHitsGPU)[i].time(); + + hEnergy_gpu->Fill(RecHitEnergyGPU); + hTime_gpu->Fill(RecHitTimeGPU); + + /* + cout << "Run " << i << ": "; + cout << "Method0Energy: " << Method0EnergyGPU; + cout << "RecHitEnergy: " << RecHitEnergyGPU; + cout << "depth: " << depthGPU; + cout << "iEta: " << iEtaGPU; + cout << "iPhi: " << iPhiGPU; + cout << "RecHitTime" << RecHitTimeGPU; + */ + } + + // Loop over all rechits in one event + for (int i = 0; i < (int)hRecHits->size(); i++) { + HcalDetId detID_rh = (*hRecHits)[i].id().rawId(); + + bool unmatched = true; + // cout << "--------------------------------------------------------" << endl; + + for (int j = 0; j < (int)hRecHitsGPU->size(); j++) { + HcalDetId detID_gpu = (*hRecHitsGPU)[j].id().rawId(); + + if ((detID_rh == detID_gpu)) { + /* + cout << "Mtime(cpu)" << (*hRecHits)[i].time() << endl; + cout << " Mtime(gpu)" << (*hRecHitsGPU)[j].time() << endl; + + cout << "M0E(cpu)" << (*hRecHits)[i].eraw() << endl; + cout << " M0E(gpu)" << (*hRecHitsGPU)[j].eraw() << endl; + */ + + auto relValue = ((*hRecHitsGPU)[j].energy() - (*hRecHits)[i].energy()) / (*hRecHits)[i].energy(); + + hEnergy_2dM0->Fill((*hRecHits)[i].eraw(), (*hRecHitsGPU)[j].eraw()); + hEnergy_2dMahi->Fill((*hRecHits)[i].energy(), (*hRecHitsGPU)[j].energy()); + hEnergy_cpugpu->Fill((*hRecHitsGPU)[j].energy() - (*hRecHits)[i].energy()); + if ((*hRecHits)[i].energy() > 0.005) + hEnergy_cpugpu_rel->Fill(relValue); + hTime_2dMahi->Fill((*hRecHits)[i].time(), (*hRecHitsGPU)[j].time()); + + /* + if((relValue < - 0.9) and ((*hRecHits)[i].energy()>0.005)) { + cout << "----------------------------------"<< endl; + cout << " detID = " << detID_rh.rawId() << endl; + cout << "ME(cpu)" << (*hRecHits)[i].energy() << endl; + cout << " ME(gpu)" << (*hRecHitsGPU)[j].energy() << endl; + } + */ + + Matched->Fill(detID_rh.ieta(), detID_rh.iphi()); + + unmatched = false; + } + } + + /// + + if (unmatched) { + Unmatched->Fill(detID_rh.ieta(), detID_rh.iphi()); + // cout << " recHit not matched =" << detID_rh << " E(raw)=" << (*hRecHits)[i].eraw() << " E=" << (*hRecHits)[i].energy() << endl; + } + } +} + +// ------------ method called once each job just before starting event loop ------------ +void HCALGPUAnalyzer::beginJob() {} + +// ------------ method called once each job just after ending the event loop ------------ +void HCALGPUAnalyzer::endJob() {} + +// ------------ method fills 'descriptions' with the allowed parameters for the module ------------ +void HCALGPUAnalyzer::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + //The following says we do not know what parameters are allowed so do no validation + // Please change this to state exactly what you do use, even if it is no parameters + edm::ParameterSetDescription desc; + desc.setUnknown(); + descriptions.addDefault(desc); +} + +//define this as a plug-in +DEFINE_FWK_MODULE(HCALGPUAnalyzer); diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc new file mode 100644 index 0000000000000..714ec8b7de5af --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/HcalCPURecHitsProducer.cc @@ -0,0 +1,106 @@ +#include +#include + +#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h" +#include "DataFormats/HcalRecHit/interface/HcalRecHitCollections.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +class HcalCPURecHitsProducer : public edm::stream::EDProducer { +public: + explicit HcalCPURecHitsProducer(edm::ParameterSet const& ps); + ~HcalCPURecHitsProducer() override; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override; + void produce(edm::Event&, edm::EventSetup const&) override; + +private: + using IProductType = cms::cuda::Product>; + edm::EDGetTokenT recHitsM0TokenIn_; + using OProductType = hcal::RecHitCollection>; + edm::EDPutTokenT recHitsM0TokenOut_; + edm::EDPutTokenT recHitsLegacyTokenOut_; + + // to pass from acquire to produce + OProductType tmpRecHits_; +}; + +void HcalCPURecHitsProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("recHitsM0LabelIn", edm::InputTag{"hbheRecHitProducerGPU", "recHitsM0HBHE"}); + desc.add("recHitsM0LabelOut", "recHitsM0HBHE"); + desc.add("recHitsLegacyLabelOut", "recHitsLegacyHBHE"); + + confDesc.addWithDefaultLabel(desc); +} + +HcalCPURecHitsProducer::HcalCPURecHitsProducer(const edm::ParameterSet& ps) + : recHitsM0TokenIn_{consumes(ps.getParameter("recHitsM0LabelIn"))}, + recHitsM0TokenOut_{produces(ps.getParameter("recHitsM0LabelOut"))}, + recHitsLegacyTokenOut_{produces(ps.getParameter("recHitsLegacyLabelOut"))} {} + +HcalCPURecHitsProducer::~HcalCPURecHitsProducer() {} + +void HcalCPURecHitsProducer::acquire(edm::Event const& event, + edm::EventSetup const& setup, + edm::WaitingTaskWithArenaHolder taskHolder) { + // retrieve data/ctx + auto const& recHitsProduct = event.get(recHitsM0TokenIn_); + cms::cuda::ScopedContextAcquire ctx{recHitsProduct, std::move(taskHolder)}; + auto const& recHits = ctx.get(recHitsProduct); + + // resize tmp buffers + tmpRecHits_.resize(recHits.size); + +#ifdef HCAL_MAHI_CPUDEBUG + std::cout << "num rec Hits = " << recHits.size << std::endl; +#endif + + auto lambdaToTransfer = [&ctx](auto& dest, auto* src) { + using vector_type = typename std::remove_reference::type; + using src_data_type = typename std::remove_pointer::type; + using type = typename vector_type::value_type; + static_assert(std::is_same::value && "Dest and Src data types do not match"); + cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream())); + }; + + lambdaToTransfer(tmpRecHits_.energy, recHits.energy.get()); + lambdaToTransfer(tmpRecHits_.chi2, recHits.chi2.get()); + lambdaToTransfer(tmpRecHits_.energyM0, recHits.energyM0.get()); + lambdaToTransfer(tmpRecHits_.timeM0, recHits.timeM0.get()); + lambdaToTransfer(tmpRecHits_.did, recHits.did.get()); +} + +void HcalCPURecHitsProducer::produce(edm::Event& event, edm::EventSetup const& setup) { + // populate the legacy collection + auto recHitsLegacy = std::make_unique(); + // did not set size with ctor as there is no setter for did + recHitsLegacy->reserve(tmpRecHits_.did.size()); + for (uint32_t i = 0; i < tmpRecHits_.did.size(); i++) { + recHitsLegacy->emplace_back(HcalDetId{tmpRecHits_.did[i]}, + tmpRecHits_.energy[i], + 0 // timeRising + ); + + // update newly pushed guy + (*recHitsLegacy)[i].setChiSquared(tmpRecHits_.chi2[i]); + (*recHitsLegacy)[i].setRawEnergy(tmpRecHits_.energyM0[i]); + } + + // put a legacy format + event.put(recHitsLegacyTokenOut_, std::move(recHitsLegacy)); + + // put a new format + event.emplace(recHitsM0TokenOut_, std::move(tmpRecHits_)); +} + +DEFINE_FWK_MODULE(HcalCPURecHitsProducer); diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc new file mode 100644 index 0000000000000..2fc6cc0d19002 --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/HcalESProducersGPUDefs.cc @@ -0,0 +1,120 @@ +#include "CondFormats/DataRecord/interface/HcalCombinedRecordsGPU.h" +#include "CondFormats/DataRecord/interface/HcalGainWidthsRcd.h" +#include "CondFormats/DataRecord/interface/HcalGainsRcd.h" +#include "CondFormats/DataRecord/interface/HcalLUTCorrsRcd.h" +#include "CondFormats/DataRecord/interface/HcalPedestalWidthsRcd.h" +#include "CondFormats/DataRecord/interface/HcalPedestalsRcd.h" +#include "CondFormats/DataRecord/interface/HcalQIEDataRcd.h" +#include "CondFormats/DataRecord/interface/HcalQIETypesRcd.h" +#include "CondFormats/DataRecord/interface/HcalRecoParamsRcd.h" +#include "CondFormats/DataRecord/interface/HcalRespCorrsRcd.h" +#include "CondFormats/DataRecord/interface/HcalSiPMCharacteristicsRcd.h" +#include "CondFormats/DataRecord/interface/HcalSiPMParametersRcd.h" +#include "CondFormats/DataRecord/interface/HcalTimeCorrsRcd.h" +#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalWidthsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalConvertedEffectivePedestalsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalWidthsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalConvertedPedestalsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalGainWidths.h" +#include "CondFormats/HcalObjects/interface/HcalGainWidthsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalGains.h" +#include "CondFormats/HcalObjects/interface/HcalGainsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalLUTCorrs.h" +#include "CondFormats/HcalObjects/interface/HcalLUTCorrsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalPedestalWidths.h" +#include "CondFormats/HcalObjects/interface/HcalPedestalWidthsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalPedestals.h" +#include "CondFormats/HcalObjects/interface/HcalPedestalsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalQIECodersGPU.h" +#include "CondFormats/HcalObjects/interface/HcalQIEData.h" +#include "CondFormats/HcalObjects/interface/HcalQIETypes.h" +#include "CondFormats/HcalObjects/interface/HcalQIETypesGPU.h" +#include "CondFormats/HcalObjects/interface/HcalRecoParams.h" +#include "CondFormats/HcalObjects/interface/HcalRecoParamsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalRespCorrs.h" +#include "CondFormats/HcalObjects/interface/HcalRespCorrsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristics.h" +#include "CondFormats/HcalObjects/interface/HcalSiPMCharacteristicsGPU.h" +#include "CondFormats/HcalObjects/interface/HcalSiPMParameters.h" +#include "CondFormats/HcalObjects/interface/HcalSiPMParametersGPU.h" +#include "CondFormats/HcalObjects/interface/HcalTimeCorrs.h" +#include "CondFormats/HcalObjects/interface/HcalTimeCorrsGPU.h" +#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h" +#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h" +#include "RecoLocalCalo/HcalRecAlgos/interface/HcalRecoParamsWithPulseShapesGPU.h" + +using HcalRecoParamsGPUESProducer = ConvertingESProducerT; + +using HcalRecoParamsWithPulseShapesGPUESProducer = + ConvertingESProducerT; + +using HcalPedestalsGPUESProducer = ConvertingESProducerT; + +using HcalGainsGPUESProducer = ConvertingESProducerT; + +using HcalLUTCorrsGPUESProducer = ConvertingESProducerT; + +using HcalRespCorrsGPUESProducer = ConvertingESProducerT; + +using HcalTimeCorrsGPUESProducer = ConvertingESProducerT; + +using HcalPedestalWidthsGPUESProducer = + ConvertingESProducerT; + +using HcalGainWidthsGPUESProducer = ConvertingESProducerT; + +using HcalQIECodersGPUESProducer = ConvertingESProducerT; + +using HcalQIETypesGPUESProducer = ConvertingESProducerT; + +using HcalSiPMParametersGPUESProducer = + ConvertingESProducerT; + +using HcalSiPMCharacteristicsGPUESProducer = + ConvertingESProducerT; + +using HcalConvertedPedestalsGPUESProducer = ConvertingESProducerWithDependenciesT; + +using HcalConvertedEffectivePedestalsGPUESProducer = + ConvertingESProducerWithDependenciesT; + +using HcalConvertedPedestalWidthsGPUESProducer = ConvertingESProducerWithDependenciesT; + +using HcalConvertedEffectivePedestalWidthsGPUESProducer = + ConvertingESProducerWithDependenciesT; + +DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalRecoParamsWithPulseShapesGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalPedestalsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalGainsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalLUTCorrsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalRespCorrsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalTimeCorrsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalPedestalWidthsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalGainWidthsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalQIECodersGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalQIETypesGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMParametersGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalSiPMCharacteristicsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedEffectivePedestalsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedPedestalWidthsGPUESProducer); +DEFINE_FWK_EVENTSETUP_MODULE(HcalConvertedEffectivePedestalWidthsGPUESProducer); diff --git a/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc new file mode 100644 index 0000000000000..0862e0a861d5d --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/HcalMahiPulseOffsetsGPUESProducer.cc @@ -0,0 +1,58 @@ +#include +#include +#include +#include + +#include "FWCore/Framework/interface/ESProducer.h" +#include "FWCore/Framework/interface/ESProductHost.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/EventSetupRecordIntervalFinder.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/ModuleFactory.h" +#include "FWCore/Framework/interface/SourceFactory.h" +#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/ReusableObjectHolder.h" +#include "FWCore/Utilities/interface/typelookup.h" +#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h" +#include "RecoLocalCalo/HcalRecAlgos/interface/HcalMahiPulseOffsetsGPU.h" + +class HcalMahiPulseOffsetsGPUESProducer : public edm::ESProducer, public edm::EventSetupRecordIntervalFinder { +public: + HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const&); + ~HcalMahiPulseOffsetsGPUESProducer() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions&); + std::unique_ptr produce(JobConfigurationGPURecord const&); + +protected: + void setIntervalFor(const edm::eventsetup::EventSetupRecordKey&, + const edm::IOVSyncValue&, + edm::ValidityInterval&) override; + +private: + edm::ParameterSet const& pset_; +}; + +HcalMahiPulseOffsetsGPUESProducer::HcalMahiPulseOffsetsGPUESProducer(edm::ParameterSet const& pset) : pset_{pset} { + setWhatProduced(this); + findingRecord(); +} + +void HcalMahiPulseOffsetsGPUESProducer::setIntervalFor(const edm::eventsetup::EventSetupRecordKey& iKey, + const edm::IOVSyncValue& iTime, + edm::ValidityInterval& oInterval) { + oInterval = edm::ValidityInterval(edm::IOVSyncValue::beginOfTime(), edm::IOVSyncValue::endOfTime()); +} + +void HcalMahiPulseOffsetsGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& desc) { + edm::ParameterSetDescription d; + d.add>("pulseOffsets", {-3, -2, -1, 0, 1, 2, 3, 4}); + desc.addWithDefaultLabel(d); +} + +std::unique_ptr HcalMahiPulseOffsetsGPUESProducer::produce(JobConfigurationGPURecord const&) { + return std::make_unique(pset_); +} + +DEFINE_FWK_EVENTSETUP_SOURCE(HcalMahiPulseOffsetsGPUESProducer); diff --git a/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h new file mode 100644 index 0000000000000..ade221b2c4870 --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/KernelHelpers.h @@ -0,0 +1,220 @@ +#ifndef RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h +#define RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h + +#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h" + +#include "DeclsForKernels.h" + +namespace hcal { + namespace reconstruction { + + // this is from HcalTimeSlew. + // HcalTimeSlew are values that come in from ESProducer that takes them + // from a python config. see DeclsForKernels for more explanation + __forceinline__ __device__ float compute_time_slew_delay(float const fC, + float const tzero, + float const slope, + float const tmax) { + auto const rawDelay = tzero + slope * std::log(fC); + return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay); + } + + // HcalQIEShapes are hardcoded in HcalQIEData.cc basically + // + some logic to generate 128 and 256 value arrays... + __constant__ float const qie8shape[129] = { + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, + 18, 20, 22, 24, 26, 28, 31, 34, 37, 40, 44, 48, 52, 57, 62, 57, 62, + 67, 72, 77, 82, 87, 92, 97, 102, 107, 112, 117, 122, 127, 132, 142, 152, 162, + 172, 182, 192, 202, 217, 232, 247, 262, 282, 302, 322, 347, 372, 347, 372, 397, 422, + 447, 472, 497, 522, 547, 572, 597, 622, 647, 672, 697, 722, 772, 822, 872, 922, 972, + 1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422, + 2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422, + 5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297}; + + __constant__ float const qie11shape[257] = { + -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, + 11.5, 12.5, 13.5, 14.5, 15.5, 17.5, 19.5, 21.5, 23.5, 25.5, 27.5, 29.5, + 31.5, 33.5, 35.5, 37.5, 39.5, 41.5, 43.5, 45.5, 47.5, 49.5, 51.5, 53.5, + 55.5, 59.5, 63.5, 67.5, 71.5, 75.5, 79.5, 83.5, 87.5, 91.5, 95.5, 99.5, + 103.5, 107.5, 111.5, 115.5, 119.5, 123.5, 127.5, 131.5, 135.5, 139.5, 147.5, 155.5, + 163.5, 171.5, 179.5, 187.5, 171.5, 179.5, 187.5, 195.5, 203.5, 211.5, 219.5, 227.5, + 235.5, 243.5, 251.5, 259.5, 267.5, 275.5, 283.5, 291.5, 299.5, 315.5, 331.5, 347.5, + 363.5, 379.5, 395.5, 411.5, 427.5, 443.5, 459.5, 475.5, 491.5, 507.5, 523.5, 539.5, + 555.5, 571.5, 587.5, 603.5, 619.5, 651.5, 683.5, 715.5, 747.5, 779.5, 811.5, 843.5, + 875.5, 907.5, 939.5, 971.5, 1003.5, 1035.5, 1067.5, 1099.5, 1131.5, 1163.5, 1195.5, 1227.5, + 1259.5, 1291.5, 1355.5, 1419.5, 1483.5, 1547.5, 1611.5, 1675.5, 1547.5, 1611.5, 1675.5, 1739.5, + 1803.5, 1867.5, 1931.5, 1995.5, 2059.5, 2123.5, 2187.5, 2251.5, 2315.5, 2379.5, 2443.5, 2507.5, + 2571.5, 2699.5, 2827.5, 2955.5, 3083.5, 3211.5, 3339.5, 3467.5, 3595.5, 3723.5, 3851.5, 3979.5, + 4107.5, 4235.5, 4363.5, 4491.5, 4619.5, 4747.5, 4875.5, 5003.5, 5131.5, 5387.5, 5643.5, 5899.5, + 6155.5, 6411.5, 6667.5, 6923.5, 7179.5, 7435.5, 7691.5, 7947.5, 8203.5, 8459.5, 8715.5, 8971.5, + 9227.5, 9483.5, 9739.5, 9995.5, 10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5, + 12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5, + 18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5, + 28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5, + 41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5, + 65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5, + 96523.5, 100620, 104716, 108812, 112908}; + + // Conditions are transferred once per IOV + // Access is performed based on the det id which is converted to a linear index + // 2 funcs below are taken from HcalTopology (reimplemented here). + // Inputs are constants that are also taken from HcalTopology + // but passed to the kernel as arguments using the HclaTopology itself + constexpr int32_t IPHI_MAX = 72; + + __forceinline__ __device__ uint32_t did2linearIndexHB( + uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) { + HcalDetId did{didraw}; + uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1); + return did.ieta() > 0 ? value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() - firstHBRing) + : value + maxDepthHB * hcal::reconstruction::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB); + } + + __forceinline__ __device__ uint32_t did2linearIndexHE(uint32_t const didraw, + int const maxDepthHE, + int const maxPhiHE, + int const firstHERing, + int const lastHERing, + int const nEtaHE) { + HcalDetId did{didraw}; + uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1); + return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing) + : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE); + } + + __forceinline__ __device__ uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) { + return capid * 4 + range; + } + + __forceinline__ __device__ float compute_reco_correction_factor(float const par1, + float const par2, + float const par3, + float const x) { + return par3 * x * x + par2 * x + par1; + } + + // compute the charge using the adc, qie type and the appropriate qie shape array + __forceinline__ __device__ float compute_coder_charge( + int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) { + auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3; + auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape; + auto const nbins = qieType == 0 ? 32 : 64; + auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1]) + : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]); + auto const index = get_qiecoder_index(capid, range); + return (center - qieOffsets[index]) / qieSlopes[index]; + } + + // this is from + // https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc#L140 + + __forceinline__ __device__ float compute_diff_charge_gain(int const qieType, + uint8_t adc, + uint8_t const capid, + float const* qieOffsets, + float const* qieSlopes, + bool const isqie11) { + constexpr uint32_t mantissaMaskQIE8 = 0x1fu; + constexpr uint32_t mantissaMaskQIE11 = 0x3f; + auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8; + auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes); + auto const mantissa = adc & mantissaMask; + + if (mantissa == 0u || mantissa == mantissaMask - 1u) + return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q; + else if (mantissa == 1u || mantissa == mantissaMask) + return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes); + else { + auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes); + auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes); + auto const upgain = qup - q; + auto const downgain = q - qdown; + auto const averagegain = (qup - qdown) / 2.f; + if (std::abs(upgain - downgain) < 0.01f * averagegain) + return averagegain; + else { + auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes); + auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes); + auto const upgain2 = q2up - qup; + auto const downgain2 = qdown - q2down; + if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain)) + return upgain; + else + return downgain; + } + } + } + + // TODO: remove what's not needed + // originally from from RecoLocalCalo/HcalRecAlgos/src/PulseShapeFunctor.cc + __forceinline__ __device__ float compute_pulse_shape_value(float const pulse_time, + int const sample, + int const shift, + float const* acc25nsVec, + float const* diff25nsItvlVec, + float const* accVarLenIdxMinusOneVec, + float const* diffVarItvlIdxMinusOneVec, + float const* accVarLenIdxZeroVec, + float const* diffVarItvlIdxZeroVec) { + // constants + constexpr float slew = 0.f; + constexpr auto ns_per_bx = hcal::constants::nsPerBX; + + // FIXME: clean up all the rounding... this is coming from original cpu version + float const i_start_float = -hcal::constants::iniTimeShift - pulse_time - slew > 0.f + ? 0.f + : std::abs(-hcal::constants::iniTimeShift - pulse_time - slew) + 1.f; + int i_start = static_cast(i_start_float); + float offset_start = static_cast(i_start) - hcal::constants::iniTimeShift - pulse_time - slew; + // FIXME: do we need a check for nan??? +#ifdef HCAL_MAHI_GPUDEBUG + if (shift == 0) + printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start); +#endif + + // boundary + if (offset_start == 1.0f) { + offset_start = 0.f; + i_start -= 1; + } + +#ifdef HCAL_MAHI_GPUDEBUG + if (shift == 0) + printf("i_start_float = %f i_start = %d offset_start = %f\n", i_start_float, i_start, offset_start); +#endif + + int const bin_start = static_cast(offset_start); + auto const bin_start_up = static_cast(bin_start) + 0.5f; + int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start; + int const its_start = i_start / ns_per_bx; + int const distTo25ns_start = hcal::constants::nsPerBX - 1 - i_start % ns_per_bx; + auto const factor = offset_start - static_cast(bin_0_start) - 0.5; + +#ifdef HCAL_MAHI_GPUDEBUG + if (shift == 0) { + printf("bin_start = %d bin_0_start = %d its_start = %d distTo25ns_start = %d factor = %f\n", + bin_start, + bin_0_start, + its_start, + distTo25ns_start, + factor); + } +#endif + + auto const sample_over10ts = sample + shift; + float value = 0.0f; + if (sample_over10ts == its_start) { + value = bin_0_start == -1 + ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start] + : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start]; + } else if (sample_over10ts > its_start) { + int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start; + value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx]; + } + return value; + } + + } // namespace reconstruction +} // namespace hcal + +#endif // RecoLocalCalo_HcalRecProducers_src_KernelHelpers_h diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu new file mode 100644 index 0000000000000..a1fc79b41eca6 --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu @@ -0,0 +1,1223 @@ +#include + +#include "DataFormats/CaloRecHit/interface/MultifitComputations.h" + +// needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME" +#include "DataFormats/HcalRecHit/interface/HcalSpecialTimes.h" +// TODO reuse some of the HCAL constats from +//#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h" +// ? + +#include "SimpleAlgoGPU.h" +#include "KernelHelpers.h" + +#ifdef HCAL_MAHI_GPUDEBUG +#define DETID_TO_DEBUG 1125647428 +#endif + +namespace hcal { + namespace mahi { + + // TODO: provide constants from configuration + // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py + constexpr int nMaxItersMin = 50; + constexpr int nMaxItersNNLS = 500; + constexpr double nnlsThresh = 1e-11; + constexpr float deltaChi2Threashold = 1e-3; + + // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc + __forceinline__ __device__ float get_raw_charge(double const charge, + double const pedestal, + float const* shrChargeMinusPedestal, + float const* parLin1Values, + float const* parLin2Values, + float const* parLin3Values, + int32_t const nsamplesForCompute, + int32_t const soi, + int const sipmQTSShift, + int const sipmQNTStoSum, + int const sipmType, + float const fcByPE, + bool const isqie11) { + float rawCharge; + + if (!isqie11) + rawCharge = charge; + else { + auto const parLin1 = parLin1Values[sipmType - 1]; + auto const parLin2 = parLin2Values[sipmType - 1]; + auto const parLin3 = parLin3Values[sipmType - 1]; + + int const first = std::max(soi + sipmQTSShift, 0); + int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute); + float sipmq = 0.0f; + for (auto ts = first; ts < last; ts++) + sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts]; + auto const effectivePixelsFired = sipmq / fcByPE; + auto const factor = + hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired); + rawCharge = (charge - pedestal) * factor + pedestal; + +#ifdef HCAL_MAHI_GPUDEBUG + printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge); +#endif + } + return rawCharge; + } + + // Assume: same number of samples for HB and HE + // TODO: add/validate restrict (will increase #registers in use by the kernel) + __global__ void kernel_prep1d_sameNumberOfSamples(float* amplitudes, + float* noiseTerms, + float* outputEnergy, + float* outputChi2, + uint16_t const* dataf01HE, + uint16_t const* dataf5HB, + uint16_t const* dataf3HB, + uint32_t const* idsf01HE, + uint32_t const* idsf5HB, + uint32_t const* idsf3HB, + uint32_t const stridef01HE, + uint32_t const stridef5HB, + uint32_t const stridef3HB, + uint32_t const nchannelsf01HE, + uint32_t const nchannelsf5HB, + uint8_t const* npresamplesf5HB, + int8_t* soiSamples, + float* method0Energy, + float* method0Time, + uint32_t* outputdid, + uint32_t const nchannels, + uint32_t const* recoParam1Values, + uint32_t const* recoParam2Values, + float const* qieCoderOffsets, + float const* qieCoderSlopes, + int const* qieTypes, + float const* pedestalWidths, + float const* effectivePedestalWidths, + float const* pedestals, + float const* effectivePedestals, + bool const useEffectivePedestals, + int const* sipmTypeValues, + float const* fcByPEValues, + float const* parLin1Values, + float const* parLin2Values, + float const* parLin3Values, + float const* gainValues, + float const* respCorrectionValues, + int const maxDepthHB, + int const maxDepthHE, + int const maxPhiHE, + int const firstHBRing, + int const lastHBRing, + int const firstHERing, + int const lastHERing, + int const nEtaHB, + int const nEtaHE, + int const sipmQTSShift, + int const sipmQNTStoSum, + int const firstSampleShift, + uint32_t const offsetForHashes, + float const ts4Thresh, + int const startingSample) { + // indices + runtime constants + auto const sample = threadIdx.x + startingSample; + auto const sampleWithinWindow = threadIdx.x; + int32_t const nsamplesForCompute = blockDim.x; + auto const lch = threadIdx.y; + auto const gch = lch + blockDim.y * blockIdx.x; + auto const nchannels_per_block = blockDim.y; + auto const linearThPerBlock = threadIdx.x + threadIdx.y * blockDim.x; + + // remove + if (gch >= nchannels) + return; + + // initialize all output buffers + if (sampleWithinWindow == 0) { + outputdid[gch] = 0; + method0Energy[gch] = 0; + method0Time[gch] = 0; + outputEnergy[gch] = 0; + outputChi2[gch] = 0; + } + +#ifdef HCAL_MAHI_GPUDEBUG +#ifdef HCAL_MAHI_GPUDEBUG_SINGLECHANNEL + if (gch > 0) + return; +#endif +#endif + + // configure shared mem + extern __shared__ char smem[]; + float* shrEnergyM0PerTS = reinterpret_cast(smem); + float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block; + float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block; + float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block; + unsigned long long int* shrMethod0EnergySamplePair = + reinterpret_cast(shrEnergyM0TotalAccum + nchannels_per_block); + if (sampleWithinWindow == 0) { + shrMethod0EnergyAccum[lch] = 0; + shrMethod0EnergySamplePair[lch] = __float_as_uint(std::numeric_limits::min()); + shrEnergyM0TotalAccum[lch] = 0; + } + + // offset output + auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch; + auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch; + auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB; + + // get event input quantities + auto const stride = gch < nchannelsf01HE ? stridef01HE : (gch < nchannelsf015 ? stridef5HB : stridef3HB); + auto const nsamples = gch < nchannelsf01HE ? compute_nsamples(stride) + : (gch < nchannelsf015 ? compute_nsamples(stride) + : compute_nsamples(stride)); + +#ifdef HCAL_MAHI_GPUDEBUG + assert(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute); +#endif + + auto const id = gch < nchannelsf01HE + ? idsf01HE[gch] + : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]); + auto const did = HcalDetId{id}; + auto const adc = + gch < nchannelsf01HE + ? adc_for_sample(dataf01HE + stride * gch, sample) + : (gch < nchannelsf015 ? adc_for_sample(dataf5HB + stride * (gch - nchannelsf01HE), sample) + : adc_for_sample(dataf3HB + stride * (gch - nchannelsf015), sample)); + auto const capid = + gch < nchannelsf01HE + ? capid_for_sample(dataf01HE + stride * gch, sample) + : (gch < nchannelsf015 ? capid_for_sample(dataf5HB + stride * (gch - nchannelsf01HE), sample) + : capid_for_sample(dataf3HB + stride * (gch - nchannelsf015), sample)); + +#ifdef HCAL_MAHI_GPUDEBUG +#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID + if (id != DETID_TO_DEBUG) + return; +#endif +#endif + + // compute hash for this did + auto const hashedId = + did.subdetId() == HcalBarrel + ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB) + : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + + offsetForHashes; + + // conditions based on the hash + // FIXME: remove hardcoded values + auto const qieType = qieTypes[hashedId] > 0 ? 1 : 0; // 2 types at this point + auto const* qieOffsets = qieCoderOffsets + hashedId * HcalQIECodersGPU::numValuesPerChannel; + auto const* qieSlopes = qieCoderSlopes + hashedId * HcalQIECodersGPU::numValuesPerChannel; + auto const* pedestalsForChannel = pedestals + hashedId * 4; + auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015) + ? effectivePedestalWidths + hashedId * 4 + : pedestalWidths + hashedId * 4; + auto const* gains = gainValues + hashedId * 4; + auto const gain = gains[capid]; + auto const gain0 = gains[0]; + auto const respCorrection = respCorrectionValues[hashedId]; + auto const pedestal = pedestalsForChannel[capid]; + auto const pedestalWidth = pedestalWidthsForChannel[capid]; + // if needed, only use effective pedestals for f01 + auto const pedestalToUseForMethod0 = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015) + ? effectivePedestals[hashedId * 4 + capid] + : pedestal; + auto const sipmType = sipmTypeValues[hashedId]; + auto const fcByPE = fcByPEValues[hashedId]; + auto const recoParam1 = recoParam1Values[hashedId]; + auto const recoParam2 = recoParam2Values[hashedId]; + +#ifdef HCAL_MAHI_GPUDEBUG + printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n", + qieType, + qieOffsets[0], + qieOffsets[1], + qieSlopes[0], + qieSlopes[1]); +#endif + + // compute charge + auto const charge = hcal::reconstruction::compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes); + + shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal; + if (gch < nchannelsf01HE) { + // NOTE: assume that soi is high only for a single guy! + // which must be the case. cpu version does not check for that + // if that is not the case, we will see that with cuda mmecheck + auto const soibit = soibit_for_sample(dataf01HE + stride * gch, sample); + if (soibit == 1) + soiSamples[gch] = sampleWithinWindow; + } else if (gch >= nchannelsf015) { + auto const soibit = soibit_for_sample(dataf3HB + stride * (gch - nchannelsf015), sample); + if (soibit == 1) + soiSamples[gch] = sampleWithinWindow; + } + __syncthreads(); + int32_t const soi = gch < nchannelsf01HE + ? soiSamples[gch] + : (gch < nchannelsf015 ? npresamplesf5HB[gch - nchannelsf01HE] : soiSamples[gch]); + //int32_t const soi = gch >= nchannelsf01HE + // ? npresamplesf5HB[gch - nchannelsf01HE] + // : soiSamples[gch]; + // this is here just to make things uniform... + if (gch >= nchannelsf01HE && gch < nchannelsf015 && sampleWithinWindow == 0) + soiSamples[gch] = npresamplesf5HB[gch - nchannelsf01HE]; + + // + // compute various quantities (raw charge and tdc stuff) + // NOTE: this branch will be divergent only for a single warp that + // sits on the boundary when flavor 01 channels end and flavor 5 start + // + float const rawCharge = get_raw_charge(charge, + pedestal, + shrChargeMinusPedestal, + parLin1Values, + parLin2Values, + parLin3Values, + nsamplesForCompute, + soi, + sipmQTSShift, + sipmQNTStoSum, + sipmType, + fcByPE, + gch < nchannelsf01HE || gch >= nchannelsf015); + + auto const dfc = hcal::reconstruction::compute_diff_charge_gain( + qieType, adc, capid, qieOffsets, qieSlopes, gch < nchannelsf01HE || gch >= nchannelsf015); + +#ifdef COMPUTE_TDC_TIME + float tdcTime; + if (gch >= nchannelsf01HE && gch < nchannelsf015) { + tdcTime = HcalSpecialTimes::UNKNOWN_T_NOTDC; + } else { + if (gch < nchannelsf01HE) + tdcTime = HcalSpecialTimes::getTDCTime(tdc_for_sample(dataf01HE + stride * gch, sample)); + else if (gch >= nchannelsf015) + tdcTime = + HcalSpecialTimes::getTDCTime(tdc_for_sample(dataf3HB + stride * (gch - nchannelsf015), sample)); + } +#endif // COMPUTE_TDC_TIME + + // compute method 0 quantities + // TODO: need to apply containment + // TODO: need to apply time slew + // TODO: for < run 3, apply HBM legacy energy correction + auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF; + auto const startSampleTmp = soi + firstSampleShift; + auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp; + auto const endSample = + startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute; + // NOTE: gain is a small number < 10^-3, multiply it last + auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection); + auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection); + // store to shared mem + shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts; + atomicAdd(&shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0); + +#ifdef HCAL_MAHI_GPUDEBUG + printf( + "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n" + " charge = %f rawCharge = %f dfc = %f pedestal = %f\n" + " gain = %f respCorrection = %f energym0_per_ts = %f\n", + id, + sample, + gch, + hashedId, + adc, + capid, + charge, + rawCharge, + dfc, + pedestalToUseForMethod0, + gain, + respCorrection, + energym0_per_ts); + printf( + "startSample = %d endSample = %d param1 = %u param2 = %u\n", startSample, endSample, recoParam1, recoParam2); +#endif + + if (sampleWithinWindow >= startSample && sampleWithinWindow < endSample) { + atomicAdd(&shrMethod0EnergyAccum[lch], energym0_per_ts); + // pack sample, energy as 64 bit value + unsigned long long int old = shrMethod0EnergySamplePair[lch], assumed; + unsigned long long int val = + (static_cast(sampleWithinWindow) << 32) + __float_as_uint(energym0_per_ts); + do { + assumed = old; + // decode energy, sample values + //int const current_sample = (assumed >> 32) & 0xffffffff; + float const current_energy = __uint_as_float(assumed & 0xffffffff); + if (energym0_per_ts > current_energy) + old = atomicCAS(&shrMethod0EnergySamplePair[lch], assumed, val); + else + break; + } while (assumed != old); + } + __syncthreads(); + + // NOTE: must take soi, as values for that thread are used... + if (sampleWithinWindow == soi) { + auto const method0_energy = shrMethod0EnergyAccum[lch]; + auto const val = shrMethod0EnergySamplePair[lch]; + int const max_sample = (val >> 32) & 0xffffffff; + float const max_energy = __uint_as_float(val & 0xffffffff); + float const max_energy_1 = + max_sample < nsamplesForCompute - 1 ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1] : 0.f; + float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample; + auto const sum = max_energy + max_energy_1; + // FIXME: for full comparison with cpu method 0 timing, + // need to correct by slew + // requires an accumulator -> more shared mem -> omit here unless + // really needed + float const time = + max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position; + + // store method0 quantities to global mem + outputdid[gch] = id; + method0Energy[gch] = method0_energy; + method0Time[gch] = time; + +#ifdef HCAL_MAHI_GPUDEBUG + printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n", shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, ts4Thresh); +#endif + + // check as in cpu version if mahi is not needed + // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal + // are basically equal and generate -0.00000... + // needs to be treated properly + if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 > ts4Thresh)) { + // do not need to run mahi minimization + //outputEnergy[gch] = 0; energy already inited to 0 + outputChi2[gch] = -9999.f; + } + +#ifdef HCAL_MAHI_GPUDEBUG + printf("method0_energy = %f max_sample = %d max_energy = %f time = %f\n", + method0_energy, + max_sample, + max_energy, + time); +#endif + } + + // + // preparations for mahi fit + // + auto const amplitude = rawCharge - pedestalToUseForMethod0; + auto const noiseADC = (1. / std::sqrt(12)) * dfc; + auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f; + auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth; + +#ifdef HCAL_MAHI_GPUDEBUG + printf( + "charrge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f noisPhoto(%d) = " + "%f\n", + sample, + rawCharge, + sample, + pedestalToUseForMethod0, + sample, + dfc, + sample, + pedestalWidth, + sample, + noiseADC, + sample, + noisePhotoSq); +#endif + + // store to global memory + amplitudesForChannel[sampleWithinWindow] = amplitude; + noiseTermsForChannel[sampleWithinWindow] = noiseTerm; + } + + // TODO: need to add an array of offsets for pulses (a la activeBXs...) + // Assume for now 8 pulses + __global__ void kernel_prep_pulseMatrices_sameNumberOfSamples(float* pulseMatrices, + float* pulseMatricesM, + float* pulseMatricesP, + int const* pulseOffsets, + float const* amplitudes, + uint32_t const* idsf01HE, + uint32_t const* idsf5HB, + uint32_t const* idsf3HB, + uint32_t const nchannelsf01HE, + uint32_t const nchannelsf5HB, + uint32_t const nchannelsTotal, + int8_t const* soiSamples, + uint32_t const* recoPulseShapeIds, + float const* acc25nsVecValues, + float const* diff25nsItvlVecValues, + float const* accVarLenIdxMinusOneVecValues, + float const* diffVarItvlIdxMinusOneVecValues, + float const* accVarLenIdxZeroVecValues, + float const* diffVarItvlIdxZeroVecValues, + float const meanTime, + float const timeSigmaSiPM, + float const timeSigmaHPD, + int const maxDepthHB, + int const maxDepthHE, + int const maxPhiHE, + int const firstHBRing, + int const lastHBRing, + int const firstHERing, + int const lastHERing, + int const nEtaHB, + int const nEtaHE, + uint32_t const offsetForHashes, + bool const applyTimeSlew, + float const tzeroTimeSlew, + float const slopeTimeSlew, + float const tmaxTimeSlew) { + // indices + auto const ipulse = threadIdx.y; + auto const npulses = blockDim.y; + auto const sample = threadIdx.x; + auto const nsamples = blockDim.x; + auto const lch = threadIdx.z; + auto const gch = lch + blockIdx.x * blockDim.z; + auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB; + + if (gch >= nchannelsTotal) + return; + + // conditions + auto const id = gch < nchannelsf01HE + ? idsf01HE[gch] + : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]); + //auto const id = gch >= nchannelsf01HE + // ? idsf5HB[gch - nchannelsf01HE] + // : idsf01HE[gch]; + auto const deltaT = gch >= nchannelsf01HE && gch < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM; + auto const did = DetId{id}; + auto const hashedId = + did.subdetId() == HcalBarrel + ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB) + : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + + offsetForHashes; + auto const recoPulseShapeId = recoPulseShapeIds[hashedId]; + auto const* acc25nsVec = acc25nsVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin; + auto const* diff25nsItvlVec = diff25nsItvlVecValues + recoPulseShapeId * hcal::constants::maxPSshapeBin; + auto const* accVarLenIdxMinusOneVec = accVarLenIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX; + auto const* diffVarItvlIdxMinusOneVec = + diffVarItvlIdxMinusOneVecValues + recoPulseShapeId * hcal::constants::nsPerBX; + auto const* accVarLenIdxZeroVec = accVarLenIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX; + auto const* diffVarItvlIdxZeroVec = diffVarItvlIdxZeroVecValues + recoPulseShapeId * hcal::constants::nsPerBX; + + // offset output arrays + auto* pulseMatrix = pulseMatrices + nsamples * npulses * gch; + auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * gch; + auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * gch; + + // amplitude per ipulse + int const soi = soiSamples[gch]; + int const pulseOffset = pulseOffsets[ipulse]; + auto const amplitude = amplitudes[gch * nsamples + pulseOffset + soi]; + +#ifdef HCAL_MAHI_GPUDEBUG +#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID + if (id != DETID_TO_DEBUG) + return; +#endif +#endif + +#ifdef HCAL_MAHI_GPUDEBUG + if (sample == 0 && ipulse == 0) { + for (int i = 0; i < 8; i++) + printf("amplitude(%d) = %f\n", i, amplitudes[gch * nsamples + i]); + printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId); + for (int i = 0; i < 256; i++) { + printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]); + } + printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n"); + for (int i = 0; i < 25; i++) { + printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n", + i, + accVarLenIdxZeroVec[i], + i, + accVarLenIdxMinusOneVec[i]); + } + printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n"); + for (int i = 0; i < 25; i++) { + printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n", + i, + diffVarItvlIdxZeroVec[i], + i, + diffVarItvlIdxMinusOneVec[i]); + } + } +#endif + + auto t0 = meanTime; + if (applyTimeSlew) { + if (amplitude <= 1.0f) + t0 += hcal::reconstruction::compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew); + else + t0 += hcal::reconstruction::compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew); + } + auto const t0m = -deltaT + t0; + auto const t0p = deltaT + t0; + +#ifdef HCAL_MAHI_GPUDEBUG + if (sample == 0 && ipulse == 0) { + printf("time values: %f %f %f\n", t0, t0m, t0p); + } + + if (sample == 0 && ipulse == 0) { + for (int i = 0; i < hcal::constants::maxSamples; i++) { + auto const value = hcal::reconstruction::compute_pulse_shape_value(t0, + i, + 0, + acc25nsVec, + diff25nsItvlVec, + accVarLenIdxMinusOneVec, + diffVarItvlIdxMinusOneVec, + accVarLenIdxZeroVec, + diffVarItvlIdxZeroVec); + printf("pulse(%d) = %f\n", i, value); + } + printf("\n"); + for (int i = 0; i < hcal::constants::maxSamples; i++) { + auto const value = hcal::reconstruction::compute_pulse_shape_value(t0p, + i, + 0, + acc25nsVec, + diff25nsItvlVec, + accVarLenIdxMinusOneVec, + diffVarItvlIdxMinusOneVec, + accVarLenIdxZeroVec, + diffVarItvlIdxZeroVec); + printf("pulseP(%d) = %f\n", i, value); + } + printf("\n"); + for (int i = 0; i < hcal::constants::maxSamples; i++) { + auto const value = hcal::reconstruction::compute_pulse_shape_value(t0m, + i, + 0, + acc25nsVec, + diff25nsItvlVec, + accVarLenIdxMinusOneVec, + diffVarItvlIdxMinusOneVec, + accVarLenIdxZeroVec, + diffVarItvlIdxZeroVec); + printf("pulseM(%d) = %f\n", i, value); + } + } +#endif + + // FIXME: shift should be treated properly, + // here assume 8 time slices and 8 samples + auto const shift = 4 - soi; // as in cpu version! + + // auto const offset = ipulse - soi; + // auto const idx = sample - offset; + int32_t const idx = sample - pulseOffset; + auto const value = idx >= 0 && idx < nsamples + ? hcal::reconstruction::compute_pulse_shape_value(t0, + idx, + shift, + acc25nsVec, + diff25nsItvlVec, + accVarLenIdxMinusOneVec, + diffVarItvlIdxMinusOneVec, + accVarLenIdxZeroVec, + diffVarItvlIdxZeroVec) + : 0; + auto const value_t0m = idx >= 0 && idx < nsamples + ? hcal::reconstruction::compute_pulse_shape_value(t0m, + idx, + shift, + acc25nsVec, + diff25nsItvlVec, + accVarLenIdxMinusOneVec, + diffVarItvlIdxMinusOneVec, + accVarLenIdxZeroVec, + diffVarItvlIdxZeroVec) + : 0; + auto const value_t0p = idx >= 0 && idx < nsamples + ? hcal::reconstruction::compute_pulse_shape_value(t0p, + idx, + shift, + acc25nsVec, + diff25nsItvlVec, + accVarLenIdxMinusOneVec, + diffVarItvlIdxMinusOneVec, + accVarLenIdxZeroVec, + diffVarItvlIdxZeroVec) + : 0; + + // store to global + if (amplitude > 0.f) { + pulseMatrix[ipulse * nsamples + sample] = value; + pulseMatrixM[ipulse * nsamples + sample] = value_t0m; + pulseMatrixP[ipulse * nsamples + sample] = value_t0p; + } else { + pulseMatrix[ipulse * nsamples + sample] = 0.f; + pulseMatrixM[ipulse * nsamples + sample] = 0.f; + pulseMatrixP[ipulse * nsamples + sample] = 0.f; + } + } + + template + __forceinline__ __device__ void update_covariance( + calo::multifit::ColumnVector const& resultAmplitudesVector, + calo::multifit::MapSymM& covarianceMatrix, + Eigen::Map> const& pulseMatrix, + Eigen::Map> const& pulseMatrixM, + Eigen::Map> const& pulseMatrixP) { +#pragma unroll + for (int ipulse = 0; ipulse < NPULSES; ipulse++) { + auto const resultAmplitude = resultAmplitudesVector(ipulse); + if (resultAmplitude == 0) + continue; + +#ifdef HCAL_MAHI_GPUDEBUG + printf("pulse cov array for ibx = %d\n", ipulse); +#endif + + // preload a column + float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES]; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) { + pmcol[counter] = __ldg(&pulseMatrix.coeffRef(counter, ipulse)); + pmpcol[counter] = __ldg(&pulseMatrixP.coeffRef(counter, ipulse)); + pmmcol[counter] = __ldg(&pulseMatrixM.coeffRef(counter, ipulse)); + } + + auto const ampl2 = resultAmplitude * resultAmplitude; +#pragma unroll + for (int col = 0; col < NSAMPLES; col++) { + auto const valueP_col = pmpcol[col]; + auto const valueM_col = pmmcol[col]; + auto const value_col = pmcol[col]; + auto const tmppcol = valueP_col - value_col; + auto const tmpmcol = valueM_col - value_col; + + // diagonal + auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol); + covarianceMatrix(col, col) += ampl2 * tmp_value; + +// FIXME: understand if this actually gets unrolled +#pragma unroll + for (int row = col + 1; row < NSAMPLES; row++) { + float const valueP_row = pmpcol[row]; //pulseMatrixP(j, ipulseReal); + float const value_row = pmcol[row]; //pulseMatrix(j, ipulseReal); + float const valueM_row = pmmcol[row]; //pulseMatrixM(j, ipulseReal); + + float tmpprow = valueP_row - value_row; + float tmpmrow = valueM_row - value_row; + + auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow); + + covarianceMatrix(row, col) += ampl2 * covValue; + } + } + } + } + + template + __global__ void kernel_minimize(float* outputEnergy, + float* outputChi2, + float const* __restrict__ inputAmplitudes, + float const* __restrict__ pulseMatrices, + float const* __restrict__ pulseMatricesM, + float const* __restrict__ pulseMatricesP, + int const* __restrict__ pulseOffsetValues, + float const* __restrict__ noiseTerms, + int8_t const* __restrict__ soiSamples, + float const* __restrict__ pedestalWidths, + float const* __restrict__ effectivePedestalWidths, + bool const useEffectivePedestals, + uint32_t const* __restrict__ idsf01HE, + uint32_t const* __restrict__ idsf5HB, + uint32_t const* __restrict__ idsf3HB, + float const* __restrict__ gainValues, + float const* __restrict__ respCorrectionValues, + uint32_t const nchannelsf01HE, + uint32_t const nchannelsf5HB, + uint32_t const nchannelsTotal, + uint32_t const offsetForHashes, + int const maxDepthHB, + int const maxDepthHE, + int const maxPhiHE, + int const firstHBRing, + int const lastHBRing, + int const firstHERing, + int const lastHERing, + int const nEtaHB, + int const nEtaHE) { + // can be relaxed if needed - minor updates are needed in that case! + static_assert(NPULSES == NSAMPLES); + + // indices + auto const gch = threadIdx.x + blockIdx.x * blockDim.x; + auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB; + if (gch >= nchannelsTotal) + return; + + // if chi2 is set to -9999 do not run minimization + if (outputChi2[gch] == -9999.f) + return; + + // configure shared mem + extern __shared__ char shrmem[]; + float* shrMatrixLFnnlsStorage = + reinterpret_cast(shrmem) + calo::multifit::MapSymM::total * threadIdx.x; + float* shrAtAStorage = reinterpret_cast(shrmem) + + calo::multifit::MapSymM::total * (threadIdx.x + blockDim.x); + + // conditions for pedestal widths + auto const id = gch < nchannelsf01HE + ? idsf01HE[gch] + : (gch < nchannelsf015 ? idsf5HB[gch - nchannelsf01HE] : idsf3HB[gch - nchannelsf015]); + auto const did = DetId{id}; + auto const hashedId = + did.subdetId() == HcalBarrel + ? hcal::reconstruction::did2linearIndexHB(id, maxDepthHB, firstHBRing, lastHBRing, nEtaHB) + : hcal::reconstruction::did2linearIndexHE(id, maxDepthHE, maxPhiHE, firstHERing, lastHERing, nEtaHE) + + offsetForHashes; + + auto const* pedestalWidthsForChannel = useEffectivePedestals && (gch < nchannelsf01HE || gch >= nchannelsf015) + ? effectivePedestalWidths + hashedId * 4 + : pedestalWidths + hashedId * 4; + auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] + + pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] + + pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] + + pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]); + auto const* gains = gainValues + hashedId * 4; + // FIXME on cpu ts 0 capid was used - does it make any difference + auto const gain = gains[0]; + auto const respCorrection = respCorrectionValues[hashedId]; + +#ifdef HCAL_MAHI_GPUDEBUG +#ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID + if (id != DETID_TO_DEBUG) + return; +#endif +#endif + + /* + // TODO: provide this properly + int const soi = soiSamples[gch]; + */ + calo::multifit::ColumnVector pulseOffsets; +#pragma unroll + for (int i = 0; i < NPULSES; ++i) + pulseOffsets(i) = i; + // pulseOffsets(i) = pulseOffsetValues[i] - pulseOffsetValues[0]; + + // output amplitudes/weights + calo::multifit::ColumnVector resultAmplitudesVector = calo::multifit::ColumnVector::Zero(); + + // map views + Eigen::Map> inputAmplitudesView{inputAmplitudes + gch * NSAMPLES}; + Eigen::Map> noiseTermsView{noiseTerms + gch * NSAMPLES}; + Eigen::Map> glbPulseMatrixMView{pulseMatricesM + + gch * NSAMPLES * NPULSES}; + Eigen::Map> glbPulseMatrixPView{pulseMatricesP + + gch * NSAMPLES * NPULSES}; + Eigen::Map> glbPulseMatrixView{pulseMatrices + + gch * NSAMPLES * NPULSES}; + +#ifdef HCAL_MAHI_GPUDEBUG + for (int i = 0; i < NSAMPLES; i++) + printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i)); + for (int i = 0; i < NSAMPLES; i++) { + for (int j = 0; j < NPULSES; j++) + printf("%f ", glbPulseMatrixView(i, j)); + printf("\n"); + } + printf("\n"); + for (int i = 0; i < NSAMPLES; i++) { + for (int j = 0; j < NPULSES; j++) + printf("%f ", glbPulseMatrixMView(i, j)); + printf("\n"); + } + printf("\n"); + for (int i = 0; i < NSAMPLES; i++) { + for (int j = 0; j < NPULSES; j++) + printf("%f ", glbPulseMatrixPView(i, j)); + printf("\n"); + } +#endif + + int npassive = 0; + float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f; + for (int iter = 1; iter < nMaxItersMin; iter++) { + //float covarianceMatrixStorage[MapSymM::total]; + // NOTE: only works when NSAMPLES == NPULSES + // if does not hold -> slightly rearrange shared mem to still reuse + // shared memory + float* covarianceMatrixStorage = shrMatrixLFnnlsStorage; + calo::multifit::MapSymM covarianceMatrix{covarianceMatrixStorage}; +#pragma unroll + for (int counter = 0; counter < calo::multifit::MapSymM::total; counter++) + covarianceMatrixStorage[counter] = averagePedestalWidth2; +#pragma unroll + for (int counter = 0; counter < calo::multifit::MapSymM::stride; counter++) + covarianceMatrix(counter, counter) += __ldg(&noiseTermsView.coeffRef(counter)); + + // update covariance matrix + update_covariance( + resultAmplitudesVector, covarianceMatrix, glbPulseMatrixView, glbPulseMatrixMView, glbPulseMatrixPView); + +#ifdef HCAL_MAHI_GPUDEBUG + printf("covariance matrix\n"); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) + printf("%f ", covarianceMatrix(i, j)); + printf("\n"); + } +#endif + + // compute Cholesky Decomposition L matrix + //matrixDecomposition.compute(covarianceMatrix); + //auto const& matrixL = matrixDecomposition.matrixL(); + float matrixLStorage[calo::multifit::MapSymM::total]; + calo::multifit::MapSymM matrixL{matrixLStorage}; + calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix); + + // + // replace eigen + // + //auto const& A = matrixDecomposition + // .matrixL() + // .solve(pulseMatrixView); + calo::multifit::ColMajorMatrix A; + calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL); + + // + // remove eigen + // + //auto const& b = matrixL + // .solve(inputAmplitudesView); + // + float reg_b[NSAMPLES]; + calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL); + + // TODO: we do not really need to change these matrcies + // will be fixed in the optimized version + //ColMajorMatrix AtA = A.transpose() * A; + //ColumnVector Atb = A.transpose() * b; + //ColMajorMatrix AtA; + //float AtAStorage[MapSymM::total]; + calo::multifit::MapSymM AtA{shrAtAStorage}; + calo::multifit::ColumnVector Atb; +#pragma unroll + for (int icol = 0; icol < NPULSES; icol++) { + float reg_ai[NSAMPLES]; + +// load column icol +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + reg_ai[counter] = A(counter, icol); + + // compute diagonal + float sum = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum += reg_ai[counter] * reg_ai[counter]; + + // store + AtA(icol, icol) = sum; + +// go thru the other columns +#pragma unroll + for (int j = icol + 1; j < NPULSES; j++) { + // load column j + float reg_aj[NSAMPLES]; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + reg_aj[counter] = A(counter, j); + + // accum + float sum = 0.f; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum += reg_aj[counter] * reg_ai[counter]; + + // store + //AtA(icol, j) = sum; + AtA(j, icol) = sum; + } + + // Atb accum + float sum_atb = 0; +#pragma unroll + for (int counter = 0; counter < NSAMPLES; counter++) + sum_atb += reg_ai[counter] * reg_b[counter]; + + // store atb + Atb(icol) = sum_atb; + } + +#ifdef HCAL_MAHI_GPUDEBUG + printf("AtA\n"); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) + printf("%f ", AtA(i, j)); + printf("\n"); + } + printf("Atb\n"); + for (int i = 0; i < 8; i++) + printf("%f ", Atb(i)); + printf("\n"); + printf("result Amplitudes before nnls\n"); + for (int i = 0; i < 8; i++) + printf("%f ", resultAmplitudesVector(i)); + printf("\n"); +#endif + + // for fnnls + calo::multifit::MapSymM matrixLForFnnls{shrMatrixLFnnlsStorage}; + + // run fast nnls + calo::multifit::fnnls( + AtA, Atb, resultAmplitudesVector, npassive, pulseOffsets, matrixLForFnnls, nnlsThresh, nMaxItersNNLS, 10, 10); + +#ifdef HCAL_MAHI_GPUDEBUG + printf("result Amplitudes\n"); + for (int i = 0; i < 8; i++) + printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i)); +#endif + + calo::multifit::calculateChiSq(matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2); + + auto const deltaChi2 = std::abs(chi2 - previous_chi2); + if (chi2 == chi2_2itersback && chi2 < previous_chi2) + break; + + // update + chi2_2itersback = previous_chi2; + previous_chi2 = chi2; + + // exit condition + if (deltaChi2 < deltaChi2Threashold) + break; + } + +#ifdef HCAL_MAHI_GPUDEBUG + for (int i = 0; i < NPULSES; i++) + printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n", i, pulseOffsets(i), i, resultAmplitudesVector(i)); + printf("chi2 = %f\n", chi2); +#endif + + outputChi2[gch] = chi2; + auto const idx_for_energy = std::abs(pulseOffsetValues[0]); + outputEnergy[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection; + /* + #pragma unroll + for (int i=0; i(inputGPU.f01HEDigis.stride); + auto const f5nsamples = compute_nsamples(inputGPU.f5HBDigis.stride); + auto const f3nsamples = compute_nsamples(inputGPU.f3HBDigis.stride); + int constexpr windowSize = 8; + int const startingSample = f01nsamples - windowSize; + assert(startingSample == 0 || startingSample == 2); + if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride > 0) + assert(f01nsamples == f5nsamples); + if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f3HBDigis.stride > 0) + assert(f01nsamples == f3nsamples); + + dim3 threadsPerBlock{windowSize, configParameters.kprep1dChannelsPerBlock}; + int blocks = static_cast(threadsPerBlock.y) > totalChannels + ? 1 + : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y; + int nbytesShared = + ((2 * windowSize + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock; + hcal::mahi::kernel_prep1d_sameNumberOfSamples<<>>( + scratch.amplitudes.get(), + scratch.noiseTerms.get(), + outputGPU.recHits.energy.get(), + outputGPU.recHits.chi2.get(), + inputGPU.f01HEDigis.data.get(), + inputGPU.f5HBDigis.data.get(), + inputGPU.f3HBDigis.data.get(), + inputGPU.f01HEDigis.ids.get(), + inputGPU.f5HBDigis.ids.get(), + inputGPU.f3HBDigis.ids.get(), + inputGPU.f01HEDigis.stride, + inputGPU.f5HBDigis.stride, + inputGPU.f3HBDigis.stride, + inputGPU.f01HEDigis.size, + inputGPU.f5HBDigis.size, + inputGPU.f5HBDigis.npresamples.get(), + scratch.soiSamples.get(), + outputGPU.recHits.energyM0.get(), + outputGPU.recHits.timeM0.get(), + outputGPU.recHits.did.get(), + totalChannels, + conditions.recoParams.param1, + conditions.recoParams.param2, + conditions.qieCoders.offsets, + conditions.qieCoders.slopes, + conditions.qieTypes.values, + conditions.pedestalWidths.values, + conditions.effectivePedestalWidths.values, + conditions.pedestals.values, + conditions.convertedEffectivePedestals ? conditions.convertedEffectivePedestals->values + : conditions.pedestals.values, + configParameters.useEffectivePedestals, + conditions.sipmParameters.type, + conditions.sipmParameters.fcByPE, + conditions.sipmCharacteristics.parLin1, + conditions.sipmCharacteristics.parLin2, + conditions.sipmCharacteristics.parLin3, + conditions.gains.values, + conditions.respCorrs.values, + conditions.topology->maxDepthHB(), + conditions.topology->maxDepthHE(), + conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) + : hcal::reconstruction::IPHI_MAX, + conditions.topology->firstHBRing(), + conditions.topology->lastHBRing(), + conditions.topology->firstHERing(), + conditions.topology->lastHERing(), + conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1, + conditions.topology->firstHERing() > conditions.topology->lastHERing() + ? 0 + : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1), + configParameters.sipmQTSShift, + configParameters.sipmQNTStoSum, + configParameters.firstSampleShift, + conditions.offsetForHashes, + configParameters.ts4Thresh, + startingSample); + cudaCheck(cudaGetLastError()); + + // 1024 is the max threads per block for gtx1080 + // FIXME: take this from cuda service or something like that + uint32_t const channelsPerBlock = 1024 / (windowSize * conditions.pulseOffsetsHost.size()); + dim3 threadsPerBlock2{windowSize, static_cast(conditions.pulseOffsetsHost.size()), channelsPerBlock}; + int blocks2 = + threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z; + +#ifdef HCAL_MAHI_CPUDEBUG + std::cout << "threads: " << threadsPerBlock2.x << " " << threadsPerBlock2.y << " " << threadsPerBlock2.z + << std::endl; + std::cout << "blocks: " << blocks2 << std::endl; +#endif + + hcal::mahi::kernel_prep_pulseMatrices_sameNumberOfSamples<<>>( + scratch.pulseMatrices.get(), + scratch.pulseMatricesM.get(), + scratch.pulseMatricesP.get(), + conditions.pulseOffsets.values, + scratch.amplitudes.get(), + inputGPU.f01HEDigis.ids.get(), + inputGPU.f5HBDigis.ids.get(), + inputGPU.f3HBDigis.ids.get(), + inputGPU.f01HEDigis.size, + inputGPU.f5HBDigis.size, + totalChannels, + scratch.soiSamples.get(), + conditions.recoParams.ids, + conditions.recoParams.acc25nsVec, + conditions.recoParams.diff25nsItvlVec, + conditions.recoParams.accVarLenIdxMinusOneVec, + conditions.recoParams.diffVarItvlIdxMinusOneVec, + conditions.recoParams.accVarLenIdxZEROVec, + conditions.recoParams.diffVarItvlIdxZEROVec, + configParameters.meanTime, + configParameters.timeSigmaSiPM, + configParameters.timeSigmaHPD, + conditions.topology->maxDepthHB(), + conditions.topology->maxDepthHE(), + conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) + : hcal::reconstruction::IPHI_MAX, + conditions.topology->firstHBRing(), + conditions.topology->lastHBRing(), + conditions.topology->firstHERing(), + conditions.topology->lastHERing(), + conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1, + conditions.topology->firstHERing() > conditions.topology->lastHERing() + ? 0 + : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1), + conditions.offsetForHashes, + configParameters.applyTimeSlew, + configParameters.tzeroTimeSlew, + configParameters.slopeTimeSlew, + configParameters.tmaxTimeSlew); + cudaCheck(cudaGetLastError()); + + // number of samples is checked in above assert + if (conditions.pulseOffsetsHost.size() == 8u) { + // FIXME: provide constants from configuration + uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0]; + uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock; + auto const nbytesShared = 2 * threadsPerBlock * calo::multifit::MapSymM::total * sizeof(float); + hcal::mahi::kernel_minimize<8, 8><<>>( + outputGPU.recHits.energy.get(), + outputGPU.recHits.chi2.get(), + scratch.amplitudes.get(), + scratch.pulseMatrices.get(), + scratch.pulseMatricesM.get(), + scratch.pulseMatricesP.get(), + conditions.pulseOffsets.values, + scratch.noiseTerms.get(), + scratch.soiSamples.get(), + conditions.pedestalWidths.values, + conditions.effectivePedestalWidths.values, + configParameters.useEffectivePedestals, + inputGPU.f01HEDigis.ids.get(), + inputGPU.f5HBDigis.ids.get(), + inputGPU.f3HBDigis.ids.get(), + conditions.gains.values, + conditions.respCorrs.values, + inputGPU.f01HEDigis.size, + inputGPU.f5HBDigis.size, + totalChannels, + conditions.offsetForHashes, + conditions.topology->maxDepthHB(), + conditions.topology->maxDepthHE(), + conditions.recConstants->getNPhi(1) > hcal::reconstruction::IPHI_MAX ? conditions.recConstants->getNPhi(1) + : hcal::reconstruction::IPHI_MAX, + conditions.topology->firstHBRing(), + conditions.topology->lastHBRing(), + conditions.topology->firstHERing(), + conditions.topology->lastHERing(), + conditions.recConstants->getEtaRange(0).second - conditions.recConstants->getEtaRange(0).first + 1, + conditions.topology->firstHERing() > conditions.topology->lastHERing() + ? 0 + : (conditions.topology->lastHERing() - conditions.topology->firstHERing() + 1)); + } else { + throw cms::Exception("Invalid MahiGPU configuration") + << "Currently support only 8 pulses and 8 time samples and provided: " << f01nsamples << " samples and " + << conditions.pulseOffsetsHost.size() << " pulses" << std::endl; + } + } + + } // namespace reconstruction +} // namespace hcal diff --git a/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h b/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h new file mode 100644 index 0000000000000..c0bb499b517a7 --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/src/SimpleAlgoGPU.h @@ -0,0 +1,19 @@ +#ifndef RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h +#define RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h + +#include "DeclsForKernels.h" + +namespace hcal { + namespace reconstruction { + + void entryPoint(InputDataGPU const&, + OutputDataGPU&, + ConditionsProducts const&, + ScratchDataGPU&, + ConfigParameters const&, + cudaStream_t); + + } +} // namespace hcal + +#endif // RecoLocalCalo_HcalRecProducers_src_SimpleAlgoGPU_h diff --git a/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py new file mode 100644 index 0000000000000..2b97efc2f2d8c --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_plots.py @@ -0,0 +1,28 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process("PLOT") + +process.load("FWCore.MessageService.MessageLogger_cfi") +process.options = cms.untracked.PSet( + wantSummary = cms.untracked.bool(False) +) + +process.load('Configuration.StandardSequences.GeometryRecoDB_cff') +process.load("Configuration.StandardSequences.FrontierConditions_GlobalTag_cff") +from Configuration.AlCa.GlobalTag import GlobalTag +process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:run2_hlt_relval', '') + +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(-1) ) +process.MessageLogger.cerr.FwkReport.reportEvery = 500 + +process.source = cms.Source("PoolSource", + fileNames = cms.untracked.vstring('file:GPUvsCPU_HCAL_rechits.root') +) + +process.comparisonPlots = cms.EDAnalyzer('HCALGPUAnalyzer') + +process.TFileService = cms.Service('TFileService', + fileName = cms.string('GPUvsCPU_HCAL_plots.root') +) + +process.path = cms.Path(process.comparisonPlots) diff --git a/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py new file mode 100644 index 0000000000000..32d4104a842ef --- /dev/null +++ b/RecoLocalCalo/HcalRecProducers/test/make_GPUvsCPU_HCAL_rechits.py @@ -0,0 +1,152 @@ +import FWCore.ParameterSet.Config as cms + +from Configuration.StandardSequences.Eras import eras +#from Configuration.ProcessModifiers.gpu_cff import gpu + +process = cms.Process('RECOgpu', eras.Run2_2018) + +# import of standard configurations +process.load('Configuration.StandardSequences.Services_cff') +process.load('FWCore.MessageService.MessageLogger_cfi') +process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi') + +process.load('Configuration.StandardSequences.FrontierConditions_GlobalTag_cff') +from Configuration.AlCa.GlobalTag import GlobalTag +process.GlobalTag = GlobalTag(process.GlobalTag, 'auto:run2_hlt_relval', '') + +process.maxEvents = cms.untracked.PSet( + input = cms.untracked.int32(1000) +) + +#----------------------------------------- +# INPUT +#----------------------------------------- + +process.source = cms.Source("PoolSource", + fileNames = cms.untracked.vstring('/store/data/Run2018D/EphemeralHLTPhysics1/RAW/v1/000/323/775/00000/A27DFA33-8FCB-BE42-A2D2-1A396EEE2B6E.root') +) + +process.hltGetRaw = cms.EDAnalyzer( "HLTGetRaw", + RawDataCollection = cms.InputTag( "rawDataCollector" ) +) + +process.input = cms.Path( process.hltGetRaw ) + +#----------------------------------------- +# CMSSW/Hcal non-DQM Related Module import +#----------------------------------------- + +process.load('Configuration.StandardSequences.GeometryRecoDB_cff') +process.load("RecoLocalCalo.Configuration.hcalLocalReco_cff") +process.load("EventFilter.HcalRawToDigi.HcalRawToDigi_cfi") +process.load("RecoLuminosity.LumiProducer.bunchSpacingProducer_cfi") + +process.hcalDigis.InputLabel = cms.InputTag("rawDataCollector") + +#----------------------------------------- +# CMSSW/Hcal GPU related files +#----------------------------------------- + +process.load("RecoLocalCalo.HcalRecProducers.hbheRecHitProducerGPUTask_cff") +process.load("RecoLocalCalo.HcalRecProducers.hcalCPURecHitsProducer_cfi") +process.hcalCPURecHitsProducer.recHitsM0LabelIn = cms.InputTag("hbheRecHitProducerGPU","") +process.hcalCPURecHitsProducer.recHitsM0LabelOut = cms.string("") + +#----------------------------------------- +# Temporary customization (things not implemented on the GPU) +#----------------------------------------- + +## the one below is taken directly from the DB, regard M0 +#process.hbheprereco.algorithm.correctForPhaseContainment = cms.bool(False) + +## do always 8 pulse +process.hbheprereco.algorithm.chiSqSwitch = cms.double(-1) + +## to match hard coded setting (will be fixed on CPU) +process.hbheprereco.algorithm.nMaxItersMin = cms.int32(50) + +#----------------------------------------- +# Final Custmization for Run3 +#----------------------------------------- + +# we will not run arrival Time at HLT +process.hbheprereco.algorithm.calculateArrivalTime = cms.bool(False) + +## we do not need this +process.hbheprereco.algorithm.applyLegacyHBMCorrection = cms.bool(False) + +# we only run Mahi at HLT +process.hbheprereco.algorithm.useM3 = cms.bool(False) + +# we will not have the HPD noise flags in Run3, as will be all siPM +process.hbheprereco.setLegacyFlagsQIE8 = cms.bool(False) +process.hbheprereco.setNegativeFlagsQIE8 = cms.bool(False) +process.hbheprereco.setNoiseFlagsQIE8 = cms.bool(False) +process.hbheprereco.setPulseShapeFlagsQIE8 = cms.bool(False) + +# for testing M0 only +##process.hbheprereco.algorithm.useMahi = cms.bool(False) + +#----------------------------------------- +# OUTPUT +#----------------------------------------- + +#process.out = cms.OutputModule("AsciiOutputModule", +# outputCommands = cms.untracked.vstring( +# 'keep *_*_*_*', +# ), +# verbosity = cms.untracked.uint32(0) +#) + +process.out = cms.OutputModule("PoolOutputModule", + fileName = cms.untracked.string("GPUvsCPU_HCAL_rechits.root") +) + +#--------------- + +process.finalize = cms.EndPath(process.out) + +process.bunchSpacing = cms.Path( + process.bunchSpacingProducer +) + +#----------------------------------------- +# gpu test +#----------------------------------------- + +process.digiPathCPU = cms.Path( + process.hcalDigis +) + +process.recoPathCPU = cms.Path( + process.hbheprereco +) + +#--------------- + +## hcalCPUDigisProducer <-- this convert the GPU digi on cpu (for dqm) +process.recoPathGPU = cms.Path( + process.hbheRecHitProducerGPUSequence + * process.hcalCPURecHitsProducer +) + +#--------------- + +process.schedule = cms.Schedule( + process.input, + process.digiPathCPU, + process.recoPathCPU, + process.recoPathGPU, + process.finalize +) + +process.options = cms.untracked.PSet( + numberOfThreads = cms.untracked.uint32(8), + numberOfStreams = cms.untracked.uint32(8), + SkipEvent = cms.untracked.vstring('ProductNotFound'), + wantSummary = cms.untracked.bool(True) +) + +# report CUDAService messages +process.MessageLogger.cerr.FwkReport.reportEvery = 100 +process.MessageLogger.categories.append("CUDAService")