diff --git a/CUDADataFormats/HcalDigi/BuildFile.xml b/CUDADataFormats/HcalDigi/BuildFile.xml
new file mode 100644
index 0000000000000..fb871f16b69f0
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/BuildFile.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
new file mode 100644
index 0000000000000..e2f4bf0848e94
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h
@@ -0,0 +1,160 @@
+#ifndef CUDADataFormats_HcalDigi_interface_DigiCollection_h
+#define CUDADataFormats_HcalDigi_interface_DigiCollection_h
+
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
+
+namespace hcal {
+
+ // FLAVOR_HE_QIE11 = 1; Phase1 upgrade
+ struct Flavor1 {
+ static constexpr int WORDS_PER_SAMPLE = 1;
+ static constexpr int SAMPLES_PER_WORD = 1;
+ static constexpr int HEADER_WORDS = 1;
+
+ static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+ static constexpr uint8_t tdc(uint16_t const* const sample_start) { return (*sample_start >> 8) & 0x3f; }
+ static constexpr uint8_t soibit(uint16_t const* const sample_start) { return (*sample_start >> 14) & 0x1; }
+ };
+
+ // FLAVOR_HB_QIE11 = 3; Phase1 upgrade
+ struct Flavor3 {
+ static constexpr int WORDS_PER_SAMPLE = 1;
+ static constexpr int SAMPLES_PER_WORD = 1;
+ static constexpr int HEADER_WORDS = 1;
+
+ static constexpr uint8_t adc(uint16_t const* const sample_start) { return (*sample_start & 0xff); }
+ static constexpr uint8_t tdc(uint16_t const* const sample_start) { return ((*sample_start >> 8) & 0x3); }
+ static constexpr uint8_t soibit(uint16_t const* const sample_start) { return ((*sample_start >> 14) & 0x1); }
+ static constexpr uint8_t capid(uint16_t const* const sample_start) { return ((*sample_start >> 10) & 0x3); }
+ };
+
+ // FLAVOR_HB_QIE10 = 5; Phase0
+ struct Flavor5 {
+ static constexpr float WORDS_PER_SAMPLE = 0.5;
+ static constexpr int SAMPLES_PER_WORD = 2;
+ static constexpr int HEADER_WORDS = 1;
+
+ static constexpr uint8_t adc(uint16_t const* const sample_start, uint8_t const shifter) {
+ return ((*sample_start >> shifter * 8) & 0x7f);
+ }
+ };
+
+ template
+ constexpr uint8_t capid_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+ auto const capid_first = (*dfstart >> 8) & 0x3;
+ return (capid_first + sample) & 0x3; // same as % 4
+ }
+
+ template <>
+ constexpr uint8_t capid_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+ return Flavor3::capid(dfstart + Flavor3::HEADER_WORDS + sample * Flavor3::WORDS_PER_SAMPLE);
+ }
+
+ template
+ constexpr uint8_t soibit_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+ return Flavor::soibit(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+ }
+
+ template
+ constexpr uint8_t adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+ return Flavor::adc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+ }
+
+ template
+ constexpr uint8_t tdc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+ return Flavor::tdc(dfstart + Flavor::HEADER_WORDS + sample * Flavor::WORDS_PER_SAMPLE);
+ }
+
+ template <>
+ constexpr uint8_t adc_for_sample(uint16_t const* const dfstart, uint32_t const sample) {
+ // avoid using WORDS_PER_SAMPLE and simply shift
+ return Flavor5::adc(dfstart + Flavor5::HEADER_WORDS + (sample >> 1), sample % 2);
+ }
+
+ template
+ constexpr uint32_t compute_stride(uint32_t const nsamples) {
+ return static_cast(nsamples * Flavor::WORDS_PER_SAMPLE) + Flavor::HEADER_WORDS;
+ }
+
+ template
+ constexpr uint32_t compute_nsamples(uint32_t const nwords) {
+ if constexpr (Flavor::SAMPLES_PER_WORD >= 1)
+ return (nwords - Flavor::HEADER_WORDS) * Flavor::SAMPLES_PER_WORD;
+ else
+ return (nwords - Flavor::HEADER_WORDS) / Flavor::WORDS_PER_SAMPLE;
+ }
+
+ //
+ template
+ struct DigiCollectionBase : public ::calo::common::AddSize {
+ DigiCollectionBase() = default;
+ DigiCollectionBase(DigiCollectionBase const&) = default;
+ DigiCollectionBase& operator=(DigiCollectionBase const&) = default;
+
+ DigiCollectionBase(DigiCollectionBase&&) = default;
+ DigiCollectionBase& operator=(DigiCollectionBase&&) = default;
+
+ template
+ typename std::enable_if::value, void>::type resize(std::size_t size) {
+ ids.resize(size);
+ data.resize(size * stride);
+ }
+
+ template
+ typename std::enable_if::value, void>::type reserve(std::size_t size) {
+ ids.reserve(size);
+ data.reserve(size * stride);
+ }
+
+ template
+ typename std::enable_if::value, void>::type clear() {
+ ids.clear();
+ data.clear();
+ }
+
+ typename StoragePolicy::template StorageSelector::type ids;
+ typename StoragePolicy::template StorageSelector::type data;
+ uint32_t stride{0};
+ };
+
+ template
+ struct DigiCollection : public DigiCollectionBase {
+ using DigiCollectionBase::DigiCollectionBase;
+ };
+
+ // NOTE: base ctors will not be available
+ template
+ struct DigiCollection : public DigiCollectionBase {
+ DigiCollection() = default;
+
+ DigiCollection(DigiCollection const&) = default;
+ DigiCollection& operator=(DigiCollection const&) = default;
+
+ DigiCollection(DigiCollection&&) = default;
+ DigiCollection& operator=(DigiCollection&&) = default;
+
+ template
+ typename std::enable_if::value, void>::type resize(std::size_t size) {
+ DigiCollectionBase::resize(size);
+ npresamples.resize(size);
+ }
+
+ template
+ typename std::enable_if::value, void>::type reserve(std::size_t size) {
+ DigiCollectionBase::reserve(size);
+ npresamples.reserve(size);
+ }
+
+ template
+ typename std::enable_if::value, void>::type clear() {
+ DigiCollectionBase::clear();
+ npresamples.clear();
+ }
+
+ // add npresamples member
+ typename StoragePolicy::template StorageSelector::type npresamples;
+ };
+
+} // namespace hcal
+
+#endif // CUDADataFormats_HcalDigi_interface_DigiCollection_h
diff --git a/CUDADataFormats/HcalDigi/src/classes.h b/CUDADataFormats/HcalDigi/src/classes.h
new file mode 100644
index 0000000000000..8c4a20318928e
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/src/classes.h
@@ -0,0 +1,3 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/HcalDigi/src/classes_def.xml b/CUDADataFormats/HcalDigi/src/classes_def.xml
new file mode 100644
index 0000000000000..71997eb59ba61
--- /dev/null
+++ b/CUDADataFormats/HcalDigi/src/classes_def.xml
@@ -0,0 +1,36 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/HcalRecHitSoA/BuildFile.xml b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml
new file mode 100644
index 0000000000000..245701de5fdb0
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
new file mode 100644
index 0000000000000..424b2c0813b4c
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h
@@ -0,0 +1,38 @@
+#ifndef CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h
+#define CUDADataFormats_HcalRecHitCollectionSoA_interface_RecHitCollection_h
+
+#include
+
+#include "CUDADataFormats/CaloCommon/interface/Common.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
+namespace hcal {
+
+ template
+ struct RecHitCollection : public ::calo::common::AddSize {
+ RecHitCollection() = default;
+ RecHitCollection(const RecHitCollection&) = default;
+ RecHitCollection& operator=(const RecHitCollection&) = default;
+
+ RecHitCollection(RecHitCollection&&) = default;
+ RecHitCollection& operator=(RecHitCollection&&) = default;
+
+ typename StoragePolicy::template StorageSelector::type energy;
+ typename StoragePolicy::template StorageSelector::type chi2;
+ typename StoragePolicy::template StorageSelector::type energyM0;
+ typename StoragePolicy::template StorageSelector::type timeM0;
+ typename StoragePolicy::template StorageSelector::type did;
+
+ template
+ typename std::enable_if::value, void>::type resize(size_t size) {
+ energy.resize(size);
+ chi2.resize(size);
+ energyM0.resize(size);
+ timeM0.resize(size);
+ did.resize(size);
+ }
+ };
+
+} // namespace hcal
+
+#endif // RecoLocalCalo_HcalRecAlgos_interface_RecHitCollection_h
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes.h b/CUDADataFormats/HcalRecHitSoA/src/classes.h
new file mode 100644
index 0000000000000..a13782165c413
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes.h
@@ -0,0 +1,3 @@
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/HcalRecHitSoA/interface/RecHitCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
new file mode 100644
index 0000000000000..71dd18a7daddb
--- /dev/null
+++ b/CUDADataFormats/HcalRecHitSoA/src/classes_def.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/EventFilter/HcalRawToDigi/bin/BuildFile.xml b/EventFilter/HcalRawToDigi/bin/BuildFile.xml
new file mode 100644
index 0000000000000..7a24968df89c8
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/bin/BuildFile.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
new file mode 100644
index 0000000000000..039c38dd9df16
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp
@@ -0,0 +1,386 @@
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+
+#define CREATE_HIST_1D(varname, nbins, first, last) auto varname = new TH1D(#varname, #varname, nbins, first, last)
+
+#define CREATE_HIST_2D(varname, nbins, first, last) \
+ auto varname = new TH2D(#varname, #varname, nbins, first, last, nbins, first, last)
+
+QIE11DigiCollection filterQIE11(QIE11DigiCollection const& coll) {
+ QIE11DigiCollection out;
+ out.reserve(coll.size());
+
+ for (uint32_t i = 0; i < coll.size(); i++) {
+ auto const df = coll[i];
+ auto const id = HcalDetId{df.id()};
+ if (id.subdetId() != HcalEndcap)
+ continue;
+
+ out.push_back(QIE11DataFrame{df});
+ }
+
+ return out;
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 3) {
+ std::cout << "run with: ./ \n";
+ exit(0);
+ }
+
+ auto filterf01HE = [](QIE11DigiCollection const& coll) {
+ QIE11DigiCollection out{coll.samples(), coll.subdetId()};
+ out.reserve(coll.size());
+
+ for (uint32_t i = 0; i < coll.size(); i++) {
+ auto const df = QIE11DataFrame{coll[i]};
+ auto const id = HcalDetId{df.id()};
+ if ((df.flavor() == 0 or df.flavor() == 1) and id.subdetId() == HcalEndcap)
+ out.push_back(df);
+ }
+
+ return out;
+ };
+
+ auto filterf3HB = [](QIE11DigiCollection const& coll) {
+ QIE11DigiCollection out{coll.samples(), coll.subdetId()};
+ out.reserve(coll.size());
+
+ for (uint32_t i = 0; i < coll.size(); i++) {
+ auto const df = QIE11DataFrame{coll[i]};
+ auto const did = HcalDetId{df.id()};
+ if (df.flavor() == 3 and did.subdetId() == HcalBarrel)
+ out.push_back(df);
+ }
+
+ return out;
+ };
+
+ // branches to use
+ using Collectionf01 =
+ hcal::DigiCollection>;
+ using Collectionf5 =
+ hcal::DigiCollection>;
+ using Collectionf3 =
+ hcal::DigiCollection>;
+ edm::Wrapper* wgpuf01he = nullptr;
+ edm::Wrapper* wgpuf5hb = nullptr;
+ edm::Wrapper* wgpuf3hb = nullptr;
+ edm::Wrapper* wcpuf01he = nullptr;
+ edm::Wrapper* wcpuf5hb = nullptr;
+
+ std::string inFileName{argv[1]};
+ std::string outFileName{argv[2]};
+
+ // prep output
+ TFile rfout{outFileName.c_str(), "recreate"};
+
+ CREATE_HIST_1D(hADCf01HEGPU, 256, 0, 256);
+ CREATE_HIST_1D(hADCf01HECPU, 256, 0, 256);
+ CREATE_HIST_1D(hADCf5HBGPU, 128, 0, 128);
+ CREATE_HIST_1D(hADCf5HBCPU, 128, 0, 128);
+ CREATE_HIST_1D(hADCf3HBGPU, 256, 0, 256);
+ CREATE_HIST_1D(hADCf3HBCPU, 256, 0, 256);
+ CREATE_HIST_1D(hTDCf01HEGPU, 64, 0, 64);
+ CREATE_HIST_1D(hTDCf01HECPU, 64, 0, 64);
+
+ CREATE_HIST_2D(hADCf01HEGPUvsCPU, 256, 0, 256);
+ CREATE_HIST_2D(hADCf3HBGPUvsCPU, 256, 0, 256);
+ CREATE_HIST_2D(hADCf5HBGPUvsCPU, 128, 0, 128);
+ CREATE_HIST_2D(hTDCf01HEGPUvsCPU, 64, 0, 64);
+ CREATE_HIST_2D(hTDCf3HBGPUvsCPU, 4, 0, 4);
+
+ // prep input
+ TFile rfin{inFileName.c_str()};
+ TTree* rt = (TTree*)rfin.Get("Events");
+ rt->SetBranchAddress("QIE11DataFrameHcalDataFrameContainer_hcalDigis__RECO.", &wcpuf01he);
+ rt->SetBranchAddress("HBHEDataFramesSorted_hcalDigis__RECO.", &wcpuf5hb);
+ rt->SetBranchAddress(
+ "hcalFlavor5calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+ "f5HBDigis_RECO.",
+ &wgpuf5hb);
+ rt->SetBranchAddress(
+ "hcalFlavor1calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+ "f01HEDigis_RECO.",
+ &wgpuf01he);
+ rt->SetBranchAddress(
+ "hcalFlavor3calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_"
+ "f3HBDigis_RECO.",
+ &wgpuf3hb);
+
+ // accumulate
+ auto const nentries = rt->GetEntries();
+ std::cout << ">>> nentries = " << nentries << std::endl;
+ for (int ie = 0; ie < nentries; ++ie) {
+ rt->GetEntry(ie);
+
+ auto const& f01HEProduct = wgpuf01he->bareProduct();
+ auto const& f5HBProduct = wgpuf5hb->bareProduct();
+ auto const& f3HBProduct = wgpuf3hb->bareProduct();
+ auto const& qie11Product = wcpuf01he->bareProduct();
+ auto const qie11Filteredf01 = filterf01HE(qie11Product);
+ auto const qie11Filteredf3 = filterf3HB(qie11Product);
+ auto const& qie8Product = wcpuf5hb->bareProduct();
+
+ auto const ngpuf01he = f01HEProduct.ids.size();
+ auto const ngpuf5hb = f5HBProduct.ids.size();
+ auto const ngpuf3hb = f3HBProduct.ids.size();
+ auto const ncpuf01he = qie11Filteredf01.size();
+ auto const ncpuf5hb = qie8Product.size();
+ auto const ncpuf3hb = qie11Filteredf3.size();
+
+ /*
+ printf("ngpuf01he = %u nqie11 = %u ncpuf01he = %u ngpuf5hb = %u ncpuf5hb = %u\n",
+ f01HEProduct.size(), qie11Product.size(), qie11Filtered.size(),
+ f5HBProduct.size(),
+ static_cast(qie8Product.size()));
+ */
+
+ if (ngpuf01he != ncpuf01he) {
+ std::cerr << "*** mismatch in number of flavor 01 digis for event " << ie << std::endl
+ << ">>> ngpuf01he = " << ngpuf01he << std::endl
+ << ">>> ncpuf01he = " << ncpuf01he << std::endl;
+ }
+
+ {
+ auto const& idsgpu = f01HEProduct.ids;
+ auto const& datagpu = f01HEProduct.data;
+
+ for (uint32_t ich = 0; ich < ncpuf01he; ich++) {
+ auto const cpudf = QIE11DataFrame{qie11Filteredf01[ich]};
+ auto const cpuid = cpudf.id();
+ auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+
+ if (iter2idgpu == idsgpu.end()) {
+ std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+ continue;
+ }
+
+ // FIXME: cna fail...
+ assert(*iter2idgpu == cpuid);
+
+ auto const ptrdiff = iter2idgpu - idsgpu.begin();
+ auto const nsamples_gpu = hcal::compute_nsamples(f01HEProduct.stride);
+ auto const nsamples_cpu = qie11Filteredf01.samples();
+ assert(static_cast(nsamples_cpu) == nsamples_gpu);
+
+ uint32_t ichgpu = ptrdiff;
+ uint32_t offset = ichgpu * f01HEProduct.stride;
+ uint16_t const* df_start = datagpu.data() + offset;
+ for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+ auto const cpuadc = cpudf[sample].adc();
+ auto const gpuadc = hcal::adc_for_sample(df_start, sample);
+ auto const cputdc = cpudf[sample].tdc();
+ auto const gputdc = hcal::tdc_for_sample(df_start, sample);
+ auto const cpucapid = cpudf[sample].capid();
+ auto const gpucapid = hcal::capid_for_sample(df_start, sample);
+
+ hADCf01HEGPU->Fill(gpuadc);
+ hADCf01HECPU->Fill(cpuadc);
+ hTDCf01HEGPU->Fill(gputdc);
+ hTDCf01HECPU->Fill(cputdc);
+ hADCf01HEGPUvsCPU->Fill(cpuadc, gpuadc);
+ hTDCf01HEGPUvsCPU->Fill(cputdc, gputdc);
+
+ // At RAW Decoding level there must not be any mistmatches
+ // in the adc values at all!
+ assert(static_cast(cpuadc) == gpuadc);
+ assert(static_cast(cputdc) == gputdc);
+ assert(static_cast(cpucapid) == gpucapid);
+ }
+ }
+ }
+
+ if (ngpuf3hb != ncpuf3hb) {
+ std::cerr << "*** mismatch in number of flavor 3 digis for event " << ie << std::endl
+ << ">>> ngpuf01he = " << ngpuf3hb << std::endl
+ << ">>> ncpuf01he = " << ncpuf3hb << std::endl;
+ }
+
+ {
+ auto const& idsgpu = f3HBProduct.ids;
+ auto const& datagpu = f3HBProduct.data;
+
+ for (uint32_t ich = 0; ich < ncpuf3hb; ich++) {
+ auto const cpudf = QIE11DataFrame{qie11Filteredf3[ich]};
+ auto const cpuid = cpudf.id();
+ auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+
+ if (iter2idgpu == idsgpu.end()) {
+ std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+ continue;
+ }
+
+ // FIXME: cna fail...
+ assert(*iter2idgpu == cpuid);
+
+ auto const ptrdiff = iter2idgpu - idsgpu.begin();
+ auto const nsamples_gpu = hcal::compute_nsamples(f3HBProduct.stride);
+ auto const nsamples_cpu = qie11Filteredf3.samples();
+ assert(static_cast(nsamples_cpu) == nsamples_gpu);
+
+ uint32_t ichgpu = ptrdiff;
+ uint32_t offset = ichgpu * f3HBProduct.stride;
+ uint16_t const* df_start = datagpu.data() + offset;
+ for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+ auto const cpuadc = cpudf[sample].adc();
+ auto const gpuadc = hcal::adc_for_sample(df_start, sample);
+ auto const cputdc = cpudf[sample].tdc();
+ auto const gputdc = hcal::tdc_for_sample(df_start, sample);
+
+ hADCf3HBGPU->Fill(gpuadc);
+ hADCf3HBCPU->Fill(cpuadc);
+ hADCf3HBGPUvsCPU->Fill(cpuadc, gpuadc);
+ hTDCf3HBGPUvsCPU->Fill(cputdc, gputdc);
+
+ // At RAW Decoding level there must not be any mistmatches
+ // in the adc values at all!
+ assert(static_cast(cpuadc) == gpuadc);
+ assert(static_cast(cputdc) == gputdc);
+ }
+ }
+ }
+
+ if (ngpuf5hb != ncpuf5hb) {
+ std::cerr << "*** mismatch in number of flavor 5 digis for event " << ie << std::endl
+ << ">>> ngpuf5hb = " << ngpuf5hb << std::endl
+ << ">>> ncpuf5hb = " << ncpuf5hb << std::endl;
+ }
+
+ {
+ auto const& idsgpu = f5HBProduct.ids;
+ auto const& datagpu = f5HBProduct.data;
+ for (uint32_t i = 0; i < ncpuf5hb; i++) {
+ auto const cpudf = qie8Product[i];
+ auto const cpuid = cpudf.id().rawId();
+ auto iter2idgpu = std::find(idsgpu.begin(), idsgpu.end(), cpuid);
+ if (iter2idgpu == idsgpu.end()) {
+ std::cerr << "missing " << HcalDetId{cpuid} << std::endl;
+ continue;
+ }
+
+ assert(*iter2idgpu == cpuid);
+
+ auto const ptrdiff = iter2idgpu - idsgpu.begin();
+ auto const nsamples_gpu = hcal::compute_nsamples(f5HBProduct.stride);
+ auto const nsamples_cpu = qie8Product[0].size();
+ assert(static_cast(nsamples_cpu) == nsamples_gpu);
+
+ uint32_t offset = ptrdiff * f5HBProduct.stride;
+ uint16_t const* df_start = datagpu.data() + offset;
+ for (uint32_t sample = 0u; sample < nsamples_gpu; sample++) {
+ auto const cpuadc = cpudf.sample(sample).adc();
+ auto const gpuadc = hcal::adc_for_sample(df_start, sample);
+ auto const cpucapid = cpudf.sample(sample).capid();
+ auto const gpucapid = hcal::capid_for_sample(df_start, sample);
+
+ hADCf5HBGPU->Fill(gpuadc);
+ hADCf5HBCPU->Fill(cpuadc);
+ hADCf5HBGPUvsCPU->Fill(cpuadc, gpuadc);
+
+ // the must for us at RAW Decoding stage
+ assert(static_cast(cpuadc) == gpuadc);
+ assert(static_cast(cpucapid) == gpucapid);
+ }
+ }
+ }
+ }
+
+ {
+ TCanvas c{"plots", "plots", 4200, 6200};
+ c.Divide(3, 3);
+ c.cd(1);
+ {
+ gPad->SetLogy();
+ hADCf01HECPU->SetLineColor(kBlack);
+ hADCf01HECPU->SetLineWidth(1.);
+ hADCf01HECPU->Draw("");
+ hADCf01HEGPU->SetLineColor(kBlue);
+ hADCf01HEGPU->SetLineWidth(1.);
+ hADCf01HEGPU->Draw("sames");
+ gPad->Update();
+ auto stats = (TPaveStats*)hADCf01HEGPU->FindObject("stats");
+ auto y2 = stats->GetY2NDC();
+ auto y1 = stats->GetY1NDC();
+ stats->SetY2NDC(y1);
+ stats->SetY1NDC(y1 - (y2 - y1));
+ }
+ c.cd(2);
+ {
+ gPad->SetLogy();
+ hADCf5HBCPU->SetLineColor(kBlack);
+ hADCf5HBCPU->SetLineWidth(1.);
+ hADCf5HBCPU->Draw("");
+ hADCf5HBGPU->SetLineColor(kBlue);
+ hADCf5HBGPU->SetLineWidth(1.);
+ hADCf5HBGPU->Draw("sames");
+ gPad->Update();
+ auto stats = (TPaveStats*)hADCf5HBGPU->FindObject("stats");
+ auto y2 = stats->GetY2NDC();
+ auto y1 = stats->GetY1NDC();
+ stats->SetY2NDC(y1);
+ stats->SetY1NDC(y1 - (y2 - y1));
+ }
+ c.cd(3);
+ {
+ gPad->SetLogy();
+ hADCf3HBCPU->SetLineColor(kBlack);
+ hADCf3HBCPU->SetLineWidth(1.);
+ hADCf3HBCPU->Draw("");
+ hADCf3HBGPU->SetLineColor(kBlue);
+ hADCf3HBGPU->SetLineWidth(1.);
+ hADCf3HBGPU->Draw("sames");
+ gPad->Update();
+ auto stats = (TPaveStats*)hADCf3HBGPU->FindObject("stats");
+ auto y2 = stats->GetY2NDC();
+ auto y1 = stats->GetY1NDC();
+ stats->SetY2NDC(y1);
+ stats->SetY1NDC(y1 - (y2 - y1));
+ }
+ c.cd(4);
+ hADCf01HEGPUvsCPU->Draw("colz");
+ c.cd(5);
+ hADCf5HBGPUvsCPU->Draw("colz");
+ c.cd(6);
+ hADCf3HBGPUvsCPU->Draw("colz");
+ c.cd(7);
+ {
+ gPad->SetLogy();
+ hTDCf01HECPU->SetLineColor(kBlack);
+ hTDCf01HECPU->SetLineWidth(1.);
+ hTDCf01HECPU->Draw("");
+ hTDCf01HEGPU->SetLineColor(kBlue);
+ hTDCf01HEGPU->SetLineWidth(1.);
+ hTDCf01HEGPU->Draw("sames");
+ gPad->Update();
+ auto stats = (TPaveStats*)hTDCf01HEGPU->FindObject("stats");
+ auto y2 = stats->GetY2NDC();
+ auto y1 = stats->GetY1NDC();
+ stats->SetY2NDC(y1);
+ stats->SetY1NDC(y1 - (y2 - y1));
+ }
+ c.cd(8);
+ hTDCf01HEGPUvsCPU->Draw("colz");
+ c.cd(9);
+ hTDCf3HBGPUvsCPU->Draw("colz");
+
+ c.SaveAs("plots.pdf");
+ }
+
+ rfin.Close();
+ rfout.Write();
+ rfout.Close();
+}
diff --git a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
index ccf6a061119c2..3077a68a665e4 100644
--- a/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/HcalRawToDigi/plugins/BuildFile.xml
@@ -1,16 +1,27 @@
+
+
+
+
+
-
-
-
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
new file mode 100644
index 0000000000000..9903b77efb341
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h
@@ -0,0 +1,86 @@
+#ifndef EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
+#define EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
+
+#include
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+#include "ElectronicsMappingGPU.h"
+
+namespace hcal {
+ namespace raw {
+
+ constexpr int32_t empty_event_size = 32;
+ constexpr uint32_t utca_nfeds_max = 50;
+ constexpr uint32_t nbytes_per_fed_max = 10 * 1024;
+
+ // each collection corresponds to a particular flavor with a certain number of
+ // samples per digi
+ constexpr uint32_t numOutputCollections = 3;
+ constexpr uint8_t OutputF01HE = 0;
+ constexpr uint8_t OutputF5HB = 1;
+ constexpr uint8_t OutputF3HB = 2;
+
+ struct ConfigurationParameters {
+ uint32_t maxChannelsF01HE;
+ uint32_t maxChannelsF5HB;
+ uint32_t maxChannelsF3HB;
+ uint32_t nsamplesF01HE;
+ uint32_t nsamplesF5HB;
+ uint32_t nsamplesF3HB;
+ };
+
+ struct InputDataCPU {
+ cms::cuda::host::unique_ptr data;
+ cms::cuda::host::unique_ptr offsets;
+ cms::cuda::host::unique_ptr feds;
+ };
+
+ struct OutputDataCPU {
+ cms::cuda::host::unique_ptr nchannels;
+ };
+
+ struct ScratchDataGPU {
+ // depends on the number of output collections
+ // that is a statically known predefined number
+ cms::cuda::device::unique_ptr pChannelsCounters;
+ };
+
+ struct OutputDataGPU {
+ DigiCollection digisF01HE;
+ DigiCollection digisF5HB;
+ DigiCollection digisF3HB;
+
+ void allocate(ConfigurationParameters const &config, cudaStream_t cudaStream) {
+ digisF01HE.data = cms::cuda::make_device_unique(
+ config.maxChannelsF01HE * compute_stride(config.nsamplesF01HE), cudaStream);
+ digisF01HE.ids = cms::cuda::make_device_unique(config.maxChannelsF01HE, cudaStream);
+
+ digisF5HB.data = cms::cuda::make_device_unique(
+ config.maxChannelsF5HB * compute_stride(config.nsamplesF5HB), cudaStream);
+ digisF5HB.ids = cms::cuda::make_device_unique(config.maxChannelsF5HB, cudaStream);
+ digisF5HB.npresamples = cms::cuda::make_device_unique(config.maxChannelsF5HB, cudaStream);
+
+ digisF3HB.data = cms::cuda::make_device_unique(
+ config.maxChannelsF3HB * compute_stride(config.nsamplesF3HB), cudaStream);
+ digisF3HB.ids = cms::cuda::make_device_unique(config.maxChannelsF3HB, cudaStream);
+ }
+ };
+
+ struct InputDataGPU {
+ cms::cuda::device::unique_ptr data;
+ cms::cuda::device::unique_ptr offsets;
+ cms::cuda::device::unique_ptr feds;
+ };
+
+ struct ConditionsProducts {
+ ElectronicsMappingGPU::Product const &eMappingProduct;
+ };
+
+ } // namespace raw
+} // namespace hcal
+
+#endif // EventFilter_HcalRawToDigi_interface_DeclsForKernels_h
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
new file mode 100644
index 0000000000000..4f2ca85861b30
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu
@@ -0,0 +1,593 @@
+#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
+#include "DataFormats/HcalDetId/interface/HcalSubdetector.h"
+#include "DataFormats/HcalDetId/interface/HcalDetId.h"
+
+#include "EventFilter/HcalRawToDigi/plugins/DecodeGPU.h"
+
+#include
+using namespace cooperative_groups;
+
+namespace hcal {
+ namespace raw {
+
+ __forceinline__ __device__ char const* get_subdet_str(DetId const& did) {
+ switch (did.subdetId()) {
+ case HcalEmpty:
+ return "HcalEmpty";
+ break;
+ case HcalBarrel:
+ return "HcalBarrel";
+ break;
+ case HcalEndcap:
+ return "HcalEndcap";
+ break;
+ case HcalOuter:
+ return "HcalOuter";
+ break;
+ case HcalForward:
+ return "HcalForward";
+ break;
+ case HcalTriggerTower:
+ return "HcalTriggerTower";
+ break;
+ case HcalOther:
+ return "HcalOther";
+ break;
+ default:
+ return "Unknown";
+ break;
+ }
+
+ return "Unknown";
+ }
+
+ __forceinline__ __device__ bool is_channel_header_word(uint16_t const* ptr) {
+ uint8_t bit = (*ptr >> 15) & 0x1;
+ return bit == 1;
+ }
+
+ template
+ constexpr bool is_power_of_two(T x) {
+ return (x != 0) && ((x & (x - 1)) == 0);
+ }
+
+ template
+ __global__ void kernel_rawdecode_test(unsigned char const* data,
+ uint32_t const* offsets,
+ int const* feds,
+ uint32_t const* eid2did,
+ uint32_t const* eid2tid,
+ uint16_t* digisF01HE,
+ uint32_t* idsF01HE,
+ uint16_t* digisF5HB,
+ uint32_t* idsF5HB,
+ uint8_t* npresamplesF5HB,
+ uint16_t* digisF3HB,
+ uint32_t* idsF3HB,
+ uint32_t* pChannelsCounters,
+ uint32_t const nsamplesF01HE,
+ uint32_t const nsamplesF5HB,
+ uint32_t const nsamplesF3HB,
+ uint32_t const nBytesTotal) {
+ // in order to properly use cooperative groups
+ static_assert(is_power_of_two(NTHREADS) == true && NTHREADS <= 32);
+
+ thread_block_tile thread_group = tiled_partition(this_thread_block());
+
+ auto const iamc = threadIdx.x / NTHREADS;
+ auto const ifed = blockIdx.x;
+ auto const offset = offsets[ifed];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+ if (ifed > 0 || iamc > 0)
+ return;
+ printf("threadIdx.x = %d rank = %d iamc = %d\n", threadIdx.x, thread_group.thread_rank(), iamc);
+#endif
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ auto const fed = feds[ifed];
+ auto const size = ifed == gridDim.x - 1 ? nBytesTotal - offset : offsets[ifed + 1] - offset;
+ printf("ifed = %d fed = %d offset = %u size = %u\n", ifed, fed, offset, size);
+#endif
+
+ // offset to the right raw buffer
+ uint64_t const* buffer = reinterpret_cast(data + offset);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ //
+ // fed header
+ //
+ auto const fed_header = buffer[0];
+ uint32_t const fed_id = (fed_header >> 8) & 0xfff;
+ uint32_t const bx = (fed_header >> 20) & 0xfff;
+ uint32_t const lv1 = (fed_header >> 32) & 0xffffff;
+ uint8_t const trigger_type = (fed_header >> 56) & 0xf;
+ uint8_t const bid_fed_header = (fed_header >> 60) & 0xf;
+
+ printf("fed = %d fed_id = %u bx = %u lv1 = %u trigger_type = %u bid = %u\n",
+ fed,
+ fed_id,
+ bx,
+ lv1,
+ trigger_type,
+ bid_fed_header);
+#endif
+
+ // amc 13 header
+ auto const amc13word = buffer[1];
+ uint8_t const namc = (amc13word >> 52) & 0xf;
+ if (iamc >= namc)
+ return;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ uint8_t const amc13version = (amc13word >> 60) & 0xf;
+ uint32_t const amc13OrbitNumber = (amc13word >> 4) & 0xffffffffu;
+ printf("fed = %d namc = %u amc13version = %u amc13OrbitNumber = %u\n", fed, namc, amc13version, amc13OrbitNumber);
+#endif
+
+ // compute the offset int to the right buffer
+ uint32_t amcoffset = 0;
+ for (uint8_t ii = 0u; ii < iamc; ii++) {
+ auto const word = buffer[2 + ii];
+ int const amcSize = (word >> 32) & 0xffffff;
+ amcoffset += amcSize;
+ }
+
+ auto const word = buffer[2 + iamc];
+ int const amcSize = (word >> 32) & 0xffffff;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ uint16_t const amcid = word & 0xffff;
+ int const slot = (word >> 16) & 0xf;
+ int const amcBlockNumber = (word >> 20) & 0xff;
+ printf("fed = %d amcid = %u slot = %d amcBlockNumber = %d\n", fed, amcid, slot, amcBlockNumber);
+
+ bool const amcmore = ((word >> 61) & 0x1) != 0;
+ bool const amcSegmented = ((word >> 60) & 0x1) != 0;
+ bool const amcLengthOk = ((word >> 62) & 0x1) != 0;
+ bool const amcCROk = ((word >> 56) & 0x1) != 0;
+ bool const amcDataPresent = ((word >> 58) & 0x1) != 0;
+ bool const amcDataValid = ((word >> 56) & 0x1) != 0;
+ bool const amcEnabled = ((word >> 59) & 0x1) != 0;
+ printf(
+ "fed = %d amcmore = %d amcSegmented = %d, amcLengthOk = %d amcCROk = %d\n>> amcDataPresent = %d amcDataValid "
+ "= %d amcEnabled = %d\n",
+ fed,
+ static_cast(amcmore),
+ static_cast(amcSegmented),
+ static_cast(amcLengthOk),
+ static_cast(amcCROk),
+ static_cast(amcDataPresent),
+ static_cast(amcDataValid),
+ static_cast(amcEnabled));
+#endif
+
+ // get to the payload
+ auto const* payload64 = buffer + 2 + namc + amcoffset;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ // uhtr header v1 1st 64 bits
+ auto const payload64_w0 = payload64[0];
+#endif
+ // uhtr n bytes comes from amcSize, according to the cpu version!
+ uint32_t const data_length64 = amcSize;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ uint16_t bcn = (payload64_w0 >> 20) & 0xfff;
+ uint32_t evn = (payload64_w0 >> 32) & 0xffffff;
+ printf("fed = %d data_length64 = %u bcn = %u evn = %u\n", fed, data_length64, bcn, evn);
+#endif
+
+ // uhtr header v1 2nd 64 bits
+ auto const payload64_w1 = payload64[1];
+ uint8_t const uhtrcrate = payload64_w1 & 0xff;
+ uint8_t const uhtrslot = (payload64_w1 >> 8) & 0xf;
+ uint8_t const presamples = (payload64_w1 >> 12) & 0xf;
+ uint8_t const payloadFormat = (payload64_w1 >> 44) & 0xf;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ uint16_t const orbitN = (payload64_w1 >> 16) & 0xffff;
+ uint8_t const firmFlavor = (payload64_w1 >> 32) & 0xff;
+ uint8_t const eventType = (payload64_w1 >> 40) & 0xf;
+ printf(
+ "fed = %d crate = %u slot = %u presamples = %u\n>>> orbitN = %u firmFlavor = %u eventType = %u payloadFormat "
+ "= %u\n",
+ fed,
+ uhtrcrate,
+ uhtrslot,
+ presamples,
+ orbitN,
+ firmFlavor,
+ eventType,
+ payloadFormat);
+#endif
+
+ // this should be filtering out uMNio...
+ if (payloadFormat != 1)
+ return;
+
+ // skip uhtr header words
+ auto const channelDataSize = data_length64 - 2; // 2 uhtr header v1 words
+ auto const* channelDataBuffer64Start = payload64 + 2; // 2 uhtr header v2 wds
+ auto const* ptr = reinterpret_cast(channelDataBuffer64Start);
+ auto const* end = ptr + sizeof(uint64_t) / sizeof(uint16_t) * (channelDataSize - 1);
+ auto const t_rank = thread_group.thread_rank();
+
+ // iterate through the channel data
+ while (ptr != end) {
+ // this is the starting point for this thread group for this iteration
+ // with respect to this pointer every thread will move forward afterwards
+ auto const* const start_ptr = ptr;
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+ thread_group.sync();
+#endif
+
+ // skip to the header word of the right channel for this thread
+ int counter = 0;
+ while (counter < thread_group.thread_rank()) {
+ // just a check for threads that land beyond the end
+ if (ptr == end)
+ break;
+
+ // move ptr one forward past header
+ if (is_channel_header_word(ptr))
+ ++ptr;
+ else {
+ // go to the next channel and do not consider this guy as a channel
+ while (ptr != end)
+ if (!is_channel_header_word(ptr))
+ ++ptr;
+ else
+ break;
+ continue;
+ }
+
+ // skip
+ while (ptr != end)
+ if (!is_channel_header_word(ptr))
+ ++ptr;
+ else
+ break;
+ counter++;
+ }
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+ thread_group.sync();
+ printf("ptr - start_ptr = %d counter = %d rank = %d\n", static_cast(ptr - start_ptr), counter, t_rank);
+#endif
+
+ // when the end is near, channels will land outside of the [start_ptr, end) region
+ if (ptr != end) {
+ // for all of the flavors, these 2 guys have the same bit layout
+ uint8_t const flavor = (ptr[0] >> 12) & 0x7;
+ uint8_t const channelid = ptr[0] & 0xff;
+ auto const* const new_channel_start = ptr;
+
+ // flavor dependent stuff
+ switch (flavor) {
+ case 0:
+ case 1: {
+ // treat eid and did
+ uint8_t fiber = (channelid >> 3) & 0x1f;
+ uint8_t fchannel = channelid & 0x7;
+ HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+ auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+ eid.rawId(),
+ eid.linearIndex(),
+ did.rawId(),
+ get_subdet_str(did));
+ printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+ flavor,
+ uhtrcrate,
+ uhtrslot,
+ channelid,
+ fiber,
+ fchannel);
+#endif
+
+ // remove digis not for HE
+ if (did.subdetId() != HcalEndcap)
+ break;
+
+ // count words
+ auto const* channel_header_word = ptr++;
+ while (!is_channel_header_word(ptr) && ptr != end)
+ ++ptr;
+ auto const* channel_end = ptr; // set ptr
+ uint32_t const nwords = channel_end - channel_header_word;
+
+ // filter out this digi if nwords does not equal expected
+ auto const expected_words = compute_stride(nsamplesF01HE);
+ if (nwords != expected_words)
+ break;
+
+ // inc the number of digis of this type
+ auto const pos = atomicAdd(&pChannelsCounters[OutputF01HE], 1);
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+ printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos);
+#endif
+
+ // store to global mem words for this digi
+ idsF01HE[pos] = did.rawId();
+
+ for (uint32_t iword = 0; iword < expected_words; iword++)
+ digisF01HE[pos * expected_words + iword] = channel_header_word[iword];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("nwords = %u\n", nwords);
+#endif
+
+ break;
+ }
+ case 3: {
+ // treat eid and did
+ uint8_t fiber = (channelid >> 3) & 0x1f;
+ uint8_t fchannel = channelid & 0x7;
+ HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+ auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+ eid.rawId(),
+ eid.linearIndex(),
+ did.rawId(),
+ get_subdet_str(did));
+ printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+ flavor,
+ uhtrcrate,
+ uhtrslot,
+ channelid,
+ fiber,
+ fchannel);
+#endif
+
+ // remove digis not for HE
+ if (did.subdetId() != HcalBarrel)
+ break;
+
+ // count words
+ auto const* channel_header_word = ptr++;
+ while (!is_channel_header_word(ptr) && ptr != end)
+ ++ptr;
+ auto const* channel_end = ptr; // set ptr
+ uint32_t const nwords = channel_end - channel_header_word;
+
+ // filter out this digi if nwords does not equal expected
+ auto const expected_words = compute_stride(nsamplesF3HB);
+ if (nwords != expected_words)
+ break;
+
+ // inc the number of digis of this type
+ auto const pos = atomicAdd(&pChannelsCounters[OutputF3HB], 1);
+
+ // store to global mem words for this digi
+ idsF3HB[pos] = did.rawId();
+ for (uint32_t iword = 0; iword < expected_words; iword++)
+ digisF3HB[pos * expected_words + iword] = channel_header_word[iword];
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("nwords = %u\n", nwords);
+#endif
+
+ break;
+ }
+ case 2: {
+ uint8_t fiber = (channelid >> 3) & 0x1f;
+ uint8_t fchannel = channelid & 0x7;
+ HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+ auto const did = DetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+ eid.rawId(),
+ eid.linearIndex(),
+ did.rawId(),
+ get_subdet_str(did));
+ printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+ flavor,
+ uhtrcrate,
+ uhtrslot,
+ channelid,
+ fiber,
+ fchannel);
+#endif
+
+ break;
+ }
+ case 4: {
+ uint8_t link = (channelid >> 4) & 0xf;
+ uint8_t tower = channelid & 0xf;
+ HcalElectronicsId eid{uhtrcrate, uhtrslot, link, tower, true};
+ auto const did = DetId{eid2tid[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+ eid.rawId(),
+ eid.linearIndex(),
+ did.rawId(),
+ get_subdet_str(did));
+ printf("flavor = %u crate = %u slot = %u channelid = %u link = %u tower = %u\n",
+ flavor,
+ uhtrcrate,
+ uhtrslot,
+ channelid,
+ link,
+ tower);
+#endif
+
+ break;
+ }
+ case 5: {
+ uint8_t fiber = (channelid >> 2) & 0x3f;
+ uint8_t fchannel = channelid & 0x3;
+ HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+ auto const did = HcalDetId{eid2did[eid.linearIndex()]};
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("erawId = %u linearIndex = %u drawid = %u subdet = %s\n",
+ eid.rawId(),
+ eid.linearIndex(),
+ did.rawId(),
+ get_subdet_str(did));
+ printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+ flavor,
+ uhtrcrate,
+ uhtrslot,
+ channelid,
+ fiber,
+ fchannel);
+#endif
+
+ // remove digis not for HB
+ if (did.subdetId() != HcalBarrel)
+ break;
+
+ // count words
+ auto const* channel_header_word = ptr++;
+ while (!is_channel_header_word(ptr) && ptr != end)
+ ++ptr;
+ auto const* channel_end = ptr; // set ptr
+ uint32_t const nwords = channel_end - channel_header_word;
+
+ // filter out this digi if nwords does not equal expected
+ auto const expected_words = compute_stride(nsamplesF5HB);
+ if (nwords != expected_words)
+ break;
+
+ // inc the number of digis of this type
+ auto const pos = atomicAdd(&pChannelsCounters[OutputF5HB], 1);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+ printf("rank = %d pos = %d\n", thread_group.thread_rank(), pos);
+#endif
+
+ // store to global mem words for this digi
+ idsF5HB[pos] = did.rawId();
+ npresamplesF5HB[pos] = presamples;
+ for (uint32_t iword = 0; iword < expected_words; iword++)
+ digisF5HB[pos * expected_words + iword] = channel_header_word[iword];
+
+ break;
+ }
+ case 7: {
+ uint8_t const fiber = (channelid >> 2) & 0x3f;
+ uint8_t const fchannel = channelid & 0x3;
+ HcalElectronicsId eid{uhtrcrate, uhtrslot, fiber, fchannel, false};
+ auto const did = DetId{eid2did[eid.linearIndex()]};
+
+ /* uncomment to check the linear index validity
+ if (eid.rawId() >= HcalElectronicsId::maxLinearIndex) {
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("*** rawid = %u has no known det id***\n", eid.rawId());
+#endif
+ break;
+ }
+ */
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("erawId = %u linearIndex = %u drawid = %u\n", eid.rawId(), eid.linearIndex(), did.rawId());
+ printf("flavor = %u crate = %u slot = %u channelid = %u fiber = %u fchannel = %u\n",
+ flavor,
+ uhtrcrate,
+ uhtrslot,
+ channelid,
+ fiber,
+ fchannel);
+#endif
+
+ break;
+ }
+ default:
+#ifdef HCAL_RAWDECODE_GPUDEBUG
+ printf("flavor = %u crate = %u slot = %u channelid = %u\n", flavor, uhtrcrate, uhtrslot, channelid);
+#endif
+ ;
+ }
+
+ // skip to the next word in case
+ // 1) current channel was not treated
+ // 2) we are in the middle of the digi words and not at the end
+ while (new_channel_start == ptr || !is_channel_header_word(ptr) && ptr != end)
+ ++ptr;
+ }
+
+ // thread with rank 31 of the group will have the ptr pointing to the
+ // header word of the next channel or the end
+ int const offset_to_shuffle = ptr - start_ptr;
+
+ // always receive from the last guy in the group
+ auto const offset_for_rank31 = thread_group.shfl(offset_to_shuffle, NTHREADS - 1);
+
+#ifdef HCAL_RAWDECODE_GPUDEBUG_CG
+ printf("rank = %d offset_to_shuffle = %d offset_for_rank32 = %d\n",
+ thread_group.thread_rank(),
+ offset_to_shuffle,
+ offset_for_rank31);
+#endif
+
+ // update the ptr for all threads of this group
+ // NOTE: relative to the start_ptr that is the same for all threads of
+ // this group
+ ptr = start_ptr + offset_for_rank31;
+ }
+ }
+
+ void entryPoint(InputDataCPU const& inputCPU,
+ InputDataGPU& inputGPU,
+ OutputDataGPU& outputGPU,
+ ScratchDataGPU& scratchGPU,
+ OutputDataCPU& outputCPU,
+ ConditionsProducts const& conditions,
+ ConfigurationParameters const& config,
+ cudaStream_t cudaStream,
+ uint32_t const nfedsWithData,
+ uint32_t const nbytesTotal) {
+ // transfer
+ cudaCheck(cudaMemcpyAsync(inputGPU.data.get(),
+ inputCPU.data.get(),
+ nbytesTotal * sizeof(unsigned char),
+ cudaMemcpyHostToDevice,
+ cudaStream));
+ cudaCheck(cudaMemcpyAsync(inputGPU.offsets.get(),
+ inputCPU.offsets.get(),
+ nfedsWithData * sizeof(uint32_t),
+ cudaMemcpyHostToDevice,
+ cudaStream));
+ cudaCheck(
+ cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream));
+ cudaCheck(cudaMemcpyAsync(
+ inputGPU.feds.get(), inputCPU.feds.get(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream));
+
+ // 12 is the max number of modules per crate
+ kernel_rawdecode_test<32><<>>(inputGPU.data.get(),
+ inputGPU.offsets.get(),
+ inputGPU.feds.get(),
+ conditions.eMappingProduct.eid2did,
+ conditions.eMappingProduct.eid2tid,
+ outputGPU.digisF01HE.data.get(),
+ outputGPU.digisF01HE.ids.get(),
+ outputGPU.digisF5HB.data.get(),
+ outputGPU.digisF5HB.ids.get(),
+ outputGPU.digisF5HB.npresamples.get(),
+ outputGPU.digisF3HB.data.get(),
+ outputGPU.digisF3HB.ids.get(),
+ scratchGPU.pChannelsCounters.get(),
+ config.nsamplesF01HE,
+ config.nsamplesF5HB,
+ config.nsamplesF3HB,
+ nbytesTotal);
+ cudaCheck(cudaGetLastError());
+
+ cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.get(),
+ scratchGPU.pChannelsCounters.get(),
+ sizeof(uint32_t) * numOutputCollections,
+ cudaMemcpyDeviceToHost,
+ cudaStream));
+ }
+
+ } // namespace raw
+} // namespace hcal
diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
new file mode 100644
index 0000000000000..3d5e4eec32269
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.h
@@ -0,0 +1,23 @@
+#ifndef EventFilter_HcalRawToDigi_interface_DecodeGPU_h
+#define EventFilter_HcalRawToDigi_interface_DecodeGPU_h
+
+#include "DeclsForKernels.h"
+
+namespace hcal {
+ namespace raw {
+
+ void entryPoint(InputDataCPU const&,
+ InputDataGPU&,
+ OutputDataGPU&,
+ ScratchDataGPU&,
+ OutputDataCPU&,
+ ConditionsProducts const&,
+ ConfigurationParameters const&,
+ cudaStream_t cudaStream,
+ uint32_t const,
+ uint32_t const);
+
+ }
+} // namespace hcal
+
+#endif // EventFilter_HcalRawToDigi_interface_DecodeGPU_h
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
new file mode 100644
index 0000000000000..6b7b89cc6ea77
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.cc
@@ -0,0 +1,63 @@
+#include "DataFormats/HcalDetId/interface/HcalElectronicsId.h"
+#include "FWCore/Utilities/interface/typelookup.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "ElectronicsMappingGPU.h"
+
+namespace hcal {
+ namespace raw {
+
+ // TODO: 0x3FFFFF * 4B ~= 16MB
+ // tmp solution for linear mapping of eid -> did
+ ElectronicsMappingGPU::ElectronicsMappingGPU(HcalElectronicsMap const& mapping)
+ : eid2tid_(HcalElectronicsId::maxLinearIndex, 0u), eid2did_(HcalElectronicsId::maxLinearIndex, 0u) {
+ auto const& eidsPrecision = mapping.allElectronicsIdPrecision();
+ for (uint32_t i = 0; i < eidsPrecision.size(); ++i) {
+ auto const& eid = eidsPrecision[i];
+
+ // assign
+ eid2did_[eid.linearIndex()] = eid.isTriggerChainId() ? 0u : mapping.lookup(eid).rawId();
+ }
+
+ auto const& eidsTrigger = mapping.allElectronicsIdTrigger();
+ for (uint32_t i = 0; i < eidsTrigger.size(); i++) {
+ auto const& eid = eidsTrigger[i];
+
+ // assign
+ eid2tid_[eid.linearIndex()] = eid.isTriggerChainId() ? mapping.lookupTrigger(eid).rawId() : 0u;
+ }
+ }
+
+ ElectronicsMappingGPU::Product::~Product() {
+ // deallocation
+ cudaCheck(cudaFree(eid2did));
+ cudaCheck(cudaFree(eid2tid));
+ }
+
+ ElectronicsMappingGPU::Product const& ElectronicsMappingGPU::getProduct(cudaStream_t cudaStream) const {
+ auto const& product = product_.dataForCurrentDeviceAsync(
+ cudaStream, [this](ElectronicsMappingGPU::Product& product, cudaStream_t cudaStream) {
+ // malloc
+ cudaCheck(cudaMalloc((void**)&product.eid2did, this->eid2did_.size() * sizeof(uint32_t)));
+ cudaCheck(cudaMalloc((void**)&product.eid2tid, this->eid2tid_.size() * sizeof(uint32_t)));
+
+ // transfer
+ cudaCheck(cudaMemcpyAsync(product.eid2did,
+ this->eid2did_.data(),
+ this->eid2did_.size() * sizeof(uint32_t),
+ cudaMemcpyHostToDevice,
+ cudaStream));
+ cudaCheck(cudaMemcpyAsync(product.eid2tid,
+ this->eid2tid_.data(),
+ this->eid2tid_.size() * sizeof(uint32_t),
+ cudaMemcpyHostToDevice,
+ cudaStream));
+ });
+
+ return product;
+ }
+
+ } // namespace raw
+} // namespace hcal
+
+TYPELOOKUP_DATA_REG(hcal::raw::ElectronicsMappingGPU);
diff --git a/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
new file mode 100644
index 0000000000000..0f4c12f02a92d
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/ElectronicsMappingGPU.h
@@ -0,0 +1,48 @@
+#ifndef EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
+#define EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
+
+#include "CondFormats/HcalObjects/interface/HcalElectronicsMap.h"
+
+#ifndef __CUDACC__
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#endif
+
+namespace hcal {
+ namespace raw {
+
+ class ElectronicsMappingGPU {
+ public:
+ struct Product {
+ ~Product();
+ // trigger
+ uint32_t *eid2tid;
+ // detector
+ uint32_t *eid2did;
+ };
+
+#ifndef __CUDACC__
+
+ // rearrange pedestals
+ ElectronicsMappingGPU(HcalElectronicsMap const &);
+
+ // will call dealloation for Product thru ~Product
+ ~ElectronicsMappingGPU() = default;
+
+ // get device pointers
+ Product const &getProduct(cudaStream_t) const;
+
+ private:
+ // in the future, we need to arrange so to avoid this copy on the host
+ // if possible
+ std::vector> eid2tid_;
+ std::vector> eid2did_;
+
+ cms::cuda::ESProduct product_;
+#endif
+ };
+
+ } // namespace raw
+} // namespace hcal
+
+#endif // EventFilter_HcalRawToDigi_plugins_ElectronicsMappingGPU_h
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
new file mode 100644
index 0000000000000..c2b67a10afaff
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalCPUDigisProducer.cc
@@ -0,0 +1,117 @@
+#include
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+class HcalCPUDigisProducer : public edm::stream::EDProducer {
+public:
+ explicit HcalCPUDigisProducer(edm::ParameterSet const& ps);
+ ~HcalCPUDigisProducer() override;
+ static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+ void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+ void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+ using IProductTypef01 = cms::cuda::Product>;
+ edm::EDGetTokenT digisF01HETokenIn_;
+ using IProductTypef5 = cms::cuda::Product>;
+ edm::EDGetTokenT digisF5HBTokenIn_;
+ using IProductTypef3 = cms::cuda::Product>;
+ edm::EDGetTokenT digisF3HBTokenIn_;
+
+ using OProductTypef01 =
+ hcal::DigiCollection>;
+ edm::EDPutTokenT digisF01HETokenOut_;
+ using OProductTypef5 =
+ hcal::DigiCollection>;
+ edm::EDPutTokenT digisF5HBTokenOut_;
+ using OProductTypef3 =
+ hcal::DigiCollection>;
+ edm::EDPutTokenT digisF3HBTokenOut_;
+
+ // needed to pass data from acquire to produce
+ OProductTypef01 digisf01HE_;
+ OProductTypef5 digisf5HB_;
+ OProductTypef3 digisf3HB_;
+};
+
+void HcalCPUDigisProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("digisLabelF01HEIn", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"});
+ desc.add("digisLabelF5HBIn", edm::InputTag{"hcalRawToDigiGPU", "f5HBDigisGPU"});
+ desc.add("digisLabelF3HBIn", edm::InputTag{"hcalRawToDigiGPU", "f3HBDigisGPU"});
+ desc.add("digisLabelF01HEOut", "f01HEDigis");
+ desc.add("digisLabelF5HBOut", "f5HBDigis");
+ desc.add("digisLabelF3HBOut", "f3HBDigis");
+
+ confDesc.addWithDefaultLabel(desc);
+}
+
+HcalCPUDigisProducer::HcalCPUDigisProducer(const edm::ParameterSet& ps)
+ : digisF01HETokenIn_{consumes(ps.getParameter("digisLabelF01HEIn"))},
+ digisF5HBTokenIn_{consumes(ps.getParameter("digisLabelF5HBIn"))},
+ digisF3HBTokenIn_{consumes(ps.getParameter("digisLabelF3HBIn"))},
+ digisF01HETokenOut_{produces(ps.getParameter("digisLabelF01HEOut"))},
+ digisF5HBTokenOut_{produces(ps.getParameter("digisLabelF5HBOut"))},
+ digisF3HBTokenOut_{produces(ps.getParameter("digisLabelF3HBOut"))} {}
+
+HcalCPUDigisProducer::~HcalCPUDigisProducer() {}
+
+void HcalCPUDigisProducer::acquire(edm::Event const& event,
+ edm::EventSetup const& setup,
+ edm::WaitingTaskWithArenaHolder taskHolder) {
+ // retrieve data/ctx
+ auto const& f01HEProduct = event.get(digisF01HETokenIn_);
+ auto const& f5HBProduct = event.get(digisF5HBTokenIn_);
+ auto const& f3HBProduct = event.get(digisF3HBTokenIn_);
+ cms::cuda::ScopedContextAcquire ctx{f01HEProduct, std::move(taskHolder)};
+ auto const& f01HEDigis = ctx.get(f01HEProduct);
+ auto const& f5HBDigis = ctx.get(f5HBProduct);
+ auto const& f3HBDigis = ctx.get(f3HBProduct);
+
+ // resize out tmp buffers
+ digisf01HE_.stride = f01HEDigis.stride;
+ digisf5HB_.stride = f5HBDigis.stride;
+ digisf3HB_.stride = f3HBDigis.stride;
+ digisf01HE_.resize(f01HEDigis.size);
+ digisf5HB_.resize(f5HBDigis.size);
+ digisf3HB_.resize(f3HBDigis.size);
+
+ auto lambdaToTransfer = [&ctx](auto& dest, auto* src) {
+ using vector_type = typename std::remove_reference::type;
+ using type = typename vector_type::value_type;
+ using src_data_type = typename std::remove_pointer::type;
+ static_assert(std::is_same::value && "Dest and Src data types do not match");
+ cudaCheck(cudaMemcpyAsync(dest.data(), src, dest.size() * sizeof(type), cudaMemcpyDeviceToHost, ctx.stream()));
+ };
+
+ lambdaToTransfer(digisf01HE_.data, f01HEDigis.data.get());
+ lambdaToTransfer(digisf01HE_.ids, f01HEDigis.ids.get());
+
+ lambdaToTransfer(digisf5HB_.data, f5HBDigis.data.get());
+ lambdaToTransfer(digisf5HB_.ids, f5HBDigis.ids.get());
+ lambdaToTransfer(digisf5HB_.npresamples, f5HBDigis.npresamples.get());
+
+ lambdaToTransfer(digisf3HB_.data, f3HBDigis.data.get());
+ lambdaToTransfer(digisf3HB_.ids, f3HBDigis.ids.get());
+}
+
+void HcalCPUDigisProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
+ event.emplace(digisF01HETokenOut_, std::move(digisf01HE_));
+ event.emplace(digisF5HBTokenOut_, std::move(digisf5HB_));
+ event.emplace(digisF3HBTokenOut_, std::move(digisf3HB_));
+}
+
+DEFINE_FWK_MODULE(HcalCPUDigisProducer);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
new file mode 100644
index 0000000000000..9ca33340f7036
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc
@@ -0,0 +1,235 @@
+#include
+
+#include "CUDADataFormats/HcalDigi/interface/DigiCollection.h"
+#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+class HcalDigisProducerGPU : public edm::stream::EDProducer {
+public:
+ explicit HcalDigisProducerGPU(edm::ParameterSet const& ps);
+ ~HcalDigisProducerGPU() override = default;
+ static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+ void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+ void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+ // input product tokens
+ edm::EDGetTokenT hbheDigiToken_;
+ edm::EDGetTokenT qie11DigiToken_;
+
+ // type aliases
+ using HostCollectionf01 =
+ hcal::DigiCollection>;
+ using DeviceCollectionf01 = hcal::DigiCollection;
+ using HostCollectionf5 =
+ hcal::DigiCollection>;
+ using DeviceCollectionf5 = hcal::DigiCollection;
+ using HostCollectionf3 =
+ hcal::DigiCollection>;
+ using DeviceCollectionf3 = hcal::DigiCollection;
+
+ // output product tokens
+ using ProductTypef01 = cms::cuda::Product;
+ edm::EDPutTokenT digisF01HEToken_;
+ using ProductTypef5 = cms::cuda::Product;
+ edm::EDPutTokenT digisF5HBToken_;
+ using ProductTypef3 = cms::cuda::Product;
+ edm::EDPutTokenT digisF3HBToken_;
+
+ cms::cuda::ContextState cudaState_;
+
+ struct ConfigParameters {
+ uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB;
+ };
+ ConfigParameters config_;
+
+ // per event host buffers
+ HostCollectionf01 hf01_;
+ HostCollectionf5 hf5_;
+ HostCollectionf3 hf3_;
+
+ // device products: product owns memory (i.e. not the module)
+ DeviceCollectionf01 df01_;
+ DeviceCollectionf5 df5_;
+ DeviceCollectionf3 df3_;
+};
+
+void HcalDigisProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+ edm::ParameterSetDescription desc;
+
+ // FIXME
+ desc.add("hbheDigisLabel", edm::InputTag("hcalDigis"));
+ desc.add("qie11DigiLabel", edm::InputTag("hcalDigis"));
+ desc.add("digisLabelF01HE", std::string{"f01HEDigisGPU"});
+ desc.add("digisLabelF5HB", std::string{"f5HBDigisGPU"});
+ desc.add("digisLabelF3HB", std::string{"f3HBDigisGPU"});
+ desc.add("maxChannelsF01HE", 10000u);
+ desc.add("maxChannelsF5HB", 10000u);
+ desc.add("maxChannelsF3HB", 10000u);
+
+ confDesc.addWithDefaultLabel(desc);
+}
+
+HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps)
+ : hbheDigiToken_{consumes(ps.getParameter("hbheDigisLabel"))},
+ qie11DigiToken_{consumes(ps.getParameter("qie11DigiLabel"))},
+ digisF01HEToken_{produces(ps.getParameter("digisLabelF01HE"))},
+ digisF5HBToken_{produces(ps.getParameter("digisLabelF5HB"))},
+ digisF3HBToken_{produces(ps.getParameter("digisLabelF3HB"))} {
+ config_.maxChannelsF01HE = ps.getParameter("maxChannelsF01HE");
+ config_.maxChannelsF5HB = ps.getParameter("maxChannelsF5HB");
+ config_.maxChannelsF3HB = ps.getParameter("maxChannelsF3HB");
+
+ // this is a preallocation for the max statically known number of time samples
+ // actual stride/nsamples will be inferred from data
+ hf01_.stride = hcal::compute_stride(QIE11DigiCollection::MAXSAMPLES);
+ hf5_.stride = hcal::compute_stride(HBHEDataFrame::MAXSAMPLES);
+ hf3_.stride = hcal::compute_stride(QIE11DigiCollection::MAXSAMPLES);
+
+ // preallocate pinned host memory only if CUDA is available
+ edm::Service cs;
+ if (cs and cs->enabled()) {
+ hf01_.reserve(config_.maxChannelsF01HE);
+ hf5_.reserve(config_.maxChannelsF5HB);
+ hf3_.reserve(config_.maxChannelsF3HB);
+ }
+}
+
+void HcalDigisProducerGPU::acquire(edm::Event const& event,
+ edm::EventSetup const& setup,
+ edm::WaitingTaskWithArenaHolder holder) {
+ // raii
+ cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
+
+ // clear host buffers
+ hf01_.clear();
+ hf5_.clear();
+ hf3_.clear();
+
+ // event data
+ edm::Handle hbheDigis;
+ edm::Handle qie11Digis;
+ event.getByToken(hbheDigiToken_, hbheDigis);
+ event.getByToken(qie11DigiToken_, qie11Digis);
+
+ // init f5 collection
+ if (not hbheDigis->empty()) {
+ auto const nsamples = (*hbheDigis)[0].size();
+ auto const stride = hcal::compute_stride(nsamples);
+ hf5_.stride = stride;
+
+ // flavor5 get device blobs
+ df5_.stride = stride;
+ df5_.data = cms::cuda::make_device_unique(config_.maxChannelsF5HB * stride, ctx.stream());
+ df5_.ids = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream());
+ df5_.npresamples = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream());
+ }
+
+ if (not qie11Digis->empty()) {
+ auto const nsamples = qie11Digis->samples();
+ auto const stride01 = hcal::compute_stride(nsamples);
+ auto const stride3 = hcal::compute_stride(nsamples);
+
+ hf01_.stride = stride01;
+ hf3_.stride = stride3;
+
+ // flavor 0/1 get devie blobs
+ df01_.stride = stride01;
+ df01_.data = cms::cuda::make_device_unique(config_.maxChannelsF01HE * stride01, ctx.stream());
+ df01_.ids = cms::cuda::make_device_unique(config_.maxChannelsF01HE, ctx.stream());
+
+ // flavor3 get device blobs
+ df3_.stride = stride3;
+ df3_.data = cms::cuda::make_device_unique(config_.maxChannelsF3HB * stride3, ctx.stream());
+ df3_.ids = cms::cuda::make_device_unique(config_.maxChannelsF3HB, ctx.stream());
+ }
+
+ for (auto const& hbhe : *hbheDigis) {
+ auto const id = hbhe.id().rawId();
+ auto const presamples = hbhe.presamples();
+ hf5_.ids.push_back(id);
+ hf5_.npresamples.push_back(presamples);
+ auto const stride = hcal::compute_stride(hbhe.size());
+ assert(stride == hf5_.stride && "strides must be the same for every single digi of the collection");
+ // simple for now...
+ static_assert(hcal::Flavor5::HEADER_WORDS == 1);
+ uint16_t header_word = (1 << 15) | (0x5 << 12) | (0 << 10) | ((hbhe.sample(0).capid() & 0x3) << 8);
+ hf5_.data.push_back(header_word);
+ for (unsigned int i = 0; i < stride - hcal::Flavor5::HEADER_WORDS; i++) {
+ uint16_t s0 = (0 << 7) | (static_cast(hbhe.sample(2 * i).adc()) & 0x7f);
+ uint16_t s1 = (0 << 7) | (static_cast(hbhe.sample(2 * i + 1).adc()) & 0x7f);
+ uint16_t sample = (s1 << 8) | s0;
+ hf5_.data.push_back(sample);
+ }
+ }
+
+ for (unsigned int i = 0; i < qie11Digis->size(); i++) {
+ auto const& digi = QIE11DataFrame{(*qie11Digis)[i]};
+ assert(digi.samples() == qie11Digis->samples() && "collection nsamples must equal per digi samples");
+ if (digi.flavor() == 0 or digi.flavor() == 1) {
+ if (digi.detid().subdetId() != HcalEndcap)
+ continue;
+ auto const id = digi.detid().rawId();
+ hf01_.ids.push_back(id);
+ for (int hw = 0; hw < hcal::Flavor1::HEADER_WORDS; hw++)
+ hf01_.data.push_back((*qie11Digis)[i][hw]);
+ for (int sample = 0; sample < digi.samples(); sample++) {
+ hf01_.data.push_back((*qie11Digis)[i][hcal::Flavor1::HEADER_WORDS + sample]);
+ }
+ } else if (digi.flavor() == 3) {
+ if (digi.detid().subdetId() != HcalBarrel)
+ continue;
+ auto const id = digi.detid().rawId();
+ hf3_.ids.push_back(id);
+ for (int hw = 0; hw < hcal::Flavor3::HEADER_WORDS; hw++)
+ hf3_.data.push_back((*qie11Digis)[i][hw]);
+ for (int sample = 0; sample < digi.samples(); sample++) {
+ hf3_.data.push_back((*qie11Digis)[i][hcal::Flavor3::HEADER_WORDS + sample]);
+ }
+ }
+ }
+
+ auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) {
+ if (src.empty())
+ return;
+ using vector_type = typename std::remove_reference::type;
+ using type = typename vector_type::value_type;
+ using dest_data_type = typename std::remove_pointer::type;
+ static_assert(std::is_same::value && "Dest and Src data typesdo not match");
+ cudaCheck(cudaMemcpyAsync(dest, src.data(), src.size() * sizeof(type), cudaMemcpyHostToDevice, ctx.stream()));
+ };
+
+ lambdaToTransfer(df01_.data.get(), hf01_.data);
+ lambdaToTransfer(df01_.ids.get(), hf01_.ids);
+
+ lambdaToTransfer(df5_.data.get(), hf5_.data);
+ lambdaToTransfer(df5_.ids.get(), hf5_.ids);
+ lambdaToTransfer(df5_.npresamples.get(), hf5_.npresamples);
+
+ lambdaToTransfer(df3_.data.get(), hf3_.data);
+ lambdaToTransfer(df3_.ids.get(), hf3_.ids);
+
+ df01_.size = hf01_.ids.size();
+ df5_.size = hf5_.ids.size();
+ df3_.size = hf3_.ids.size();
+}
+
+void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) {
+ cms::cuda::ScopedContextProduce ctx{cudaState_};
+
+ ctx.emplace(event, digisF01HEToken_, std::move(df01_));
+ ctx.emplace(event, digisF5HBToken_, std::move(df5_));
+ ctx.emplace(event, digisF3HBToken_, std::move(df3_));
+}
+
+DEFINE_FWK_MODULE(HcalDigisProducerGPU);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
new file mode 100644
index 0000000000000..749a98e990755
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalESProducerGPUDefs.cc
@@ -0,0 +1,10 @@
+#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h"
+
+#include "ElectronicsMappingGPU.h"
+
+using HcalElectronicsMappingGPUESProducer =
+ ConvertingESProducerT;
+
+DEFINE_FWK_EVENTSETUP_MODULE(HcalElectronicsMappingGPUESProducer);
diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
new file mode 100644
index 0000000000000..7e8388a5f4d2f
--- /dev/null
+++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc
@@ -0,0 +1,200 @@
+#include
+
+#include "CondFormats/DataRecord/interface/HcalElectronicsMapRcd.h"
+#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
+#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include "DeclsForKernels.h"
+#include "DecodeGPU.h"
+#include "ElectronicsMappingGPU.h"
+
+class HcalRawToDigiGPU : public edm::stream::EDProducer {
+public:
+ explicit HcalRawToDigiGPU(edm::ParameterSet const& ps);
+ ~HcalRawToDigiGPU() override;
+ static void fillDescriptions(edm::ConfigurationDescriptions&);
+
+private:
+ void acquire(edm::Event const&, edm::EventSetup const&, edm::WaitingTaskWithArenaHolder) override;
+ void produce(edm::Event&, edm::EventSetup const&) override;
+
+private:
+ edm::EDGetTokenT rawDataToken_;
+ using ProductTypef01 = cms::cuda::Product>;
+ edm::EDPutTokenT digisF01HEToken_;
+ using ProductTypef5 = cms::cuda::Product>;
+ edm::EDPutTokenT digisF5HBToken_;
+ using ProductTypef3 = cms::cuda::Product>;
+ edm::EDPutTokenT digisF3HBToken_;
+
+ cms::cuda::ContextState cudaState_;
+
+ std::vector fedsToUnpack_;
+
+ hcal::raw::ConfigurationParameters config_;
+ hcal::raw::OutputDataGPU outputGPU_;
+ hcal::raw::OutputDataCPU outputCPU_;
+};
+
+void HcalRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("InputLabel", edm::InputTag("rawDataCollector"));
+ auto nFeds = FEDNumbering::MAXHCALuTCAFEDID - FEDNumbering::MINHCALuTCAFEDID + 1;
+ std::vector feds(nFeds);
+ for (int i = 0; i < nFeds; ++i)
+ feds[i] = i + FEDNumbering::MINHCALuTCAFEDID;
+ desc.add>("FEDs", feds);
+ desc.add("maxChannelsF01HE", 10000u);
+ desc.add("maxChannelsF5HB", 10000u);
+ desc.add("maxChannelsF3HB", 10000u);
+ desc.add("nsamplesF01HE", 8);
+ desc.add("nsamplesF5HB", 8);
+ desc.add("nsamplesF3HB", 8);
+ desc.add("digisLabelF5HB", "f5HBDigisGPU");
+ desc.add("digisLabelF01HE", "f01HEDigisGPU");
+ desc.add("digisLabelF3HB", "f3HBDigisGPU");
+
+ std::string label = "hcalRawToDigiGPU";
+ confDesc.add(label, desc);
+}
+
+HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps)
+ : rawDataToken_{consumes(ps.getParameter("InputLabel"))},
+ digisF01HEToken_{produces(ps.getParameter("digisLabelF01HE"))},
+ digisF5HBToken_{produces(ps.getParameter("digisLabelF5HB"))},
+ digisF3HBToken_{produces