diff --git a/CUDADataFormats/HcalDigi/interface/DigiCollection.h b/CUDADataFormats/HcalDigi/interface/DigiCollection.h index f5ae63d0954c6..10350b52d4c52 100644 --- a/CUDADataFormats/HcalDigi/interface/DigiCollection.h +++ b/CUDADataFormats/HcalDigi/interface/DigiCollection.h @@ -136,7 +136,7 @@ namespace hcal { typename StoragePolicy::template StorageSelector::type ids; typename StoragePolicy::template StorageSelector::type data; - uint32_t stride; + uint32_t stride{0}; }; template diff --git a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp index fd144ae452363..94a43892e08b6 100644 --- a/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp +++ b/EventFilter/HcalRawToDigi/bin/makeHcalRaw2DigiGpuValidationPlots.cpp @@ -130,15 +130,15 @@ int main(int argc, char* argv[]) { rt->SetBranchAddress("QIE11DataFrameHcalDataFrameContainer_hcalDigis__RECO.", &wcpuf01he); rt->SetBranchAddress("HBHEDataFramesSorted_hcalDigis__RECO.", &wcpuf5hb); rt->SetBranchAddress( - "hcalFlavor5hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" + "hcalFlavor5calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" "f5HBDigis_RECO.", &wgpuf5hb); rt->SetBranchAddress( - "hcalFlavor01hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" + "hcalFlavor01calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" "f01HEDigis_RECO.", &wgpuf01he); rt->SetBranchAddress( - "hcalFlavor3hcalCUDAHostAllocatorAliashcalcommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" + "hcalFlavor3calocommonCUDAHostAllocatorAliascalocommonVecStoragePolicyhcalDigiCollection_hcalCPUDigisProducer_" "f3HBDigis_RECO.", &wgpuf3hb); diff --git a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc index d49e2ad366817..d5b6d42d0ed2c 100644 --- a/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc +++ b/EventFilter/HcalRawToDigi/plugins/HcalDigisProducerGPU.cc @@ -15,7 +15,7 @@ class HcalDigisProducerGPU : public edm::stream::EDProducer { public: explicit HcalDigisProducerGPU(edm::ParameterSet const& ps); - ~HcalDigisProducerGPU() override; + ~HcalDigisProducerGPU() override = default; static void fillDescriptions(edm::ConfigurationDescriptions&); private: @@ -49,11 +49,11 @@ class HcalDigisProducerGPU : public edm::stream::EDProducer { cms::cuda::ContextState cudaState_; struct ConfigParameters { - uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB, nsamplesF01HE, nsamplesF5HB, nsamplesF3HB; + uint32_t maxChannelsF01HE, maxChannelsF5HB, maxChannelsF3HB; }; ConfigParameters config_; - // tmp on the host + // per event host buffers HostCollectionf01 hf01_; HostCollectionf5 hf5_; HostCollectionf3 hf3_; @@ -76,9 +76,6 @@ void HcalDigisProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& conf desc.add("maxChannelsF01HE", 10000u); desc.add("maxChannelsF5HB", 10000u); desc.add("maxChannelsF3HB", 10000u); - desc.add("nsamplesF01HE", 8); - desc.add("nsamplesF5HB", 8); - desc.add("nsamplesF3HB", 8); confDesc.addWithDefaultLabel(desc); } @@ -92,27 +89,24 @@ HcalDigisProducerGPU::HcalDigisProducerGPU(const edm::ParameterSet& ps) config_.maxChannelsF01HE = ps.getParameter("maxChannelsF01HE"); config_.maxChannelsF5HB = ps.getParameter("maxChannelsF5HB"); config_.maxChannelsF3HB = ps.getParameter("maxChannelsF3HB"); - config_.nsamplesF01HE = ps.getParameter("nsamplesF01HE"); - config_.nsamplesF5HB = ps.getParameter("nsamplesF5HB"); - config_.nsamplesF3HB = ps.getParameter("nsamplesF3HB"); - - // preallocate on the host - hf01_.stride = hcal::compute_stride(config_.nsamplesF01HE); - hf5_.stride = hcal::compute_stride(config_.nsamplesF5HB); - hf3_.stride = hcal::compute_stride(config_.nsamplesF3HB); + + // this is a preallocation for the max statically known number of time samples + // actual stride/nsamples will be inferred from data + hf5_.stride = hcal::compute_stride(HBHEDataFrame::MAXSAMPLES); + hf01_.stride = hcal::compute_stride(QIE11DigiCollection::MAXSAMPLES); + hf3_.stride = hcal::compute_stride(QIE11DigiCollection::MAXSAMPLES); hf01_.reserve(config_.maxChannelsF01HE); hf5_.reserve(config_.maxChannelsF5HB); hf3_.reserve(config_.maxChannelsF3HB); } -HcalDigisProducerGPU::~HcalDigisProducerGPU() {} - void HcalDigisProducerGPU::acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder holder) { // raii cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_}; + // clear host buffers hf01_.clear(); hf5_.clear(); hf3_.clear(); @@ -123,35 +117,54 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event, event.getByToken(hbheDigiToken_, hbheDigis); event.getByToken(qie11DigiToken_, qie11Digis); - // flavor 0/1 get devie blobs - df01_.data = cms::cuda::make_device_unique( - config_.maxChannelsF01HE * hcal::compute_stride(config_.nsamplesF01HE), ctx.stream()); - df01_.ids = cms::cuda::make_device_unique(config_.maxChannelsF01HE, ctx.stream()); - // flavor3 get device blobs - df3_.data = cms::cuda::make_device_unique( - config_.maxChannelsF3HB * hcal::compute_stride(config_.nsamplesF3HB), ctx.stream()); - df3_.ids = cms::cuda::make_device_unique(config_.maxChannelsF3HB, ctx.stream()); + // init f5 collection + if (hbheDigis->size() > 0) { + auto const nsamples = (*hbheDigis)[0].size(); + auto const stride = hcal::compute_stride(nsamples); + hf5_.stride = stride; + + // flavor5 get device blobs + df5_.stride = stride; + df5_.data = cms::cuda::make_device_unique( + config_.maxChannelsF5HB * stride, ctx.stream()); + df5_.ids = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream()); + df5_.npresamples = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream()); + } - // flavor5 get device blobs - df5_.data = cms::cuda::make_device_unique( - config_.maxChannelsF5HB * hcal::compute_stride(config_.nsamplesF5HB), ctx.stream()); - df5_.ids = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream()); - df5_.npresamples = cms::cuda::make_device_unique(config_.maxChannelsF5HB, ctx.stream()); + if (qie11Digis->size() > 0) { + auto const nsamples = qie11Digis->samples(); + auto const stride01 = hcal::compute_stride(nsamples); + auto const stride3 = hcal::compute_stride(nsamples); + + hf01_.stride = stride01; + hf3_.stride = stride3; + + // flavor 0/1 get devie blobs + df01_.stride = stride01; + df01_.data = cms::cuda::make_device_unique( + config_.maxChannelsF01HE * stride01, ctx.stream()); + df01_.ids = cms::cuda::make_device_unique(config_.maxChannelsF01HE, ctx.stream()); + + // flavor3 get device blobs + df3_.stride = stride3; + df3_.data = cms::cuda::make_device_unique( + config_.maxChannelsF3HB * stride3, ctx.stream()); + df3_.ids = cms::cuda::make_device_unique(config_.maxChannelsF3HB, ctx.stream()); + } for (auto const& hbhe : *hbheDigis) { auto const id = hbhe.id().rawId(); auto const presamples = hbhe.presamples(); hf5_.ids.push_back(id); hf5_.npresamples.push_back(presamples); - int stride = hcal::compute_stride(config_.nsamplesF5HB); + auto const stride = hcal::compute_stride(hbhe.size()); + assert(stride == hf5_.stride && "strides must be the same for every single digi of the collection"); // simple for now... static_assert(hcal::Flavor5::HEADER_WORDS == 1); uint16_t header_word = (1 << 15) | (0x5 << 12) | (0 << 10) | ((hbhe.sample(0).capid() & 0x3) << 8); hf5_.data.push_back(header_word); - //for (int i=0; i(hbhe.sample(2 * i).adc()) & 0x7f); uint16_t s1 = (0 << 7) | (static_cast(hbhe.sample(2 * i + 1).adc()) & 0x7f); uint16_t sample = (s1 << 8) | s0; @@ -161,6 +174,7 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event, for (unsigned int i = 0; i < qie11Digis->size(); i++) { auto const& digi = QIE11DataFrame{(*qie11Digis)[i]}; + assert(digi.samples() == qie11Digis->samples() && "collection nsamples must equal per digi samples"); if (digi.flavor() == 0 or digi.flavor() == 1) { if (digi.detid().subdetId() != HcalEndcap) continue; @@ -185,6 +199,7 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event, } auto lambdaToTransfer = [&ctx](auto* dest, auto const& src) { + if (src.size() == 0) return; using vector_type = typename std::remove_reference::type; using type = typename vector_type::value_type; using dest_data_type = typename std::remove_pointer::type; @@ -201,17 +216,14 @@ void HcalDigisProducerGPU::acquire(edm::Event const& event, lambdaToTransfer(df3_.data.get(), hf3_.data); lambdaToTransfer(df3_.ids.get(), hf3_.ids); -} - -void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { - cms::cuda::ScopedContextProduce ctx{cudaState_}; - df01_.stride = hcal::compute_stride(config_.nsamplesF01HE); df01_.size = hf01_.ids.size(); - df5_.stride = hcal::compute_stride(config_.nsamplesF5HB); df5_.size = hf5_.ids.size(); - df3_.stride = hcal::compute_stride(config_.nsamplesF3HB); df3_.size = hf3_.ids.size(); +} + +void HcalDigisProducerGPU::produce(edm::Event& event, edm::EventSetup const& setup) { + cms::cuda::ScopedContextProduce ctx{cudaState_}; ctx.emplace(event, digisF01HEToken_, std::move(df01_)); ctx.emplace(event, digisF5HBToken_, std::move(df5_)); diff --git a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu index 7789b86a50958..8796e6c6396fa 100644 --- a/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu +++ b/RecoLocalCalo/HcalRecProducers/src/MahiGPU.cu @@ -68,10 +68,12 @@ namespace hcal { int const sipmQNTStoSum, int const firstSampleShift, uint32_t const offsetForHashes, - float const ts4Thresh) { + float const ts4Thresh, + int const startingSample) { // indices + runtime constants - auto const sample = threadIdx.x; - int32_t const nsamplesExpected = blockDim.x; + auto const sample = threadIdx.x + startingSample; + auto const sampleWithinWindow = threadIdx.x; + int32_t const nsamplesForCompute = blockDim.x; auto const lch = threadIdx.y; auto const gch = lch + blockDim.y * blockIdx.x; auto const nchannels_per_block = blockDim.y; @@ -82,7 +84,7 @@ namespace hcal { return; // initialize all output buffers - if (sample == 0) { + if (sampleWithinWindow == 0) { outputdid[gch] = 0; method0Energy[gch] = 0; method0Time[gch] = 0; @@ -100,20 +102,20 @@ namespace hcal { // configure shared mem extern __shared__ char smem[]; float* shrEnergyM0PerTS = reinterpret_cast(smem); - float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesExpected * nchannels_per_block; - float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesExpected * nchannels_per_block; + float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block; + float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block; float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block; unsigned long long int* shrMethod0EnergySamplePair = reinterpret_cast(shrEnergyM0TotalAccum + nchannels_per_block); - if (sample == 0) { + if (sampleWithinWindow == 0) { shrMethod0EnergyAccum[lch] = 0; shrMethod0EnergySamplePair[lch] = __float_as_uint(std::numeric_limits::min()); shrEnergyM0TotalAccum[lch] = 0; } // offset output - auto* amplitudesForChannel = amplitudes + nsamplesExpected * gch; - auto* noiseTermsForChannel = noiseTerms + nsamplesExpected * gch; + auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch; + auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch; auto const nchannelsf015 = nchannelsf01HE + nchannelsf5HB; // get event input quantities @@ -123,7 +125,7 @@ namespace hcal { : compute_nsamples(stride)); #ifdef HCAL_MAHI_GPUDEBUG - assert(nsamples == nsamplesExpected); + assert(nsamples == nsamplesForCompute || nsamples-startingSample==nsampelsForCompute); #endif auto const id = gch < nchannelsf01HE @@ -198,11 +200,11 @@ namespace hcal { // if that is not the case, we will see that with cuda mmecheck auto const soibit = soibit_for_sample(dataf01HE + stride * gch, sample); if (soibit == 1) - soiSamples[gch] = sample; + soiSamples[gch] = sampleWithinWindow; } else if (gch >= nchannelsf015) { auto const soibit = soibit_for_sample(dataf3HB + stride * (gch - nchannelsf015), sample); if (soibit == 1) - soiSamples[gch] = sample; + soiSamples[gch] = sampleWithinWindow; } __syncthreads(); int32_t const soi = gch < nchannelsf01HE @@ -212,7 +214,7 @@ namespace hcal { // ? npresamplesf5HB[gch - nchannelsf01HE] // : soiSamples[gch]; // this is here just to make things uniform... - if (gch >= nchannelsf01HE && gch < nchannelsf015 && sample == 0) + if (gch >= nchannelsf01HE && gch < nchannelsf015 && sampleWithinWindow == 0) soiSamples[gch] = npresamplesf5HB[gch - nchannelsf01HE]; // @@ -240,10 +242,10 @@ namespace hcal { auto const parLin3 = parLin3Values[sipmType - 1]; int const first = std::max(soi + sipmQTSShift, 0); - int const last = std::min(soi + sipmQNTStoSum, nsamplesExpected); + int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute); float sipmq = 0.0f; for (auto ts = first; ts < last; ts++) - sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesExpected + ts]; + sipmq += shrChargeMinusPedestal[threadIdx.y * nsamplesForCompute + ts]; auto const effectivePixelsFired = sipmq / fcByPE; auto const factor = hcal::reconstruction::compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired); @@ -268,12 +270,12 @@ namespace hcal { auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF; auto const startSampleTmp = soi + firstSampleShift; auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp; - auto const endSample = startSample + nsamplesToAdd < nsamples ? startSample + nsamplesToAdd : nsamples; + auto const endSample = startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute; // NOTE: gain is a small number < 10^-3, multiply it last auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection); auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection); // store to shared mem - shrEnergyM0PerTS[lch * nsamplesExpected + sample] = energym0_per_ts; + shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts; atomicAdd(&shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0); #ifdef HCAL_MAHI_GPUDEBUG @@ -298,12 +300,12 @@ namespace hcal { "startSample = %d endSample = %d param1 = %u param2 = %u\n", startSample, endSample, recoParam1, recoParam2); #endif - if (sample >= startSample && sample < endSample) { + if (sampleWithinWindow >= startSample && sampleWithinWindow < endSample) { atomicAdd(&shrMethod0EnergyAccum[lch], energym0_per_ts); // pack sample, energy as 64 bit value unsigned long long int old = shrMethod0EnergySamplePair[lch], assumed; unsigned long long int val = - (static_cast(sample) << 32) + __float_as_uint(energym0_per_ts); + (static_cast(sampleWithinWindow) << 32) + __float_as_uint(energym0_per_ts); do { assumed = old; // decode energy, sample values @@ -318,14 +320,14 @@ namespace hcal { __syncthreads(); // NOTE: must take soi, as values for that thread are used... - if (sample == soi) { + if (sampleWithinWindow == soi) { auto const method0_energy = shrMethod0EnergyAccum[lch]; auto const val = shrMethod0EnergySamplePair[lch]; int const max_sample = (val >> 32) & 0xffffffff; float const max_energy = __uint_as_float(val & 0xffffffff); float const max_energy_1 = - max_sample < nsamples - 1 ? shrEnergyM0PerTS[lch * nsamplesExpected + max_sample + 1] : 0.f; - float const position = nsamplesToAdd < nsamples ? max_sample - soi : max_sample; + max_sample < nsamplesForCompute - 1 ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1] : 0.f; + float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample; auto const sum = max_energy + max_energy_1; // FIXME: for full comparison with cpu method 0 timing, // need to correct by slew @@ -389,8 +391,8 @@ namespace hcal { #endif // store to global memory - amplitudesForChannel[sample] = amplitude; - noiseTermsForChannel[sample] = noiseTerm; + amplitudesForChannel[sampleWithinWindow] = amplitude; + noiseTermsForChannel[sampleWithinWindow] = noiseTerm; } // TODO: need to add an array of offsets for pulses (a la activeBXs...) @@ -1215,19 +1217,25 @@ namespace hcal { outputGPU.recHits.size = totalChannels; // TODO: this can be lifted by implementing a separate kernel - // similar to the default one, but properly handling the diff in #samples + // similar to the default one, but properly handling the diff in #sample // or modifying existing one auto const f01nsamples = compute_nsamples(inputGPU.f01HEDigis.stride); auto const f5nsamples = compute_nsamples(inputGPU.f5HBDigis.stride); auto const f3nsamples = compute_nsamples(inputGPU.f3HBDigis.stride); - assert(f01nsamples == f5nsamples && f01nsamples == f3nsamples); - - dim3 threadsPerBlock{f01nsamples, configParameters.kprep1dChannelsPerBlock}; + int constexpr windowSize = 8; + int const startingSample = f01nsamples - windowSize; + assert(startingSample==0 || startingSample==2); + if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f5HBDigis.stride> 0) + assert(f01nsamples == f5nsamples); + if (inputGPU.f01HEDigis.stride > 0 && inputGPU.f3HBDigis.stride > 0) + assert(f01nsamples == f3nsamples); + + dim3 threadsPerBlock{windowSize, configParameters.kprep1dChannelsPerBlock}; int blocks = static_cast(threadsPerBlock.y) > totalChannels ? 1 : (totalChannels + threadsPerBlock.y - 1) / threadsPerBlock.y; int nbytesShared = - ((2 * f01nsamples + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock; + ((2 * windowSize + 2) * sizeof(float) + sizeof(uint64_t)) * configParameters.kprep1dChannelsPerBlock; kernel_prep1d_sameNumberOfSamples<<>>( scratch.amplitudes.get(), scratch.noiseTerms.get(), @@ -1284,13 +1292,14 @@ namespace hcal { configParameters.sipmQNTStoSum, configParameters.firstSampleShift, conditions.offsetForHashes, - configParameters.ts4Thresh); + configParameters.ts4Thresh, + startingSample); cudaCheck(cudaGetLastError()); // 1024 is the max threads per block for gtx1080 // FIXME: take this from cuda service or something like that - uint32_t const channelsPerBlock = 1024 / (f01nsamples * conditions.pulseOffsetsHost.size()); - dim3 threadsPerBlock2{f01nsamples, static_cast(conditions.pulseOffsetsHost.size()), channelsPerBlock}; + uint32_t const channelsPerBlock = 1024 / (windowSize * conditions.pulseOffsetsHost.size()); + dim3 threadsPerBlock2{windowSize, static_cast(conditions.pulseOffsetsHost.size()), channelsPerBlock}; int blocks2 = threadsPerBlock2.z > totalChannels ? 1 : (totalChannels + threadsPerBlock2.z - 1) / threadsPerBlock2.z; @@ -1342,7 +1351,8 @@ namespace hcal { configParameters.tmaxTimeSlew); cudaCheck(cudaGetLastError()); - if (f01nsamples == 8 && conditions.pulseOffsetsHost.size() == 8u) { + // number of samples is checked in above assert + if (conditions.pulseOffsetsHost.size() == 8u) { // FIXME: provide constants from configuration uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0]; uint32_t blocks = threadsPerBlock > totalChannels ? 1 : (totalChannels + threadsPerBlock - 1) / threadsPerBlock;