Skip to content

Commit

Permalink
Clean up CAHitNtupletHeterogeneousEDProducer (#83)
Browse files Browse the repository at this point in the history
Apply some clean up to the code and formatting of `CAHitNtupletHeterogeneousEDProducer` and `CAHitQuadrupletGeneratorGPU`, as suggested by @makortel during the review of #48:
  - clean up the `BuildFile.xml`
  - remove unused data members and arguments from function calls;
  - percolate the CUDA stream instead of storing it as a data member.

Also:
  - add `cudaCheck` calls around memory allocations and copies;
  - reduce the number of memory allocations used to set up the GPU state.
  • Loading branch information
fwyzard authored Jun 18, 2018
1 parent 96559f3 commit bcee919
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 154 deletions.
31 changes: 15 additions & 16 deletions RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
<use name="RecoTracker/TkTrackingRegions"/>
<use name="RecoPixelVertexing/PixelTriplets"/>
<use name="RecoTracker/TkSeedingLayers"/>
<use name="RecoPixelVertexing/PixelTrackFitting"/>
<library file="*.cu *.cc" name="RecoPixelVertexingPixelTripletsPlugins">
<use name="cuda"/>
<flags EDM_PLUGIN="1"/>
<flags CUDA_FLAGS="--expt-relaxed-constexpr"/>
<use name="FWCore/Framework"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ParameterSet"/>
<use name="HeterogeneousCore/Producer"/>
<use name="HeterogeneousCore/Product"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="cuda-api-wrappers"/>
<use name="cuda"/>
<use name="cuda-api-wrappers"/>
<use name="FWCore/Framework"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/PluginManager"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="HeterogeneousCore/Producer"/>
<use name="HeterogeneousCore/Product"/>
<use name="RecoPixelVertexing/PixelTrackFitting"/>
<use name="RecoPixelVertexing/PixelTriplets"/>
<use name="RecoTracker/TkSeedingLayers"/>
<use name="RecoTracker/TkTrackingRegions"/>
<library file="*.cu *.cc" name="RecoPixelVertexingPixelTripletsPlugins">
<flags EDM_PLUGIN="1"/>
</library>
<flags CXXFLAGS="-Ofast -fno-math-errno"/>
<flags CXXFLAGS="-Ofast -fno-math-errno"/>
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ class CAHitNtupletHeterogeneousEDProducer

bool emptyRegionDoublets = false;
std::unique_ptr<RegionsSeedingHitSets> seedingHitSets_;
std::vector<OrderedHitSeeds> ntuplets_;
};

CAHitNtupletHeterogeneousEDProducer::CAHitNtupletHeterogeneousEDProducer(
Expand Down Expand Up @@ -89,7 +88,7 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda(
const edm::HeterogeneousEvent &iEvent, const edm::EventSetup &iSetup,
cuda::stream_t<> &cudaStream) {
edm::Handle<IntermediateHitDoublets> hdoublets;
iEvent.event().getByToken(doubletToken_, hdoublets);
iEvent.getByToken(doubletToken_, hdoublets);
const auto &regionDoublets = *hdoublets;

const SeedingLayerSetsHits &seedingLayerHits =
Expand All @@ -114,37 +113,34 @@ void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda(
GPUGenerator_.initEvent(iEvent.event(), iSetup);

LogDebug("CAHitNtupletHeterogeneousEDProducer")
<< "Creating ntuplets_ for " << regionDoublets.regionSize()
<< "Creating ntuplets for " << regionDoublets.regionSize()
<< " regions, and " << regionDoublets.layerPairsSize()
<< " layer pairs";
ntuplets_.clear();
ntuplets_.resize(regionDoublets.regionSize());
for (auto &ntuplet : ntuplets_)
ntuplet.reserve(localRA_.upper());

GPUGenerator_.hitNtuplets(regionDoublets, ntuplets_, iSetup,
seedingLayerHits, cudaStream.id());
GPUGenerator_.hitNtuplets(regionDoublets, iSetup, seedingLayerHits, cudaStream.id());
}
}

void CAHitNtupletHeterogeneousEDProducer::produceGPUCuda(
edm::HeterogeneousEvent &iEvent, const edm::EventSetup &iSetup,
cuda::stream_t<> &cudaStream) {

if (!emptyRegionDoublets) {
if (not emptyRegionDoublets) {
edm::Handle<IntermediateHitDoublets> hdoublets;
iEvent.getByToken(doubletToken_, hdoublets);
const auto &regionDoublets = *hdoublets;
const SeedingLayerSetsHits &seedingLayerHits =
regionDoublets.seedingLayerHits();
int index = 0;
std::vector<OrderedHitSeeds> ntuplets(regionDoublets.regionSize());
for (auto &ntuplet : ntuplets)
ntuplet.reserve(localRA_.upper());
for (const auto &regionLayerPairs : regionDoublets) {
const TrackingRegion &region = regionLayerPairs.region();
auto seedingHitSetsFiller = seedingHitSets_->beginRegion(&region);
GPUGenerator_.fillResults(regionDoublets, ntuplets_, iSetup,
seedingLayerHits, cudaStream.id());
fillNtuplets(seedingHitSetsFiller, ntuplets_[index]);
ntuplets_[index].clear();
GPUGenerator_.fillResults(regionDoublets, ntuplets, iSetup, seedingLayerHits, cudaStream.id());
fillNtuplets(seedingHitSetsFiller, ntuplets[index]);
ntuplets[index].clear();
index++;
}
localRA_.update(seedingHitSets_->size());
Expand All @@ -158,10 +154,8 @@ void CAHitNtupletHeterogeneousEDProducer::produceCPU(
iEvent.getByToken(doubletToken_, hdoublets);
const auto &regionDoublets = *hdoublets;

const SeedingLayerSetsHits &seedingLayerHits =
regionDoublets.seedingLayerHits();
if (seedingLayerHits.numberOfLayersInSet() <
CAHitQuadrupletGenerator::minLayers) {
const SeedingLayerSetsHits &seedingLayerHits = regionDoublets.seedingLayerHits();
if (seedingLayerHits.numberOfLayersInSet() < CAHitQuadrupletGenerator::minLayers) {
throw cms::Exception("LogicError")
<< "CAHitNtupletEDProducer expects "
"SeedingLayerSetsHits::numberOfLayersInSet() to be >= "
Expand All @@ -180,21 +174,21 @@ void CAHitNtupletHeterogeneousEDProducer::produceCPU(
CPUGenerator_.initEvent(iEvent.event(), iSetup);

LogDebug("CAHitNtupletEDProducer")
<< "Creating ntuplets_ for " << regionDoublets.regionSize()
<< "Creating ntuplets for " << regionDoublets.regionSize()
<< " regions, and " << regionDoublets.layerPairsSize() << " layer pairs";
std::vector<OrderedHitSeeds> ntuplets_;
ntuplets_.resize(regionDoublets.regionSize());
for (auto &ntuplet : ntuplets_)
std::vector<OrderedHitSeeds> ntuplets;
ntuplets.resize(regionDoublets.regionSize());
for (auto &ntuplet : ntuplets)
ntuplet.reserve(localRA_.upper());

CPUGenerator_.hitNtuplets(regionDoublets, ntuplets_, iSetup, seedingLayerHits);
CPUGenerator_.hitNtuplets(regionDoublets, ntuplets, iSetup, seedingLayerHits);
int index = 0;
for (const auto &regionLayerPairs : regionDoublets) {
const TrackingRegion &region = regionLayerPairs.region();
auto seedingHitSetsFiller = seedingHitSets->beginRegion(&region);

fillNtuplets(seedingHitSetsFiller, ntuplets_[index]);
ntuplets_[index].clear();
fillNtuplets(seedingHitSetsFiller, ntuplets[index]);
ntuplets[index].clear();
index++;
}
localRA_.update(seedingHitSets->size());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

namespace {

template <typename T> T sqr(T x) { return x * x; }
template <typename T> T sqr(T x) { return x * x; }

} // namespace

using namespace std;
Expand Down Expand Up @@ -182,9 +183,8 @@ void fillGraph(const SeedingLayerSetsHits &layers,

void CAHitQuadrupletGeneratorGPU::hitNtuplets(
const IntermediateHitDoublets &regionDoublets,
std::vector<OrderedHitSeeds> &result, const edm::EventSetup &es,
const SeedingLayerSetsHits &layers, const cudaStream_t &cudaStream) {
cudaStream_ = cudaStream;
const edm::EventSetup &es,
const SeedingLayerSetsHits &layers, cudaStream_t cudaStream) {
CAGraph g;

hitDoublets.resize(regionDoublets.regionSize());
Expand Down Expand Up @@ -288,7 +288,7 @@ void CAHitQuadrupletGeneratorGPU::hitNtuplets(
cudaMemcpyAsync(&d_indices_[j * maxNumberOfDoublets_ * 2],
&h_indices_[j * maxNumberOfDoublets_ * 2],
tmp_layerDoublets_[j].size * 2 * sizeof(int),
cudaMemcpyHostToDevice, cudaStream_);
cudaMemcpyHostToDevice, cudaStream);
}

for (unsigned int j = 0; j < numberOfLayers_; ++j) {
Expand All @@ -302,43 +302,43 @@ void CAHitQuadrupletGeneratorGPU::hitNtuplets(

cudaMemcpyAsync(&d_x_[maxNumberOfHits_ * j], &h_x_[j * maxNumberOfHits_],
tmp_layers_[j].size * sizeof(float),
cudaMemcpyHostToDevice, cudaStream_);
cudaMemcpyHostToDevice, cudaStream);

tmp_layers_[j].y = &d_y_[maxNumberOfHits_ * j];
cudaMemcpyAsync(&d_y_[maxNumberOfHits_ * j], &h_y_[j * maxNumberOfHits_],
tmp_layers_[j].size * sizeof(float),
cudaMemcpyHostToDevice, cudaStream_);
cudaMemcpyHostToDevice, cudaStream);

tmp_layers_[j].z = &d_z_[maxNumberOfHits_ * j];

cudaMemcpyAsync(&d_z_[maxNumberOfHits_ * j], &h_z_[j * maxNumberOfHits_],
tmp_layers_[j].size * sizeof(float),
cudaMemcpyHostToDevice, cudaStream_);
cudaMemcpyHostToDevice, cudaStream);
}

cudaMemcpyAsync(d_rootLayerPairs_, h_rootLayerPairs_,
numberOfRootLayerPairs_ * sizeof(unsigned int),
cudaMemcpyHostToDevice, cudaStream_);
cudaMemcpyHostToDevice, cudaStream);
cudaMemcpyAsync(d_doublets_, tmp_layerDoublets_,
numberOfLayerPairs_ * sizeof(GPULayerDoublets),
cudaMemcpyHostToDevice, cudaStream_);
cudaMemcpyHostToDevice, cudaStream);
cudaMemcpyAsync(d_layers_, tmp_layers_, numberOfLayers_ * sizeof(GPULayerHits),
cudaMemcpyHostToDevice, cudaStream_);
cudaMemcpyHostToDevice, cudaStream);

launchKernels(region, index);
}
launchKernels(region, index, cudaStream);
}
}

void CAHitQuadrupletGeneratorGPU::fillResults(
const IntermediateHitDoublets &regionDoublets,
std::vector<OrderedHitSeeds> &result, const edm::EventSetup &es,
const SeedingLayerSetsHits &layers, const cudaStream_t &cudaStream)
const SeedingLayerSetsHits &layers, cudaStream_t cudaStream)
{
int index = 0;

for (const auto &regionLayerPairs : regionDoublets) {
const TrackingRegion &region = regionLayerPairs.region();
auto foundQuads = fetchKernelResult(index);
auto foundQuads = fetchKernelResult(index, cudaStream);
unsigned int numberOfFoundQuadruplets = foundQuads.size();
const QuantityDependsPtEval maxChi2Eval = maxChi2.evaluator(es);

Expand Down
Loading

0 comments on commit bcee919

Please sign in to comment.