Move BeamSpot transfer to GPU to its own producer #318

Merged (6 commits) on Apr 23, 2019
8 changes: 8 additions & 0 deletions CUDADataFormats/BeamSpot/BuildFile.xml
@@ -0,0 +1,8 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
</export>
32 changes: 32 additions & 0 deletions CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
@@ -0,0 +1,32 @@
#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

#include <cuda/api_wrappers.h>

class BeamSpotCUDA {
public:
// alignas(128) doesn't really make sense as there is only one
// beamspot per event?
struct Data {
float x,y,z; // position
// TODO: add covariance matrix

float sigmaZ;
float beamWidthX, beamWidthY;
float dxdz, dydz;
float emittanceX, emittanceY;
float betaStar;
};

BeamSpotCUDA() = default;
BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream);

Data const* data() const { return data_d_.get(); }

private:
cudautils::device::unique_ptr<Data> data_d_;
};

#endif
11 changes: 11 additions & 0 deletions CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -0,0 +1,11 @@
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream) {
edm::Service<CUDAService> cs;

data_d_ = cs->make_device_unique<Data>(stream);
cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream.id());
}
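For context, a minimal usage sketch of the new data format (not part of this PR's diff). The host-side Data struct is filled in pinned, non-cached memory (using the allocator added further down in this PR) and handed to BeamSpotCUDA, whose constructor issues the asynchronous host-to-device copy on the given CUDA stream. The class and function names below (BeamSpotTransfer, toDevice) are illustrative, not CMSSW code.

#include <cuda/api_wrappers.h>

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"

// Hypothetical helper class, for illustration only.
class BeamSpotTransfer {
public:
  BeamSpotTransfer()
      : bsHost_(cudautils::make_host_noncached_unique<BeamSpotCUDA::Data>(cudaHostAllocWriteCombined)) {}

  // Fill the pinned staging struct and launch the async copy; the returned
  // BeamSpotCUDA owns the device-side memory.
  BeamSpotCUDA toDevice(float x, float y, float z, cuda::stream_t<>& stream) {
    bsHost_->x = x;
    bsHost_->y = y;
    bsHost_->z = z;
    // ... sigmaZ, beamWidthX/Y, dxdz/dydz, emittanceX/Y and betaStar would be filled the same way ...
    return BeamSpotCUDA(bsHost_.get(), stream);  // copy is asynchronous on 'stream'
  }

private:
  // Allocated once and reused for every event; it must outlive the async copy.
  cudautils::host::noncached::unique_ptr<BeamSpotCUDA::Data> bsHost_;
};

Keeping the staging struct as a long-lived member (rather than a per-event local) ensures the pinned memory is still alive when the asynchronous copy actually executes.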
8 changes: 8 additions & 0 deletions CUDADataFormats/BeamSpot/src/classes.h
@@ -0,0 +1,8 @@
#ifndef CUDADataFormats_BeamSpot_classes_h
#define CUDADataFormats_BeamSpot_classes_h

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
4 changes: 4 additions & 0 deletions CUDADataFormats/BeamSpot/src/classes_def.xml
@@ -0,0 +1,4 @@
<lcgdict>
<class name="CUDAProduct<BeamSpotCUDA>" persistent="false"/>
<class name="edm::Wrapper<CUDAProduct<BeamSpotCUDA>>" persistent="false"/>
</lcgdict>
6 changes: 6 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProduct.h
@@ -45,6 +45,12 @@ class CUDAProduct: public CUDAProductBase {
data_(std::move(data))
{}

template <typename... Args>
explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, std::shared_ptr<cuda::event_t> event, Args&&... args):
CUDAProductBase(device, std::move(stream), std::move(event)),
data_(std::forward<Args>(args)...)
{}

T data_; //!
};

4 changes: 2 additions & 2 deletions Configuration/StandardSequences/python/Reconstruction_cff.py
@@ -198,9 +198,9 @@
reconstruction_trackingOnly = cms.Sequence(localreco*globalreco_tracking)
reconstruction_pixelTrackingOnly = cms.Sequence(
pixeltrackerlocalreco*
offlineBeamSpot*
siPixelClusterShapeCachePreSplitting*
recopixelvertexing
recopixelvertexing,
offlineBeamSpotTask
)

#need a fully expanded sequence copy
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1,5 +1,6 @@
<use name="cub"/>
<use name="cuda"/>
<use name="cuda-api-wrappers"/>

<export>
<lib name="1"/>
65 changes: 65 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h
@@ -0,0 +1,65 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h
#define HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h

#include <memory>

#include <cuda/api_wrappers.h>
#include <cuda_runtime.h>

namespace cudautils {
namespace host {
namespace noncached {
namespace impl {
// Additional layer of types to distinguish from host::unique_ptr
class HostDeleter {
public:
void operator()(void *ptr) {
cuda::throw_if_error(cudaFreeHost(ptr));
}
};
}

template <typename T>
using unique_ptr = std::unique_ptr<T, impl::HostDeleter>;

namespace impl {
template <typename T>
struct make_host_unique_selector { using non_array = cudautils::host::noncached::unique_ptr<T>; };
template <typename T>
struct make_host_unique_selector<T[]> { using unbounded_array = cudautils::host::noncached::unique_ptr<T[]>; };
template <typename T, size_t N>
struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
}
}
}

/**
* The difference wrt. CUDAService::make_host_unique is that these
* do not cache, so they should not be called per-event.
*/
template <typename T>
typename host::noncached::impl::make_host_unique_selector<T>::non_array
make_host_noncached_unique(unsigned int flags = cudaHostAllocDefault) {
static_assert(std::is_trivially_constructible<T>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
void *mem;
cuda::throw_if_error(cudaHostAlloc(&mem, sizeof(T), flags));
return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem));
}

template <typename T>
typename host::noncached::impl::make_host_unique_selector<T>::unbounded_array
make_host_noncached_unique(size_t n, unsigned int flags = cudaHostAllocDefault) {
using element_type = typename std::remove_extent<T>::type;
static_assert(std::is_trivially_constructible<element_type>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
void *mem;
cuda::throw_if_error(cudaHostAlloc(&mem, n*sizeof(element_type), flags));
return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem));
}

template <typename T, typename ...Args>
typename cudautils::host::noncached::impl::make_host_unique_selector<T>::bounded_array
make_host_noncached_unique(Args&&...) = delete;
}

#endif
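As a brief sketch of the intended pattern (illustrative names, not from this PR): the pinned, non-cached buffer is allocated once, for example in a constructor, and then reused every event as a write-combined staging area for asynchronous copies, which is exactly the pattern the WordFedAppender below is switched to.

#include <algorithm>
#include <cstddef>

#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"

// Allocated once, e.g. as a data member in a constructor (hypothetical):
//   staging_ = cudautils::make_host_noncached_unique<unsigned int[]>(maxWords, cudaHostAllocWriteCombined);

// Reused per event: fill the staging buffer on the host, then copy it to the
// device asynchronously on the caller's CUDA stream.
void stageAndCopy(cudautils::host::noncached::unique_ptr<unsigned int[]>& staging,
                  unsigned int const* src, size_t n,
                  unsigned int* deviceBuffer, cuda::stream_t<>& stream) {
  std::copy(src, src + n, staging.get());  // sequential writes suit write-combined memory
  cuda::memory::async::copy(deviceBuffer, staging.get(), n * sizeof(unsigned int), stream.id());
}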

2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -68,7 +68,7 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp,host_noncached_unique_ptr_t.cpp" name="cudaMemUtils_t">
<use name="HeterogeneousCore/CUDAServices"/>
<use name="catch2"/>
</bin>
22 changes: 22 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp
@@ -0,0 +1,22 @@
#include "catch.hpp"

#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"

TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") {
exitSansCUDADevices();

SECTION("Single element") {
auto ptr1 = cudautils::make_host_noncached_unique<int>();
REQUIRE(ptr1 != nullptr);
auto ptr2 = cudautils::make_host_noncached_unique<int>(cudaHostAllocPortable | cudaHostAllocWriteCombined);
REQUIRE(ptr2 != nullptr);
}

SECTION("Multiple elements") {
auto ptr1 = cudautils::make_host_noncached_unique<int[]>(10);
REQUIRE(ptr1 != nullptr);
auto ptr2 = cudautils::make_host_noncached_unique<int[]>(10, cudaHostAllocPortable | cudaHostAllocWriteCombined);
REQUIRE(ptr2 != nullptr);
}
}
RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
@@ -24,7 +24,9 @@
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
#include "RecoTracker/Record/interface/CkfComponentsRecord.h"

@@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
std::unique_ptr<PixelUnpackingRegions> regions_;

pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
PixelDataFormatter::Errors errors_;

const bool includeErrors_;
@@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
}

if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)";

edm::Service<CUDAService> cs;
if(cs->enabled()) {
wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
}
}

void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event

// In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
ErrorChecker errorcheck;
auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream());
for(int fedId: fedIds_) {
if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data
if (regions_ && !regions_->mayUnpackFED(fedId)) continue;
@@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
const cms_uint32_t * ew = (const cms_uint32_t *)(trailer);

assert(0 == (ew-bw)%2);
wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
wordCounterGPU+=(ew-bw);

} // end of for loop

gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains,
wordFedAppender,
*wordFedAppender_,
std::move(errors_),
wordCounterGPU, fedCounter,
useQuality_, includeErrors_,
RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -45,10 +45,9 @@ namespace pixelgpudetails {
// number of words for all the FEDs
constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;

SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
edm::Service<CUDAService> cs;
word_ = cs->make_host_unique<unsigned int[]>(MAX_FED_WORDS, cudaStream);
fedId_ = cs->make_host_unique<unsigned char[]>(MAX_FED_WORDS, cudaStream);
SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() {
word_ = cudautils::make_host_noncached_unique<unsigned int[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
fedId_ = cudautils::make_host_noncached_unique<unsigned char[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
}

void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -11,6 +11,7 @@
#include "FWCore/Utilities/interface/typedefs.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"

struct SiPixelFedCablingMapGPU;
@@ -159,7 +160,7 @@ namespace pixelgpudetails {
public:
class WordFedAppender {
public:
WordFedAppender(cuda::stream_t<>& cudaStream);
WordFedAppender();
~WordFedAppender() = default;

void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
@@ -168,8 +169,8 @@
const unsigned char *fedId() const { return fedId_.get(); }

private:
cudautils::host::unique_ptr<unsigned int[]> word_;
cudautils::host::unique_ptr<unsigned char[]> fedId_;
cudautils::host::noncached::unique_ptr<unsigned int[]> word_;
cudautils::host::noncached::unique_ptr<unsigned char[]> fedId_;
};

SiPixelRawToClusterGPUKernel() = default;
1 change: 1 addition & 0 deletions RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
<use name="CUDADataFormats/BeamSpot"/>
<use name="DataFormats/TrackerCommon"/>
<use name="HeterogeneousCore/CUDACore"/>
<use name="HeterogeneousCore/Producer"/>
7 changes: 2 additions & 5 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,7 +36,6 @@ namespace pixelgpudetails {

constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits();

cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));

// Coalesce all 32bit and 16bit arrays to two big blobs
@@ -111,7 +110,6 @@
#endif
}
PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
cudaCheck(cudaFree(gpu_.bs_d));
cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
cudaCheck(cudaFree(gpu_.owner_32bit_));
cudaCheck(cudaFree(gpu_.owner_16bit_));
@@ -131,11 +129,10 @@

void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,
SiPixelClustersCUDA const& clusters_d,
float const * bs,
BeamSpotCUDA const& bs_d,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
bool transferToCPU,
cuda::stream_t<>& stream) {
cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
gpu_.hitsModuleStart_d = clusters_d.clusModuleStart();
gpu_.cpeParams = cpeParams; // copy it for use in clients
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
Expand All @@ -148,7 +145,7 @@ namespace pixelgpudetails {
#endif
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
cpeParams,
gpu_.bs_d,
bs_d.data(),
digis_d.moduleInd(),
digis_d.xx(), digis_d.yy(), digis_d.adc(),
clusters_d.moduleStart(),
3 changes: 2 additions & 1 deletion RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -1,6 +1,7 @@
#ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
#define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
@@ -34,7 +35,7 @@ namespace pixelgpudetails {

void makeHitsAsync(SiPixelDigisCUDA const& digis_d,
SiPixelClustersCUDA const& clusters_d,
float const * bs,
BeamSpotCUDA const& bs_d,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
bool transferToCPU,
cuda::stream_t<>& stream);