diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
new file mode 100644
index 0000000000000..1046b76eef0f7
--- /dev/null
+++ b/CUDADataFormats/Common/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h
new file mode 100644
index 0000000000000..ca07a344ba2d5
--- /dev/null
+++ b/CUDADataFormats/Common/interface/CUDAProduct.h
@@ -0,0 +1,51 @@
+#ifndef CUDADataFormats_Common_CUDAProduct_h
+#define CUDADataFormats_Common_CUDAProduct_h
+
+#include <memory>
+
+#include <cuda/api_wrappers.h>
+
+#include "CUDADataFormats/Common/interface/CUDAProductBase.h"
+
+namespace edm {
+  template <typename T> class Wrapper;
+}
+
+/**
+ * The purpose of this class is to wrap CUDA data to edm::Event in a
+ * way which forces correct use of various utilities.
+ *
+ * The non-default construction has to be done with CUDAScopedContext
+ * (in order to properly register the CUDA event).
+ *
+ * The default constructor is needed only for the ROOT dictionary generation.
+ *
+ * The CUDA event is in practice needed only for stream-stream
+ * synchronization, but someone with a long-enough lifetime has to own
+ * it. Here is a somewhat natural place. If the overhead is too much, we
+ * can e.g. make CUDAService own them (creating them on demand) and
+ * use them only where synchronization between streams is needed.
+ */
+template <typename T>
+class CUDAProduct: public CUDAProductBase {
+public:
+  CUDAProduct() = default; // Needed only for ROOT dictionary generation
+
+  CUDAProduct(const CUDAProduct&) = delete;
+  CUDAProduct& operator=(const CUDAProduct&) = delete;
+  CUDAProduct(CUDAProduct&&) = default;
+  CUDAProduct& operator=(CUDAProduct&&) = default;
+
+private:
+  friend class CUDAScopedContext;
+  friend class edm::Wrapper<CUDAProduct<T>>;
+
+  explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, T data):
+    CUDAProductBase(device, std::move(stream)),
+    data_(std::move(data))
+  {}
+
+  T data_; //!
+};
+
+#endif
diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h
new file mode 100644
index 0000000000000..eb6fdae0e5abf
--- /dev/null
+++ b/CUDADataFormats/Common/interface/CUDAProductBase.h
@@ -0,0 +1,40 @@
+#ifndef CUDADataFormats_Common_CUDAProductBase_h
+#define CUDADataFormats_Common_CUDAProductBase_h
+
+#include <memory>
+
+#include <cuda/api_wrappers.h>
+
+/**
+ * Base class for all instantiations of CUDAProduct<T> to hold the
+ * non-T-dependent members.
+ */
+class CUDAProductBase {
+public:
+  CUDAProductBase() = default; // Needed only for ROOT dictionary generation
+
+  bool isValid() const { return stream_.get() != nullptr; }
+
+  int device() const { return device_; }
+
+  const cuda::stream_t<>& stream() const { return *stream_; }
+  cuda::stream_t<>& stream() { return *stream_; }
+  const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }
+
+  const cuda::event_t& event() const { return *event_; }
+  cuda::event_t& event() { return *event_; }
+
+protected:
+  explicit CUDAProductBase(int device, std::shared_ptr<cuda::stream_t<>> stream);
+
+private:
+  // The cuda::stream_t<> is really shared among edm::Event products, so
+  // using shared_ptr also here
+  std::shared_ptr<cuda::stream_t<>> stream_; //!
+  // shared_ptr because of caching in CUDAService
+  std::shared_ptr<cuda::event_t> event_; //!
+
+  int device_ = -1; //!
+};
+
+#endif
diff --git a/CUDADataFormats/Common/interface/device_unique_ptr.h b/CUDADataFormats/Common/interface/device_unique_ptr.h
deleted file mode 100644
index 1282c52125fa6..0000000000000
--- a/CUDADataFormats/Common/interface/device_unique_ptr.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h
-#define CUDADataFormats_Common_interface_device_unique_ptr_h
-
-#include <memory>
-#include <functional>
-
-namespace edm {
-  namespace cuda {
-    namespace device {
-      template <typename T>
-      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
-    }
-  }
-}
-
-#endif
diff --git a/CUDADataFormats/Common/interface/host_unique_ptr.h b/CUDADataFormats/Common/interface/host_unique_ptr.h
deleted file mode 100644
index c945d9b0aa027..0000000000000
--- a/CUDADataFormats/Common/interface/host_unique_ptr.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h
-#define CUDADataFormats_Common_interface_host_unique_ptr_h
-
-#include <memory>
-#include <functional>
-
-namespace edm {
-  namespace cuda {
-    namespace host {
-      template <typename T>
-      using unique_ptr = std::unique_ptr<T, std::function<void(void *)>>;
-    }
-  }
-}
-
-#endif
diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc
new file mode 100644
index 0000000000000..c034b4f7295f8
--- /dev/null
+++ b/CUDADataFormats/Common/src/CUDAProductBase.cc
@@ -0,0 +1,19 @@
+#include "CUDADataFormats/Common/interface/CUDAProductBase.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+CUDAProductBase::CUDAProductBase(int device, std::shared_ptr<cuda::stream_t<>> stream):
+  stream_(std::move(stream)),
+  device_(device)
+{
+  edm::Service<CUDAService> cs;
+  event_ = cs->getCUDAEvent();
+
+  // Record CUDA event to the CUDA stream. The event will become
+  // "occurred" after all work queued to the stream before this
+  // point has been finished.
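+  // (The event is what lets a consumer running on another CUDA stream
+  // synchronize with this product via cudaStreamWaitEvent() instead of
+  // blocking the CPU; see HeterogeneousCore/CUDACore/README.md.)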
+  event_->record(stream_->id());
+}
+
diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml
new file mode 100644
index 0000000000000..5e804fe80a736
--- /dev/null
+++ b/CUDADataFormats/Common/test/BuildFile.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc
new file mode 100644
index 0000000000000..bd5ddf7f512fe
--- /dev/null
+++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc
@@ -0,0 +1,63 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+#include <cuda_runtime_api.h>
+
+namespace cudatest {
+  class TestCUDAScopedContext {
+  public:
+    static
+    CUDAScopedContext make(int dev) {
+      auto device = cuda::device::get(dev);
+      return CUDAScopedContext(dev, std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)));
+    }
+  };
+}
+
+TEST_CASE("Use of CUDAProduct template", "[CUDACore]") {
+  SECTION("Default constructed") {
+    auto foo = CUDAProduct<int>();
+    REQUIRE(!foo.isValid());
+
+    auto bar = std::move(foo);
+  }
+
+  exitSansCUDADevices();
+
+  constexpr int defaultDevice = 0;
+  {
+    auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice);
+    std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+    auto& data = *dataPtr;
+
+    SECTION("Construct from CUDAScopedContext") {
+      REQUIRE(data.isValid());
+      REQUIRE(data.device() == defaultDevice);
+      REQUIRE(data.stream().id() == ctx.stream().id());
+      REQUIRE(&data.event() != nullptr);
+    }
+
+    SECTION("Move constructor") {
+      auto data2 = CUDAProduct<int>(std::move(data));
+      REQUIRE(data2.isValid());
+      REQUIRE(!data.isValid());
+    }
+
+    SECTION("Move assignment") {
+      CUDAProduct<int> data2;
+      data2 = std::move(data);
+      REQUIRE(data2.isValid());
+      REQUIRE(!data.isValid());
+    }
+  }
+
+  // Destroy and clean up all resources so that the next test can
+  // assume to start from a clean state.
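+  // (cudaDeviceReset() destroys the primary context of the device,
+  // releasing all allocations made during the test.)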
+ cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + cudaDeviceReset(); +} diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/CUDADataFormats/Common/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml index 21c527e7b2f0d..d34658faa2573 100644 --- a/CUDADataFormats/SiPixelCluster/BuildFile.xml +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -1,6 +1,7 @@ + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index ca8a75d178b6c..f25a8a25f0808 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -1,14 +1,15 @@ #ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include class SiPixelClustersCUDA { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream); + explicit SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream); ~SiPixelClustersCUDA() = default; SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete; @@ -16,20 +17,23 @@ class SiPixelClustersCUDA { SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default; SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default; + void setNClusters(uint32_t nClusters) { + nClusters_h = nClusters; + } + + uint32_t nClusters() const { return nClusters_h; } + uint32_t *moduleStart() { return moduleStart_d.get(); } - int32_t *clus() { return clus_d.get(); } uint32_t *clusInModule() { return clusInModule_d.get(); } uint32_t *moduleId() { return moduleId_d.get(); } uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } uint32_t const *moduleStart() const { return moduleStart_d.get(); } - int32_t const *clus() const { return clus_d.get(); } uint32_t const *clusInModule() const { return clusInModule_d.get(); } uint32_t const *moduleId() const { return moduleId_d.get(); } uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } - int32_t const *c_clus() const { return clus_d.get(); } uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } uint32_t const *c_moduleId() const { return moduleId_d.get(); } uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } @@ -40,7 +44,6 @@ class SiPixelClustersCUDA { #ifdef __CUDACC__ __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); } - __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); } __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); } __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); } @@ -50,7 +53,6 @@ class SiPixelClustersCUDA { private: uint32_t const *moduleStart_; - int32_t const 
*clus_; uint32_t const *clusInModule_; uint32_t const *moduleId_; uint32_t const *clusModuleStart_; @@ -59,15 +61,16 @@ class SiPixelClustersCUDA { DeviceConstView *view() const { return view_d.get(); } private: - edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel - edm::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - edm::cuda::device::unique_ptr moduleId_d; // module id of each module + cudautils::device::unique_ptr moduleStart_d; // index of the first pixel of each module + cudautils::device::unique_ptr clusInModule_d; // number of clusters found in each module + cudautils::device::unique_ptr moduleId_d; // module id of each module // originally from rechits - edm::cuda::device::unique_ptr clusModuleStart_d; + cudautils::device::unique_ptr clusModuleStart_d; + + cudautils::device::unique_ptr view_d; // "me" pointer - edm::cuda::device::unique_ptr view_d; // "me" pointer + uint32_t nClusters_h; }; #endif diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index 7363c2fd364af..d88a1b0a6370b 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -2,23 +2,22 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) { +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream) { edm::Service cs; - moduleStart_d = cs->make_device_unique(nelements+1, stream); - clus_d = cs->make_device_unique< int32_t[]>(feds, stream); - clusInModule_d = cs->make_device_unique(nelements, stream); - moduleId_d = cs->make_device_unique(nelements, stream); - clusModuleStart_d = cs->make_device_unique(nelements+1, stream); + moduleStart_d = cs->make_device_unique(maxClusters+1, stream); + clusInModule_d = cs->make_device_unique(maxClusters, stream); + moduleId_d = cs->make_device_unique(maxClusters, stream); + clusModuleStart_d = cs->make_device_unique(maxClusters+1, stream); auto view = cs->make_host_unique(stream); view->moduleStart_ = moduleStart_d.get(); - view->clus_ = clus_d.get(); view->clusInModule_ = clusInModule_d.get(); view->moduleId_ = moduleId_d.get(); view->clusModuleStart_ = clusModuleStart_d.get(); view_d = cs->make_device_unique(stream); - cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); + cudautils::copyAsync(view_d, view, stream); } diff --git a/CUDADataFormats/SiPixelCluster/src/classes.h b/CUDADataFormats/SiPixelCluster/src/classes.h new file mode 100644 index 0000000000000..08d46244adc7d --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_SiPixelCluster_classes_h +#define CUDADataFormats_SiPixelCluster_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelCluster/src/classes_def.xml b/CUDADataFormats/SiPixelCluster/src/classes_def.xml new file mode 100644 index 0000000000000..ba0706ac4b8aa --- /dev/null +++ 
b/CUDADataFormats/SiPixelCluster/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml index 259aa9f08d054..29ec13098819c 100644 --- a/CUDADataFormats/SiPixelDigi/BuildFile.xml +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -1,6 +1,8 @@ + + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h new file mode 100644 index 0000000000000..e9c8c0f644722 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h @@ -0,0 +1,40 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h + +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" + +#include + +class SiPixelDigiErrorsCUDA { +public: + SiPixelDigiErrorsCUDA() = default; + explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream); + ~SiPixelDigiErrorsCUDA() = default; + + SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA& operator=(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default; + SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default; + + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + + GPU::SimpleVector *error() { return error_d.get(); } + GPU::SimpleVector const *error() const { return error_d.get(); } + GPU::SimpleVector const *c_error() const { return error_d.get(); } + + using HostDataError = std::pair, cudautils::host::unique_ptr>; + HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const; + + void copyErrorToHostAsync(cuda::stream_t<>& stream); + +private: + cudautils::device::unique_ptr data_d; + cudautils::device::unique_ptr> error_d; + cudautils::host::unique_ptr> error_h; + PixelFormatterErrors formatterErrors_h; +}; + +#endif diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index 66ca680effd19..6a52545483eb8 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -1,15 +1,15 @@ #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "FWCore/Utilities/interface/propagate_const.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include class SiPixelDigisCUDA { public: SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream); ~SiPixelDigisCUDA() = default; SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete; @@ -17,20 +17,42 @@ class SiPixelDigisCUDA { SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default; SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default; + void setNModulesDigis(uint32_t nModules, uint32_t nDigis) { + nModules_h = nModules; + nDigis_h = nDigis; + } + + uint32_t nModules() const { return nModules_h; 
} + uint32_t nDigis() const { return nDigis_h; } + uint16_t * xx() { return xx_d.get(); } uint16_t * yy() { return yy_d.get(); } uint16_t * adc() { return adc_d.get(); } uint16_t * moduleInd() { return moduleInd_d.get(); } + int32_t * clus() { return clus_d.get(); } + uint32_t * pdigi() { return pdigi_d.get(); } + uint32_t * rawIdArr() { return rawIdArr_d.get(); } uint16_t const *xx() const { return xx_d.get(); } uint16_t const *yy() const { return yy_d.get(); } uint16_t const *adc() const { return adc_d.get(); } uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } uint16_t const *c_xx() const { return xx_d.get(); } uint16_t const *c_yy() const { return yy_d.get(); } uint16_t const *c_adc() const { return adc_d.get(); } uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } + + cudautils::host::unique_ptr adcToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr pdigiToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr rawIdArrToHostAsync(cuda::stream_t<>& stream) const; class DeviceConstView { public: @@ -41,6 +63,7 @@ class SiPixelDigisCUDA { __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); } __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); } __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); } + __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } #endif friend class SiPixelDigisCUDA; @@ -50,16 +73,27 @@ class SiPixelDigisCUDA { uint16_t const *yy_; uint16_t const *adc_; uint16_t const *moduleInd_; + int32_t const *clus_; }; const DeviceConstView *view() const { return view_d.get(); } private: - edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel - edm::cuda::device::unique_ptr yy_d; // - edm::cuda::device::unique_ptr adc_d; // ADC of each pixel - edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel - edm::cuda::device::unique_ptr view_d; // "me" pointer + // These are consumed by downstream device code + cudautils::device::unique_ptr xx_d; // local coordinates of each pixel + cudautils::device::unique_ptr yy_d; // + cudautils::device::unique_ptr adc_d; // ADC of each pixel + cudautils::device::unique_ptr moduleInd_d; // module id of each pixel + cudautils::device::unique_ptr clus_d; // cluster id of each pixel + cudautils::device::unique_ptr view_d; // "me" pointer + + // These are for CPU output; should we (eventually) place them to a + // separate product? 
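+  // (They are read back on the host with the pdigiToHostAsync() and
+  // rawIdArrToHostAsync() helpers declared above.)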
+ cudautils::device::unique_ptr pdigi_d; + cudautils::device::unique_ptr rawIdArr_d; + + uint32_t nModules_h = 0; + uint32_t nDigis_h = 0; }; #endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc new file mode 100644 index 0000000000000..92aab1ec9d578 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc @@ -0,0 +1,44 @@ +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream): + formatterErrors_h(std::move(errors)) +{ + edm::Service cs; + + error_d = cs->make_device_unique>(stream); + data_d = cs->make_device_unique(maxFedWords, stream); + + cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream); + + error_h = cs->make_host_unique>(stream); + GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + assert(error_h->size() == 0); + assert(error_h->capacity() == static_cast(maxFedWords)); + + cudautils::copyAsync(error_d, error_h, stream); +} + +void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) { + cudautils::copyAsync(error_h, error_d, stream); +} + +SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + // On one hand size() could be sufficient. On the other hand, if + // someone copies the SimpleVector<>, (s)he might expect the data + // buffer to actually have space for capacity() elements. 
+ auto data = cs->make_host_unique(error_h->capacity(), stream); + + // but transfer only the required amount + if(error_h->size() > 0) { + cudautils::copyAsync(data, data_d, error_h->size(), stream); + } + auto err = *error_h; + err.set_data(data.get()); + return HostDataError(std::move(err), std::move(data)); +} diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 7e3d876ac8bdc..ef13ed9612dbf 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -2,24 +2,55 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream) { edm::Service cs; - xx_d = cs->make_device_unique(nelements, stream); - yy_d = cs->make_device_unique(nelements, stream); - adc_d = cs->make_device_unique(nelements, stream); - moduleInd_d = cs->make_device_unique(nelements, stream); + xx_d = cs->make_device_unique(maxFedWords, stream); + yy_d = cs->make_device_unique(maxFedWords, stream); + adc_d = cs->make_device_unique(maxFedWords, stream); + moduleInd_d = cs->make_device_unique(maxFedWords, stream); + clus_d = cs->make_device_unique< int32_t[]>(maxFedWords, stream); + + pdigi_d = cs->make_device_unique(maxFedWords, stream); + rawIdArr_d = cs->make_device_unique(maxFedWords, stream); auto view = cs->make_host_unique(stream); view->xx_ = xx_d.get(); view->yy_ = yy_d.get(); view->adc_ = adc_d.get(); view->moduleInd_ = moduleInd_d.get(); + view->clus_ = clus_d.get(); view_d = cs->make_device_unique(stream); - cudaCheck(cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id())); + cudautils::copyAsync(view_d, view, stream); +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, adc_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, clus_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, pdigi_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, rawIdArr_d, nDigis(), stream); + return ret; } diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h new file mode 100644 index 0000000000000..41b135640b883 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes.h @@ -0,0 +1,9 @@ +#ifndef CUDADataFormats_SiPixelDigi_classes_h +#define CUDADataFormats_SiPixelDigi_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include 
"CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml new file mode 100644 index 0000000000000..9d6816ed3b14c --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py index 96aaeebbfaacd..ed10c78a40c9b 100644 --- a/Configuration/StandardSequences/python/RawToDigi_cff.py +++ b/Configuration/StandardSequences/python/RawToDigi_cff.py @@ -3,7 +3,7 @@ # This object is used to selectively make changes for different running # scenarios. In this case it makes changes for Run 2. -from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import * +from EventFilter.SiPixelRawToDigi.siPixelDigis_cff import * from EventFilter.SiStripRawToDigi.SiStripDigis_cfi import * @@ -49,7 +49,7 @@ from EventFilter.CTPPSRawToDigi.ctppsRawToDigi_cff import * RawToDigiTask = cms.Task(L1TRawToDigiTask, - siPixelDigis, + siPixelDigisTask, siStripDigis, ecalDigis, ecalPreshowerDigis, @@ -64,15 +64,14 @@ ) RawToDigi = cms.Sequence(RawToDigiTask) -RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigis, siStripDigis]) +RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigisTask, siStripDigis]) RawToDigi_noTk = cms.Sequence(RawToDigiTask_noTk) -RawToDigiTask_pixelOnly = cms.Task(siPixelDigis) +RawToDigiTask_pixelOnly = cms.Task(siPixelDigisTask) RawToDigi_pixelOnly = cms.Sequence(RawToDigiTask_pixelOnly) scalersRawToDigi.scalersInputTag = 'rawDataCollector' -from Configuration.ProcessModifiers.gpu_cff import gpu -(~gpu).toModify(siPixelDigis, InputLabel = 'rawDataCollector') +siPixelDigis.cpu.InputLabel = 'rawDataCollector' #false by default anyways ecalDigis.DoRegional = False ecalDigis.InputLabel = 'rawDataCollector' ecalPreshowerDigis.sourceTag = 'rawDataCollector' diff --git a/DataFormats/SiPixelDigi/interface/PixelErrors.h b/DataFormats/SiPixelDigi/interface/PixelErrors.h new file mode 100644 index 0000000000000..5231b7d1f372a --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/PixelErrors.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_SiPixelDigi_PixelErrors_h +#define DataFormats_SiPixelDigi_PixelErrors_h + +#include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h" +#include "FWCore/Utilities/interface/typedefs.h" + +#include +#include + +// Better ideas for the placement of these? 
+
+struct PixelErrorCompact {
+  uint32_t rawId;
+  uint32_t word;
+  unsigned char errorType;
+  unsigned char fedId;
+};
+
+using PixelFormatterErrors = std::map<cms_uint32_t, std::vector<SiPixelRawDataError>>;
+
+#endif
diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h
new file mode 100644
index 0000000000000..1a7710b4fb3c6
--- /dev/null
+++ b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h
@@ -0,0 +1,28 @@
+#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h
+#define DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h
+
+#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
+
+#include <cstdint>
+#include <vector>
+
+class SiPixelDigiErrorsSoA {
+public:
+  SiPixelDigiErrorsSoA() = default;
+  explicit SiPixelDigiErrorsSoA(size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err);
+  ~SiPixelDigiErrorsSoA() = default;
+
+  auto size() const { return error_.size(); }
+
+  const PixelFormatterErrors *formatterErrors() const { return formatterErrors_; }
+
+  const PixelErrorCompact& error(size_t i) const { return error_[i]; }
+
+  const std::vector<PixelErrorCompact>& errorVector() const { return error_; }
+
+private:
+  std::vector<PixelErrorCompact> error_;
+  const PixelFormatterErrors *formatterErrors_ = nullptr;
+};
+
+#endif
diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h
new file mode 100644
index 0000000000000..df249a3790cd2
--- /dev/null
+++ b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h
@@ -0,0 +1,32 @@
+#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
+#define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h
+
+#include <cstdint>
+#include <vector>
+
+class SiPixelDigisSoA {
+public:
+  SiPixelDigisSoA() = default;
+  explicit SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus);
+  ~SiPixelDigisSoA() = default;
+
+  auto size() const { return pdigi_.size(); }
+
+  uint32_t pdigi(size_t i) const { return pdigi_[i]; }
+  uint32_t rawIdArr(size_t i) const { return rawIdArr_[i]; }
+  uint16_t adc(size_t i) const { return adc_[i]; }
+  int32_t clus(size_t i) const { return clus_[i]; }
+
+  const std::vector<uint32_t>& pdigiVector() const { return pdigi_; }
+  const std::vector<uint32_t>& rawIdArrVector() const { return rawIdArr_; }
+  const std::vector<uint16_t>& adcVector() const { return adc_; }
+  const std::vector<int32_t>& clusVector() const { return clus_; }
+
+private:
+  std::vector<uint32_t> pdigi_;
+  std::vector<uint32_t> rawIdArr_;
+  std::vector<uint16_t> adc_;
+  std::vector<int32_t> clus_;
+};
+
+#endif
diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc
new file mode 100644
index 0000000000000..ef2b4581fc46e
--- /dev/null
+++ b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc
@@ -0,0 +1,10 @@
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h"
+
+#include <cassert>
+
+SiPixelDigiErrorsSoA::SiPixelDigiErrorsSoA(size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err):
+  error_(error, error+nErrors),
+  formatterErrors_(err)
+{
+  assert(error_.size() == nErrors);
+}
diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc
new file mode 100644
index 0000000000000..ebc8ba2055f78
--- /dev/null
+++ b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc
@@ -0,0 +1,12 @@
+#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h"
+
+#include <cassert>
+
+SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc,
const int32_t *clus): + pdigi_(pdigi, pdigi+nDigis), + rawIdArr_(rawIdArr, rawIdArr+nDigis), + adc_(adc, adc+nDigis), + clus_(clus, clus+nDigis) +{ + assert(pdigi_.size() == nDigis); +} diff --git a/DataFormats/SiPixelDigi/src/classes.h b/DataFormats/SiPixelDigi/src/classes.h index 0c6a09d852959..256ca41ad1867 100644 --- a/DataFormats/SiPixelDigi/src/classes.h +++ b/DataFormats/SiPixelDigi/src/classes.h @@ -5,6 +5,8 @@ #include "DataFormats/SiPixelDigi/interface/PixelDigiCollection.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigi.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigiError.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" #include "DataFormats/Common/interface/Wrapper.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" diff --git a/DataFormats/SiPixelDigi/src/classes_def.xml b/DataFormats/SiPixelDigi/src/classes_def.xml index de7779a5c00ea..8cabbd3f3f06e 100755 --- a/DataFormats/SiPixelDigi/src/classes_def.xml +++ b/DataFormats/SiPixelDigi/src/classes_def.xml @@ -49,4 +49,10 @@ + + + + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml index f92aa68373927..4d2b5ebf45542 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml @@ -1,4 +1,7 @@ + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc new file mode 100644 index 0000000000000..9e998b92fc403 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc @@ -0,0 +1,183 @@ +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetIdCollection.h" +#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include + +class SiPixelDigiErrorsFromSoA: public edm::stream::EDProducer<> { +public: + explicit SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT digiErrorSoAGetToken_; + + edm::EDPutTokenT> errorPutToken_; + edm::EDPutTokenT tkErrorPutToken_; + edm::EDPutTokenT userErrorPutToken_; + edm::EDPutTokenT> disabledChannelPutToken_; + + 
edm::ESWatcher cablingWatcher_; + std::unique_ptr cabling_; + const std::string cablingMapLabel_; + + const std::vector tkerrorlist_; + const std::vector usererrorlist_; + + const bool usePhase1_; +}; + +SiPixelDigiErrorsFromSoA::SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig): + digiErrorSoAGetToken_{consumes(iConfig.getParameter("digiErrorSoASrc"))}, + errorPutToken_{produces>()}, + tkErrorPutToken_{produces()}, + userErrorPutToken_{produces("UserErrorModules")}, + disabledChannelPutToken_{produces>()}, + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + tkerrorlist_(iConfig.getParameter>("ErrorList")), + usererrorlist_(iConfig.getParameter>("UserErrorList")), + usePhase1_(iConfig.getParameter ("UsePhase1")) +{} + +void SiPixelDigiErrorsFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("digiErrorSoASrc", edm::InputTag("siPixelDigiErrorsSoA")); + desc.add("CablingMapLabel","")->setComment("CablingMap label"); + desc.add("UsePhase1",false)->setComment("## Use phase1"); + desc.add >("ErrorList", std::vector{29})->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); + desc.add >("UserErrorList", std::vector{40})->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // pack errors into collection + + // initialize cabling map or update if necessary + if (cablingWatcher_.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); + cabling_ = cablingMap->cablingTree(); + LogDebug("map version:")<< cabling_->version(); + } + + const auto& digiErrors = iEvent.get(digiErrorSoAGetToken_); + + + edm::DetSetVector errorcollection{}; + DetIdCollection tkerror_detidcollection{}; + DetIdCollection usererror_detidcollection{}; + edmNew::DetSetVector disabled_channelcollection{}; + + PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0 + const PixelDataFormatter::Errors *formatterErrors = digiErrors.formatterErrors(); + assert(formatterErrors != nullptr); + auto errors = *formatterErrors; // make a copy + PixelDataFormatter::DetErrors nodeterrors; + + auto size = digiErrors.size(); + for (auto i = 0U; i < size; i++) { + PixelErrorCompact err = digiErrors.error(i); + if (err.errorType != 0) { + SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); + errors[err.rawId].push_back(error); + } + } + + constexpr uint32_t dummydetid = 0xffffffff; + typedef PixelDataFormatter::Errors::iterator IE; + for (IE is = errors.begin(); is != errors.end(); is++) { + + uint32_t errordetid = is->first; + if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed + nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); + } + else { + edm::DetSet& errorDetSet = errorcollection.find_or_insert(errordetid); + errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); + // Fill detid of the detectors where there is error AND the error number is listed + // in the configurable error list in the job option cfi. 
+ // Code needs to be here, because there can be a set of errors for each + // entry in the for loop over PixelDataFormatter::Errors + + std::vector disabledChannelsDetSet; + + for (auto const& aPixelError : errorDetSet) { + // For the time being, we extend the error handling functionality with ErrorType 25 + // In the future, we should sort out how the usage of tkerrorlist can be generalized + if (aPixelError.getType() == 25) { + int fedId = aPixelError.getFedId(); + const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); + if (fed) { + cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); + const sipixelobjects::PixelFEDLink* link = fed->link(linkId); + if (link) { + // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it + // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme + PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; + for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { + const sipixelobjects::PixelROC * roc = link->roc(iRoc); + if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); + if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); + } + if (ch.roc_first& errorDetSet = errorcollection.find_or_insert(dummydetid); + errorDetSet.data = nodeterrors; + + iEvent.emplace(errorPutToken_, std::move(errorcollection)); + iEvent.emplace(tkErrorPutToken_, std::move(tkerror_detidcollection)); + iEvent.emplace(userErrorPutToken_, std::move(usererror_detidcollection)); + iEvent.emplace(disabledChannelPutToken_, std::move(disabled_channelcollection)); +} + +DEFINE_FWK_MODULE(SiPixelDigiErrorsFromSoA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc new file mode 100644 index 0000000000000..d47542528ed86 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -0,0 +1,75 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +class SiPixelDigiErrorsSoAFromCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiErrorGetToken_; + edm::EDPutTokenT digiErrorPutToken_; + + cudautils::host::unique_ptr data_; + GPU::SimpleVector error_; + const PixelFormatterErrors *formatterErrors_ = nullptr; +}; + +SiPixelDigiErrorsSoAFromCUDA::SiPixelDigiErrorsSoAFromCUDA(const 
edm::ParameterSet& iConfig): + digiErrorGetToken_(consumes>(iConfig.getParameter("src"))), + digiErrorPutToken_(produces()) +{} + +void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_); + + auto tmp = gpuDigiErrors.dataErrorToHostAsync(ctx.stream()); + error_ = std::move(tmp.first); + data_ = std::move(tmp.second); + formatterErrors_ = &(gpuDigiErrors.formatterErrors()); +} + +void SiPixelDigiErrorsSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... + iEvent.emplace(digiErrorPutToken_, error_.size(), error_.data(), formatterErrors_); + + error_ = GPU::make_SimpleVector(0, nullptr); + data_.reset(); + formatterErrors_ = nullptr; +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigiErrorsSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc new file mode 100644 index 0000000000000..068701f0bcf07 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -0,0 +1,81 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + + +class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigisSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiGetToken_; + edm::EDPutTokenT digiPutToken_; + + cudautils::host::unique_ptr 
pdigi_; + cudautils::host::unique_ptr rawIdArr_; + cudautils::host::unique_ptr adc_; + cudautils::host::unique_ptr< int32_t[]> clus_; + + int nDigis_; +}; + +SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig): + digiGetToken_(consumes>(iConfig.getParameter("src"))), + digiPutToken_(produces()) +{} + +void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + + nDigis_ = gpuDigis.nDigis(); + pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); + rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); + adc_ = gpuDigis.adcToHostAsync(ctx.stream()); + clus_ = gpuDigis.clusToHostAsync(ctx.stream()); +} + +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... 
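+  // (The copy happens in the SiPixelDigisSoA constructor, which builds
+  // std::vectors from these pinned buffers; see
+  // DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc.)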
+ iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); + + pdigi_.reset(); + rawIdArr_.reset(); + adc_.reset(); + clus_.reset(); +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigisSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py index c2479af1f60bd..50c8f0fcabd3c 100644 --- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py +++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py @@ -1,14 +1,24 @@ import FWCore.ParameterSet.Config as cms -import EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi +from EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi import siPixelRawToDigi as _siPixelRawToDigi -siPixelDigis = EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi.siPixelRawToDigi.clone() +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelDigis = SwitchProducerCUDA( + cpu = _siPixelRawToDigi.clone() +) from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel -phase1Pixel.toModify(siPixelDigis, UsePhase1=True) - -import RecoLocalTracker.SiPixelClusterizer.siPixelDigiHeterogeneousConverter_cfi -_siPixelDigis_gpu = RecoLocalTracker.SiPixelClusterizer.siPixelDigiHeterogeneousConverter_cfi.siPixelDigiHeterogeneousConverter.clone() -_siPixelDigis_gpu.includeErrors = cms.bool(True) +phase1Pixel.toModify(siPixelDigis.cpu, UsePhase1=True) from Configuration.ProcessModifiers.gpu_cff import gpu -gpu.toReplaceWith(siPixelDigis, _siPixelDigis_gpu) +gpu.toModify(siPixelDigis, + cuda = cms.EDAlias( + siPixelDigiErrors = cms.VPSet( + cms.PSet(type = cms.string("DetIdedmEDCollection")), + cms.PSet(type = cms.string("SiPixelRawDataErroredmDetSetVector")), + cms.PSet(type = cms.string("PixelFEDChanneledmNewDetSetVector")) + ), + siPixelDigisClustersPreSplitting = cms.VPSet( + cms.PSet(type = cms.string("PixelDigiedmDetSetVector")) + ) + ) +) diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py new file mode 100644 index 0000000000000..31ba8596bddc6 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py @@ -0,0 +1,30 @@ +import FWCore.ParameterSet.Config as cms + +from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import siPixelDigis +from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsFromSoA_cfi import siPixelDigiErrorsFromSoA as _siPixelDigiErrorsFromSoA + +siPixelDigisTask = cms.Task(siPixelDigis) + +siPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigiErrorsSoA = _siPixelDigiErrorsSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone() + +from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel +phase1Pixel.toModify(siPixelDigiErrors, UsePhase1=True) + +siPixelDigisTaskCUDA = cms.Task( + siPixelDigisSoA, + siPixelDigiErrorsSoA, + siPixelDigiErrors +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelDigisTask_gpu = siPixelDigisTask.copy() +_siPixelDigisTask_gpu.add(siPixelDigisTaskCUDA) +gpu.toReplaceWith(siPixelDigisTask, _siPixelDigisTask_gpu) diff --git 
a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml index ba6b35c6d0ce7..dc0066701ece3 100644 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -1,6 +1,8 @@ + + diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md new file mode 100644 index 0000000000000..a2bdb67cfba11 --- /dev/null +++ b/HeterogeneousCore/CUDACore/README.md @@ -0,0 +1,636 @@ +# CUDA algorithms in CMSSW + +## Outline + +* [Introduction](#introduction) + * [Design goals](#design-goals) + * [Overall guidelines](#overall-guidelines) +* [Sub-packages](#sub-packages) +* [Examples](#examples) + * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output) + * [Producer with CUDA input](#producer-with-cuda-output) + * [Producer with CUDA output](#producer-with-cuda-input) + * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork) + * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork) + * [Configuration](#configuration) + * [GPU-only configuration](#gpu-only-configuration) + * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules) +* [More details](#more-details) + * [Device choice](#device-choice) + * [Data model](#data-model) + * [CUDA EDProducer](#cuda-edproducer) + * [Class declaration](#class-declaration) + * [Memory allocation](#memory-allocation) + * [Caching allocator](#caching-allocator) + * [CUDA API](#cuda-api) + * [Setting the current device](#setting-the-current-device) + * [Getting input](#getting-input) + * [Calling the CUDA kernels](#calling-the-cuda-kernels) + * [Putting output](#putting-output) + * [`ExternalWork` extension](#externalwork-extension) + * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu) + * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams) + + +## Introduction + +This page documents the CUDA integration within CMSSW + +### Design goals + +1. Provide a mechanism for a chain of modules to share a resource + * Resource can be e.g. CUDA device memory or a CUDA stream +2. Minimize data movements between the CPU and the device +3. Support multiple devices +4. Allow the same job configuration to be used on all hardware combinations + +### Overall guidelines + +1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. + * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc. + * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event + * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator)) + * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` + * With the latter the `assert()` calls in CUDA code are disabled by + default, but can be enabled by defining a `GPU_DEBUG` macro + (before the aforementioned include) +2. Synchronization needs should be fulfilled with + [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) + extension to EDProducers + * `ExternalWork` can be used to replace one synchronization point + (e.g. between device kernels and copying a known amount of data + back to CPU). 
+ * For further synchronization points (e.g. copying data whose + amount is known only at the device side), split the work to + multiple `ExternalWork` producers. This approach has the added + benefit that e.g. data transfers to CPU become on-demand automatically + * A general breakdown of the possible steps: + * Convert input legacy CPU data format to CPU SoA + * Transfer input CPU SoA to GPU + * Launch kernels + * Transfer the number of output elements to CPU + * Transfer the output data from GPU to CPU SoA + * Convert the output SoA to legacy CPU data formats +3. Within `acquire()`/`produce()`, the current CUDA device is set + implicitly and the CUDA stream is provided by the system (with + `CUDAScopedContext`) + * It is strongly recommended to use the provided CUDA stream for all operations + * If that is not feasible for some reason, the provided CUDA + stream must synchronize with the work queued on other CUDA + streams (with CUDA events and `cudaStreamWaitEvent()`) +4. Outside of `acquire()`/`produce()`, CUDA API functions may be + called only if `CUDAService::enabled()` returns `true`. + * With point 3 it follows that in these cases multiple devices have + to be dealt with explicitly, as well as CUDA streams + +## Sub-packages +* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components +* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations +* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data + +## Examples + +### Isolated producer (no CUDA input nor output) + +```cpp +class IsolatedProducerCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + IsolatedProducerGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT outputToken_; +}; +... +void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContext::stream() + gpuAlgo_.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +### Producer with CUDA output + +```cpp +class ProducerOutputCUDA: public edm::stream::EDProducer { +public: + ... 
+
+### Producer with CUDA output
+
+```cpp
+class ProducerOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  ...
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  ...
+private:
+  ...
+  ProducerOutputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<InputData> inputToken_;
+  edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
+  CUDAContextToken ctxTmp_;
+};
+...
+void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  // Sets the current device and creates a CUDA stream
+  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  auto const& inputData = iEvent.get(inputToken_);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream()
+  gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+  // Passes the current device and CUDA stream to produce()
+  // Feels a bit silly, and will hopefully get improved in the future
+  ctxTmp_ = ctx.toToken();
+
+  // Destructor of ctx queues a callback to the CUDA stream notifying
+  // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // Sets again the current device, uses the CUDA stream created in acquire()
+  CUDAScopedContext ctx{std::move(ctxTmp_)};
+
+  // Now getResult() returns data in GPU memory that is passed to the
+  // constructor of OutputData. CUDAScopedContext::emplace() wraps the
+  // OutputData to CUDAProduct<OutputData>. CUDAProduct<T> stores also
+  // the current device and the CUDA stream since those will be needed
+  // on the consumer side.
+  ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
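+
+The constructor is elided above; a minimal sketch of how the tokens
+could be registered there (the `"src"` parameter name is illustrative)
+is:
+
+```cpp
+// Sketch of the constructor: note that the put token is registered
+// for the wrapped product type CUDAProduct<OutputData>.
+ProducerOutputCUDA::ProducerOutputCUDA(edm::ParameterSet const& iConfig):
+  inputToken_{consumes<InputData>(iConfig.getParameter<edm::InputTag>("src"))},
+  outputToken_{produces<CUDAProduct<OutputData>>()}
+{}
+```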
+
+### Producer with CUDA input
+
+```cpp
+class ProducerInputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  ...
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  ...
+private:
+  ...
+  ProducerInputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
+  edm::EDGetTokenT<CUDAProduct<OtherInputData>> otherInputToken_;
+  edm::EDPutTokenT<OutputData> outputToken_;
+};
+...
+void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+  // Set the current device to the same that was used to produce
+  // InputData, and also use the same CUDA stream
+  CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+
+  // Alternatively a new CUDA stream can be created here. This is for
+  // a case where there are two (or more) consumers of
+  // CUDAProduct<InputData> whose work is independent and thus can be
+  // run in parallel.
+  //CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  // Grab the real input data. Checks that the input data is on the
+  // current device. If the input data was produced in a different CUDA
+  // stream than the CUDAScopedContext holds, create an inter-stream
+  // synchronization point with CUDA event and cudaStreamWaitEvent()
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  // Input data from another producer
+  auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
+  // or
+  auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream()
+  gpuAlgo_.makeAsync(inputData, otherInputData, ctx.stream());
+
+  // Destructor of ctx queues a callback to the CUDA stream notifying
+  // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // Real life is likely more complex than this simple example. Here
+  // getResult() returns some data in CPU memory that is passed
+  // directly to the OutputData constructor.
+  iEvent.emplace(outputToken_, gpuAlgo_.getResult());
+}
+```
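+
+As a variant sketch of the alternative mentioned in the comments above,
+a second, independent consumer of the same `CUDAProduct<InputData>`
+could create its own CUDA stream so that the two consumers can run in
+parallel (the module and member names are hypothetical):
+
+```cpp
+// Hypothetical second consumer of the same CUDAProduct<InputData>.
+void IndependentConsumerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup,
+                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  // New CUDA stream, independent of the one InputData was produced with
+  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+  // get() inserts the necessary inter-stream synchronization, because
+  // the product was produced in a different CUDA stream
+  auto const& inputData = ctx.get(iEvent, inputToken_);
+
+  gpuAlgo_.makeAsync(inputData, ctx.stream());
+}
+```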
+
+### Producer with CUDA input and output (with ExternalWork)
+
+```cpp
+class ProducerInputOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  ...
+  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+  ...
+private:
+  ...
+  ProducerInputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
+  edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
+  CUDAContextToken ctxTmp_;
+};
+...
+void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+  // Set the current device to the same that was used to produce
+  // InputData, and also use the same CUDA stream
+  CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+
+  // Grab the real input data. Checks that the input data is on the
+  // current device. If the input data was produced in a different CUDA
+  // stream than the CUDAScopedContext holds, create an inter-stream
+  // synchronization point with CUDA event and cudaStreamWaitEvent()
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream()
+  gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+  // Passes the current device and CUDA stream to produce()
+  // Feels a bit silly, and will hopefully get improved in the future
+  ctxTmp_ = ctx.toToken();
+
+  // Destructor of ctx queues a callback to the CUDA stream notifying
+  // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // Sets again the current device, uses the CUDA stream created in acquire()
+  CUDAScopedContext ctx{std::move(ctxTmp_)};
+
+  // Now getResult() returns data in GPU memory that is passed to the
+  // constructor of OutputData. CUDAScopedContext::emplace() wraps the
+  // OutputData to CUDAProduct<OutputData>. CUDAProduct<T> stores also
+  // the current device and the CUDA stream since those will be needed
+  // on the consumer side.
+  ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+### Producer with CUDA input and output (without ExternalWork)
+
+If the producer does not need to transfer anything back to CPU (like
+the number of output elements), the `ExternalWork` extension is not
+needed as there is no need to synchronize.
+
+```cpp
+class ProducerInputOutputCUDA: public edm::global::EDProducer<> {
+public:
+  ...
+  void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const override;
+  ...
+private:
+  ...
+  ProducerInputGPUAlgo gpuAlgo_;
+  edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
+  edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
+};
+...
+void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const {
+  CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+  // Set the current device to the same that was used to produce
+  // InputData, and also use the same CUDA stream
+  CUDAScopedContext ctx{inputDataWrapped};
+
+  // Grab the real input data. Checks that the input data is on the
+  // current device. If the input data was produced in a different CUDA
+  // stream than the CUDAScopedContext holds, create an inter-stream
+  // synchronization point with CUDA event and cudaStreamWaitEvent()
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  // Queues asynchronous data transfers and kernels to the CUDA stream
+  // returned by CUDAScopedContext::stream(). Here makeAsync() also
+  // returns data in GPU memory that is passed to the constructor of
+  // OutputData. CUDAScopedContext::emplace() wraps the OutputData to
+  // CUDAProduct<OutputData>. CUDAProduct<T> stores also the current
+  // device and the CUDA stream since those will be needed on the
+  // consumer side.
+  ctx.emplace(iEvent, outputToken_, gpuAlgo_.makeAsync(inputData, ctx.stream()));
+
+  // Without ExternalWork there is no waitingTaskHolder to notify, so
+  // the destructor of ctx does not queue a callback here
+}
+```
+
+### Configuration
+
+#### GPU-only configuration
+
+For a GPU-only configuration there is nothing special to be done, just
+construct the Paths/Sequences/Tasks from the GPU modules.
+
+#### Automatic switching between CPU and GPU modules
+
+The `SwitchProducer` mechanism can be used to switch automatically
+between CPU and GPU modules based on the availability of GPUs on the
+machine where the configuration is done. The framework decides at the
+beginning of the job which of the modules to run for a given module
+label.
+
+The framework requires that the modules in the switch produce the
+same types of output products (the closer the actual results are the
+better, but the framework cannot enforce that). This means that for a
+chain of GPU modules, it is the module that transforms the SoA data
+format back to the legacy data formats (possibly, but not necessarily,
+transferring the SoA data from GPU to CPU) that should be switched
+with the legacy CPU module. The rest of the GPU modules should be
+placed in a `Task`, in which case the framework runs them only if their
+output is needed by another module.
+
+```python
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+process.foo = SwitchProducerCUDA(
+    cpu = cms.EDProducer("FooProducer"), # legacy CPU
+    cuda = cms.EDProducer("FooProducerFromCUDA",
+        src = cms.InputTag("fooCUDA")
+    )
+)
+process.fooCUDA = cms.EDProducer("FooProducerCUDA")
+
+process.fooTaskCUDA = cms.Task(process.fooCUDA)
+process.fooTask = cms.Task(
+    process.foo,
+    process.fooTaskCUDA
+)
+```
+
+For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py).
+
+
+
+
+## More details
+
+### Device choice
+
+As discussed above, with `SwitchProducer` the choice between CPU and
+GPU modules is done at the beginning of the job.
+
+For a multi-GPU setup the device is chosen in the first CUDA module in a
+chain of modules by one of the constructors of `CUDAScopedContext`
+```cpp
+CUDAScopedContext ctx{iEvent.streamID()};
+```
+As the choice is currently a static assignment of EDM streams to
+devices, the EDM stream ID is needed. The logic will likely evolve in
+the future to be more dynamic, and likely the device choice has to be
+made for the full event.
+
+### Data model
+
+The "GPU data product" should be a class/struct containing smart
+pointer(s) to device data (see [Memory allocation](#memory-allocation)).
+When putting the data to the event, the data is wrapped to the
+`CUDAProduct<T>` template, which holds
+* the GPU data product
+  * must be movable, but no other restrictions
+* the current device where the data was produced, and the CUDA stream the data was produced with
+* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams)
+
+Note that the `CUDAProduct<T>` wrapper can be constructed only with
+`CUDAScopedContext::wrap()`, and the data `T` can be obtained from it
+only with `CUDAScopedContext::get()`, as described further below. When
+putting the data product directly to `edm::Event`, also
+`CUDAScopedContext::emplace()` can be used.
+
+The GPU data products that depend on the CUDA runtime should be placed
+under the `CUDADataFormats` package, using the same name for the
+sub-package that would be used in `DataFormats`. Everything else, e.g.
+SoA for CPU, should go under `DataFormats` as usual.
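+
+A minimal sketch of such a GPU data product, assuming illustrative
+class and member names, could look like the following:
+
+```cpp
+// Illustrative GPU data product: only smart pointers to device
+// memory, movable, and no CUDA API calls in the interface.
+class GPUClusters {
+public:
+  GPUClusters() = default;
+  GPUClusters(cudautils::device::unique_ptr<float[]> x, uint32_t nClusters):
+    x_(std::move(x)), nClusters_(nClusters) {}
+
+  float const* x() const { return x_.get(); }
+  uint32_t nClusters() const { return nClusters_; }
+
+private:
+  cudautils::device::unique_ptr<float[]> x_; // device memory
+  uint32_t nClusters_ = 0;
+};
+```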
+
+### CUDA EDProducer
+
+#### Class declaration
+
+The CUDA producers are normal EDProducers. The `ExternalWork`
+extension should be used if a synchronization between the GPU and CPU
+is needed, e.g. when transferring data from GPU to CPU.
+
+#### Memory allocation
+
+##### Caching allocator
+
+The memory allocations should be done dynamically with `CUDAService`
+```cpp
+edm::Service<CUDAService> cs;
+cudautils::device::unique_ptr<float[]> device_buffer = cs->make_device_unique<float[]>(50, cudaStream);
+cudautils::host::unique_ptr<float[]> host_buffer = cs->make_host_unique<float[]>(50, cudaStream);
+```
+
+in the `acquire()` and `produce()` functions. The same
+`cuda::stream_t<>` object that is used for transfers and kernels
+should be passed to the allocator.
+
+The allocator is based on `cub::CachingDeviceAllocator`. The memory is
+guaranteed to be reserved
+* for the host: up to the destructor of the `unique_ptr`
+* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished
+
+##### CUDA API
+
+The `cudaMalloc()` etc. may be used outside of the event loop, but that
+should be limited to only relatively small allocations in order to
+allow as much re-use of device memory as possible.
+
+If really needed, `cudaMalloc()` etc. may be used also within the
+event loop, but then the cost of allocation and implicit
+synchronization should be explicitly amortized e.g. by caching.
+
+#### Setting the current device
+
+A CUDA producer should construct `CUDAScopedContext` in `acquire()`
+(`produce()` if not using `ExternalWork`) either with `edm::StreamID`,
+or with a `CUDAProduct<T>` read as an input.
+
+```cpp
+// From edm::StreamID
+CUDAScopedContext ctx{iEvent.streamID()};
+
+// From CUDAProduct<T>
+CUDAProduct<GPUClusters> const& cclus = iEvent.get(srcToken_);
+CUDAScopedContext ctx{cclus};
+```
+
+`CUDAScopedContext` works in the RAII way and does the following
+* Sets the current device for the current scope
+  - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream
+  - If constructed from the `CUDAProduct<T>`, uses the same device and CUDA stream as was used to produce the `CUDAProduct<T>`
+* Gives access to the CUDA stream the algorithm should use to queue asynchronous work
+* Calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary
+* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams)
+* Needed to get/put `CUDAProduct<T>` from/to the event
+
+In case of multiple input products, from possibly different CUDA
+streams and/or CUDA devices, this approach gives the developer full
+control over which of them the kernels of the algorithm are run.
+
+#### Getting input
+
+The real product (`T`) can be obtained from `CUDAProduct<T>` only with
+the help of `CUDAScopedContext`.
+
+```cpp
+// From CUDAProduct<T>
+CUDAProduct<GPUClusters> const& cclus = iEvent.get(srcToken_);
+GPUClusters const& clus = ctx.get(cclus);
+
+// Directly from Event
+GPUClusters const& clus = ctx.get(iEvent, srcToken_);
+```
+
+This step is needed to
+* check that the data are on the same CUDA device
+  * if not, throw an exception (with unified memory could prefetch instead)
+* if the CUDA streams are different, synchronize between them
+
+#### Calling the CUDA kernels
+
+It is usually best to wrap the CUDA kernel calls to a separate class,
+and then call methods of that class from the EDProducer. The only
+requirement is that the CUDA stream where to queue the operations
+should be the one from the `CUDAScopedContext`
+
+```cpp
+gpuAlgo.makeClustersAsync(..., ctx.stream());
+```
+
+If necessary, different CUDA streams may be used internally, but they
+should be made to synchronize with the provided CUDA stream with
+CUDA events and `cudaStreamWaitEvent()`, as sketched below.
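+
+A minimal sketch of such an internal synchronization with the raw CUDA
+API; `kernelA`, `kernelB` and `internalStream` are hypothetical
+placeholders:
+
+```cpp
+// Make the provided stream wait for work queued to an internal stream.
+cudaEvent_t ev;
+cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
+kernelA<<<blocks, threadsPerBlock, 0, internalStream>>>(/* ... */);
+cudaEventRecord(ev, internalStream);
+// All work queued to the provided stream after this point waits for kernelA
+cudaStreamWaitEvent(ctx.stream().id(), ev, 0);
+kernelB<<<blocks, threadsPerBlock, 0, ctx.stream().id()>>>(/* ... */);
+cudaEventDestroy(ev); // safe: the wait has already been enqueued
+```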
+
+#### Putting output
+
+The GPU data needs to be wrapped to the `CUDAProduct<T>` template with
+`CUDAScopedContext::wrap()` or `CUDAScopedContext::emplace()`
+
+```cpp
+GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream());
+std::unique_ptr<CUDAProduct<GPUClusters>> ret = ctx.wrap(clusters);
+iEvent.put(std::move(ret));
+
+// or with one line
+iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream())));
+
+// or avoid one unique_ptr with emplace
+edm::EDPutTokenT<CUDAProduct<GPUClusters>> putToken_ = produces<CUDAProduct<GPUClusters>>(); // in constructor
+...
+ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream()));
+```
+
+This step is needed to
+* store the current device and CUDA stream into `CUDAProduct<T>`
+* record the CUDA event needed for CUDA stream synchronization
+
+#### `ExternalWork` extension
+
+Everything above works both with and without `ExternalWork`.
+
+Without `ExternalWork` the `EDProducer`s act similarly to TBB
+flowgraph's "streaming node". In other words, they just queue more
+asynchronous work to the CUDA stream in their `produce()`.
+
+The `ExternalWork` is needed when one would otherwise call
+`cudaStreamSynchronize()`, for example when transferring something to
+CPU that is needed for downstream DQM, or when queueing more
+asynchronous work. With `ExternalWork` an `acquire()` method needs to
+be implemented that gets an `edm::WaitingTaskWithArenaHolder`
+parameter. The `edm::WaitingTaskWithArenaHolder` should then be passed
+to the constructor of `CUDAScopedContext` along the lines of the
+following example
+
+```cpp
+void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAProduct<InputData> const& cclus = iEvent.get(token_);
+  CUDAScopedContext ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well
+  ...
+```
+
+When constructed this way, `CUDAScopedContext` registers a callback
+function to the CUDA stream in its destructor to call
+`waitingTaskHolder.doneWaiting()`.
+
+A GPU->GPU producer needs a `CUDAScopedContext` also in its
+`produce()`. Currently the best way is to store the state of
+`CUDAScopedContext` to a `CUDAContextToken` member variable:
+
+```cpp
+class FooProducerCUDA ... {
+  ...
+  CUDAContextToken ctxTmp_;
+};
+
+void acquire(...) {
+  ...
+  ctxTmp_ = ctx.toToken();
+}
+
+void produce(...) {
+  ...
+  CUDAScopedContext ctx{std::move(ctxTmp_)};
+}
+```
+
+Ideas for improvements are welcome.
+
+
+#### Transferring GPU data to CPU
+
+The GPU->CPU data transfer needs synchronization to ensure that the
+CPU memory has all the data before it is put into the event. This
+means the `ExternalWork` needs to be used along the following lines:
+* In `acquire()`
+  * (allocate CPU memory buffers)
+  * Queue all GPU->CPU transfers asynchronously
+* In `produce()`
+  * If needed, read additional CPU products (e.g. from `edm::Ref`s)
+  * Reformat data back to legacy data formats
+  * Note: `CUDAScopedContext` is **not** needed in `produce()`
+
+#### Synchronizing between CUDA streams
+
+In case the producer needs input data that were produced in two (or
+more) CUDA streams, these streams have to be synchronized. Here this
+synchronization is achieved with CUDA events.
+
+Each `CUDAProduct<T>` contains also a CUDA event object. The call to
+`CUDAScopedContext::wrap()` will *record* the event in the CUDA stream.
+This means that when all work queued to the CUDA stream up to that
+point has been finished, the CUDA event becomes *occurred*. Then, in
+`CUDAScopedContext::get()`, if the `CUDAProduct<T>` to get from has a
+different CUDA stream than the `CUDAScopedContext`,
+`cudaStreamWaitEvent(stream, event)` is called. This means that all
+subsequent work queued to the CUDA stream will wait for the CUDA event
+to become occurred. Therefore this subsequent work can assume that the
+CUDA product being obtained exists.
+
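+In code, this logic condenses to roughly the following; it is a
+simplified sketch of what `CUDAScopedContext::get()` does via
+`CUDAScopedContext::synchronizeStreams()`:
+
+```cpp
+// Simplified sketch of the synchronization done in CUDAScopedContext::get(data)
+if (data.device() != currentDevice_) {
+  throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
+}
+if (data.stream().id() != stream_->id() && !data.event().has_occurred()) {
+  // Subsequent work queued to this stream waits until the product is ready
+  cudaStreamWaitEvent(stream_->id(), data.event().id(), 0);
+}
+```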
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h
new file mode 100644
index 0000000000000..1a599132d13f1
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h
@@ -0,0 +1,38 @@
+#ifndef HeterogeneousCore_CUDACore_CUDAContextToken_h
+#define HeterogeneousCore_CUDACore_CUDAContextToken_h
+
+#include <cuda/api_wrappers.h>
+
+/**
+ * The purpose of this class is to deliver the device and CUDA stream
+ * information from ExternalWork's acquire() to produce() via a
+ * member/StreamCache variable.
+ */
+class CUDAContextToken {
+public:
+  CUDAContextToken() = default;
+  ~CUDAContextToken() = default;
+
+  CUDAContextToken(const CUDAContextToken&) = delete;
+  CUDAContextToken& operator=(const CUDAContextToken&) = delete;
+  CUDAContextToken(CUDAContextToken&&) = default;
+  CUDAContextToken& operator=(CUDAContextToken&& other) = default;
+
+private:
+  friend class CUDAScopedContext;
+
+  explicit CUDAContextToken(int device, std::shared_ptr<cuda::stream_t<>> stream):
+    stream_(std::move(stream)),
+    device_(device)
+  {}
+
+  int device() { return device_; }
+  std::shared_ptr<cuda::stream_t<>>&& streamPtr() {
+    return std::move(stream_);
+  }
+
+  std::shared_ptr<cuda::stream_t<>> stream_;
+  int device_;
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
new file mode 100644
index 0000000000000..ef87d017373f8
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
@@ -0,0 +1,109 @@
+#ifndef HeterogeneousCore_CUDACore_CUDAScopedContext_h
+#define HeterogeneousCore_CUDACore_CUDAScopedContext_h
+
+#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/EDPutToken.h"
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
+
+#include <cuda/api_wrappers.h>
+
+#include <optional>
+
+namespace cudatest {
+  class TestCUDAScopedContext;
+}
+
+/**
+ * The aim of this class is to do necessary per-event "initialization":
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
+class CUDAScopedContext {
+public:
+  explicit CUDAScopedContext(edm::StreamID streamID);
+
+  explicit CUDAScopedContext(CUDAContextToken&& token):
+    currentDevice_(token.device()),
+    setDeviceForThisScope_(currentDevice_),
+    stream_(std::move(token.streamPtr()))
+  {}
+
+  template <typename T>
+  explicit CUDAScopedContext(const CUDAProduct<T>& data):
+    currentDevice_(data.device()),
+    setDeviceForThisScope_(currentDevice_),
+    stream_(data.streamPtr())
+  {}
+
+  explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
+    CUDAScopedContext(streamID)
+  {
+    waitingTaskHolder_ = std::move(waitingTaskHolder);
+  }
+
+  template <typename T>
+  explicit CUDAScopedContext(const CUDAProduct<T>& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
+    CUDAScopedContext(data)
+  {
+    waitingTaskHolder_ = std::move(waitingTaskHolder);
+  }
+
+  ~CUDAScopedContext();
+
+  int device() const { return currentDevice_; }
+
+  cuda::stream_t<>& stream() { return *stream_; }
+  const cuda::stream_t<>& stream() const { return *stream_; }
+  const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }
+
+  CUDAContextToken toToken() {
+    return CUDAContextToken(currentDevice_, stream_);
+  }
+
+  template <typename T>
+  const T& get(const CUDAProduct<T>& data) {
+    synchronizeStreams(data.device(), data.stream(), data.event());
+    return data.data_;
+  }
+
+  template <typename T>
+  const T& get(const edm::Event& iEvent, edm::EDGetTokenT<CUDAProduct<T>> token) {
+    return get(iEvent.get(token));
+  }
+
+  template <typename T>
+  std::unique_ptr<CUDAProduct<T>> wrap(T data) const {
+    // make_unique doesn't work because of private constructor
+    //
+    // CUDAProduct<T> constructor records CUDA event to the CUDA
+    // stream.
+    // The event will become "occurred" after all work queued
+    // to the stream before this point has been finished.
+    return std::unique_ptr<CUDAProduct<T>>(new CUDAProduct<T>(device(), streamPtr(), std::move(data)));
+  }
+
+  template <typename T, typename... Args>
+  auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) const {
+    return iEvent.emplace(token, device(), streamPtr(), std::forward<Args>(args)...);
+  }
+
+private:
+  friend class cudatest::TestCUDAScopedContext;
+
+  // This constructor is only meant for testing
+  explicit CUDAScopedContext(int device, std::unique_ptr<cuda::stream_t<>> stream);
+
+  void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent);
+
+  int currentDevice_;
+  std::optional<edm::WaitingTaskWithArenaHolder> waitingTaskHolder_;
+  cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
+  std::shared_ptr<cuda::stream_t<>> stream_;
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
new file mode 100644
index 0000000000000..ded114e2fddfe
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
@@ -0,0 +1,34 @@
+import FWCore.ParameterSet.Config as cms
+
+_cuda_enabled_cached = None
+
+def _switch_cuda():
+    global _cuda_enabled_cached
+    if _cuda_enabled_cached is None:
+        import os
+        _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0)
+    return (_cuda_enabled_cached, 2)
+
+class SwitchProducerCUDA(cms.SwitchProducer):
+    def __init__(self, **kargs):
+        super(SwitchProducerCUDA,self).__init__(
+            dict(cpu = cms.SwitchProducer.getCpu(),
+                 cuda = _switch_cuda),
+            **kargs
+        )
+cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA")
+
+if __name__ == "__main__":
+    import unittest
+
+    class TestSwitchProducerCUDA(unittest.TestCase):
+        def testPickle(self):
+            import pickle
+            sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar"))
+            pkl = pickle.dumps(sp)
+            unpkl = pickle.loads(pkl)
+            self.assertEqual(unpkl.cpu.type_(), "Foo")
+            self.assertEqual(unpkl.cuda.type_(), "Bar")
+
+    unittest.main()
+
diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
new file mode 100644
index 0000000000000..a29fbee36865f
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
@@ -0,0 +1,67 @@
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/Exception.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseCUDADevice.h"
+
+
+CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID):
+  currentDevice_(cudacore::chooseCUDADevice(streamID)),
+  setDeviceForThisScope_(currentDevice_)
+{
+  edm::Service<CUDAService> cs;
+  stream_ = cs->getCUDAStream();
+}
+
+CUDAScopedContext::CUDAScopedContext(int device, std::unique_ptr<cuda::stream_t<>> stream):
+  currentDevice_(device),
+  setDeviceForThisScope_(device),
+  stream_(std::move(stream))
+{}
+
+CUDAScopedContext::~CUDAScopedContext() {
+  if(waitingTaskHolder_.has_value()) {
+    stream_->enqueue.callback([device=currentDevice_,
+                               waitingTaskHolder=*waitingTaskHolder_]
+                              (cuda::stream::id_t streamId, cuda::status_t status) mutable {
+      if(cuda::is_success(status)) {
+        LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " << streamId;
+        waitingTaskHolder.doneWaiting(nullptr);
+      }
+      else {
+        // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+        try {
+          auto error = cudaGetErrorName(status);
+          auto message = cudaGetErrorString(status);
+          throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << message;
+        } catch(cms::Exception&) {
+          waitingTaskHolder.doneWaiting(std::current_exception());
+        }
+      }
+    });
+  }
+}
+
+void CUDAScopedContext::synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent) {
+  if(dataDevice != currentDevice_) {
+    // Eventually replace with prefetch to current device (assuming unified memory works)
+    // If we won't go to unified memory, need to figure out something else...
+    throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
+  }
+
+  if(dataStream.id() != stream_->id()) {
+    // Different streams, need to synchronize
+    if(!dataEvent.has_occurred()) {
+      // Event has not yet occurred, so need to add synchronization
+      // here. Synchronization is done by making the CUDA stream wait
+      // for an event, so all subsequent work in the stream will run
+      // only after the event has "occurred" (i.e. the data product
+      // became available).
+      auto ret = cudaStreamWaitEvent(stream_->id(), dataEvent.id(), 0);
+      cuda::throw_if_error(ret, "Failed to make a stream to wait for an event");
+    }
+  }
+}
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
new file mode 100644
index 0000000000000..a582ed2f72866
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -0,0 +1,18 @@
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseCUDADevice.h"
+
+namespace cudacore {
+  int chooseCUDADevice(edm::StreamID id) {
+    edm::Service<CUDAService> cudaService;
+
+    // For starters we "statically" assign the device based on
+    // edm::Stream number. This is suboptimal if the number of
+    // edm::Streams is not a multiple of the number of CUDA devices
+    // (and even then there is no load balancing).
+    //
+    // TODO: improve the "assignment" logic
+    return id % cudaService->numberOfDevices();
+  }
+}
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h
new file mode 100644
index 0000000000000..bb09c302af7f5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h
@@ -0,0 +1,10 @@
+#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h
+#define HeterogeneousCore_CUDACore_chooseCUDADevice_h
+
+#include "FWCore/Utilities/interface/StreamID.h"
+
+namespace cudacore {
+  int chooseCUDADevice(edm::StreamID id);
+}
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
new file mode 100644
index 0000000000000..cd2c3b094243c
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
new file mode 100644
index 0000000000000..eda2b94f5dfb4
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -0,0 +1,119 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+#include "test_CUDAScopedContextKernels.h"
+
+namespace cudatest {
+  class TestCUDAScopedContext {
+  public:
+    static
+    CUDAScopedContext make(int dev) {
+      auto device = cuda::device::get(dev);
+      return CUDAScopedContext(dev, std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)));
+    }
+  };
+}
+
+namespace {
+  std::unique_ptr<CUDAProduct<int *>> produce(int device, int *d, int *h) {
+    auto ctx = cudatest::TestCUDAScopedContext::make(device);
+
+    cuda::memory::async::copy(d, h, sizeof(int), ctx.stream().id());
+    testCUDAScopedContextKernels_single(d, ctx.stream());
+    return ctx.wrap(d);
+  }
+}
+
+TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
+  exitSansCUDADevices();
+
+  constexpr int defaultDevice = 0;
+  {
+    auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice);
+
+    SECTION("Construct from device ID") {
+      REQUIRE(cuda::device::current::get().id() == defaultDevice);
+    }
+
+    SECTION("Wrap T to CUDAProduct<T>") {
+      std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+      REQUIRE(dataPtr.get() != nullptr);
+      REQUIRE(dataPtr->device() == ctx.device());
+      REQUIRE(dataPtr->stream().id() == ctx.stream().id());
+    }
+
+    SECTION("Construct from CUDAProduct<T>") {
+      std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+      const auto& data = *dataPtr;
+
+      CUDAScopedContext ctx2{data};
+      REQUIRE(cuda::device::current::get().id() == data.device());
+      REQUIRE(ctx2.stream().id() == data.stream().id());
+    }
+
+    SECTION("Storing state as CUDAContextToken") {
+      CUDAContextToken ctxtok;
+      { // acquire
+        std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
+        const auto& data = *dataPtr;
+        CUDAScopedContext ctx2{data};
+        ctxtok = ctx2.toToken();
+      }
+
+      { // produce
+        CUDAScopedContext ctx2{std::move(ctxtok)};
+        REQUIRE(cuda::device::current::get().id() == ctx.device());
+        REQUIRE(ctx2.stream().id() == ctx.stream().id());
+      }
+    }
+
+    SECTION("Joining multiple CUDA streams") {
+      cuda::device::current::scoped_override_t<> setDeviceForThisScope(defaultDevice);
+      auto current_device = cuda::device::current::get();
+
+      // Mimic a producer on the first CUDA stream
+      int h_a1 = 1;
+      auto d_a1 = cuda::memory::device::make_unique<int>(current_device);
+      auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1);
+
+      // Mimic a producer on the second CUDA stream
+      int h_a2 = 2;
+      auto d_a2 = cuda::memory::device::make_unique<int>(current_device);
+      auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2);
+
+      REQUIRE(wprod1->stream().id() != wprod2->stream().id());
+
+      // Mimic a third producer "joining" the two streams
+      CUDAScopedContext ctx2{*wprod1};
+
+      auto prod1 = ctx2.get(*wprod1);
+      auto prod2 = ctx2.get(*wprod2);
+
+      auto d_a3 = cuda::memory::device::make_unique<int>(current_device);
+      testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream());
+      ctx2.stream().synchronize();
+      REQUIRE(wprod2->event().has_occurred());
+
+      h_a1 = 0;
+      h_a2 = 0;
+      int h_a3 = 0;
+      cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx2.stream().id());
+      cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx2.stream().id());
+      cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx2.stream().id());
+      // Wait for the asynchronous copies to finish before checking the results
+      ctx2.stream().synchronize();
+
+      REQUIRE(h_a1 == 2);
+      REQUIRE(h_a2 == 4);
+      REQUIRE(h_a3 == 6);
+    }
+  }
+
+  // Destroy and clean up all resources so that the next test can
+  // assume to start from a clean state.
+  cudaCheck(cudaSetDevice(defaultDevice));
+  cudaCheck(cudaDeviceSynchronize());
+  cudaDeviceReset();
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu
new file mode 100644
index 0000000000000..18bdf50abeaa5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu
@@ -0,0 +1,24 @@
+#include "test_CUDAScopedContextKernels.h"
+
+#include <cuda/api_wrappers.h>
+#include <cuda_runtime.h>
+
+namespace {
+  __global__
+  void single_mul(int *d) {
+    d[0] = d[0]*2;
+  }
+
+  __global__
+  void join_add(const int *d1, const int *d2, int *d3) {
+    d3[0] = d1[0] + d2[0];
+  }
+}
+
+void testCUDAScopedContextKernels_single(int *d, cuda::stream_t<>& stream) {
+  single_mul<<<1, 1, 0, stream.id()>>>(d);
+}
+
+void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cuda::stream_t<>& stream) {
+  join_add<<<1, 1, 0, stream.id()>>>(d1, d2, d3);
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h
new file mode 100644
index 0000000000000..9d3f9ce33bc97
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h
@@ -0,0 +1,9 @@
+#ifndef HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h
+#define HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h
+
+#include <cuda/api_wrappers.h>
+
+void testCUDAScopedContextKernels_single(int *d, cuda::stream_t<>& stream);
+void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cuda::stream_t<>& stream);
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_main.cc
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
index a116ca8d78d33..041ed25ba134a 100644
--- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
+++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -1,3 +1,7 @@
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
new file mode 100644
index 0000000000000..b24f05adb2213
---
/dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#include + +int main() { + int devices = 0; + auto status = cudaGetDeviceCount(& devices); + if (status != cudaSuccess) { + return EXIT_FAILURE; + } + + int minimumMajor = 6; // min minor is implicitly 0 + + // This approach (requiring all devices are supported) is rather + // conservative. In principle we could consider just dropping the + // unsupported devices. Currently that would be easiest to achieve + // in CUDAService though. + for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + + if(properties.major < minimumMajor) { + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 9555321f5153a..a7c416c17ed63 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -8,8 +8,8 @@ #include "FWCore/Utilities/interface/StreamID.h" -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" namespace edm { class ParameterSet; @@ -19,16 +19,16 @@ namespace edm { namespace cudaserviceimpl { template - struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using non_array = cudautils::device::unique_ptr; }; template - struct make_device_unique_selector { using unbounded_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using unbounded_array = cudautils::device::unique_ptr; }; template struct make_device_unique_selector { struct bounded_array {}; }; template - struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr; }; + struct make_host_unique_selector { using non_array = cudautils::host::unique_ptr; }; template - struct make_host_unique_selector { using unbounded_array = edm::cuda::host::unique_ptr; }; + struct make_host_unique_selector { using unbounded_array = cudautils::host::unique_ptr; }; template struct make_host_unique_selector { struct bounded_array {}; }; } @@ -76,9 +76,9 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = allocate_device(dev, sizeof(T), stream); return typename cudaserviceimpl::make_device_unique_selector::non_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free_device(dev, ptr); - }); + cudautils::device::impl::DeviceDeleter([this, dev](void *ptr) { + this->free_device(dev, ptr); + })); } template @@ -89,9 +89,9 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = allocate_device(dev, n*sizeof(element_type), stream); return typename cudaserviceimpl::make_device_unique_selector::unbounded_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free_device(dev, ptr); - }); + cudautils::device::impl::DeviceDeleter([this, dev](void *ptr) { + this->free_device(dev, ptr); + })); } template @@ -105,9 +105,9 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(sizeof(T), stream); return typename cudaserviceimpl::make_host_unique_selector::non_array(reinterpret_cast(mem), 
- [this](void *ptr) { - this->free_host(ptr); - }); + cudautils::host::impl::HostDeleter([this](void *ptr) { + this->free_host(ptr); + })); } template @@ -117,9 +117,9 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(n*sizeof(element_type), stream); return typename cudaserviceimpl::make_host_unique_selector::unbounded_array(reinterpret_cast(mem), - [this](void *ptr) { - this->free_host(ptr); - }); + cudautils::host::impl::HostDeleter([this](void *ptr) { + this->free_host(ptr); + })); } template @@ -132,6 +132,14 @@ class CUDAService { // Free pinned host memory (to be called from unique_ptr) void free_host(void *ptr); + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + std::shared_ptr> getCUDAStream(); + + // Gets a (cached) CUDA event for the current device. The event + // will be returned to the cache by the shared_ptr destructor. + std::shared_ptr getCUDAEvent(); + private: // PIMPL to hide details of allocator struct Allocator; @@ -139,6 +147,14 @@ class CUDAService { void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream); void *allocate_host(size_t nbytes, cuda::stream_t<>& stream); + // PIMPL to hide details of the CUDA stream cache + struct CUDAStreamCache; + std::unique_ptr cudaStreamCache_; + + // PIMPL to hide details of the CUDA event cache + struct CUDAEventCache; + std::unique_ptr cudaEventCache_; + int numberOfDevices_ = 0; unsigned int numberOfStreamsTotal_ = 0; std::vector> computeCapabilities_; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index babe062f9bab2..e776c349f2e6c 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -9,6 +9,7 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ReusableObjectHolder.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -98,7 +99,7 @@ namespace { cudaCheck(cudaGetDevice(&device)); for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { return cs.make_device_unique(size, stream); }, bufferSizes); } @@ -106,7 +107,7 @@ namespace { } void hostPreallocate(CUDAService& cs, const std::vector& bufferSizes) { - preallocate([&](size_t size, cuda::stream_t<>& stream) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { return cs.make_host_unique(size, stream); }, bufferSizes); } @@ -336,6 +337,11 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << "cub::CachingDeviceAllocator disabled\n"; } + cudaStreamCache_ = std::make_unique(numberOfDevices_); + cudaEventCache_ = std::make_unique(numberOfDevices_); + + log << "\n"; + log << "CUDAService fully initialized"; enabled_ = true; @@ -350,6 +356,8 @@ CUDAService::~CUDAService() { if(allocator_) { allocator_.reset(); } + cudaEventCache_.reset(); + cudaStreamCache_.reset(); for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); @@ -490,3 +498,36 @@ void CUDAService::free_host(void *ptr) { cuda::throw_if_error(cudaFreeHost(ptr)); } } + + +// 
CUDA stream cache
+struct CUDAService::CUDAStreamCache {
+  explicit CUDAStreamCache(int ndev): cache(ndev) {}
+
+  // Separate caches for each device for fast lookup
+  std::vector<edm::ReusableObjectHolder<cuda::stream_t<>>> cache;
+};
+
+std::shared_ptr<cuda::stream_t<>> CUDAService::getCUDAStream() {
+  return cudaStreamCache_->cache[getCurrentDevice()].makeOrGet([](){
+    auto current_device = cuda::device::current::get();
+    return std::make_unique<cuda::stream_t<>>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream));
+  });
+}
+
+// CUDA event cache
+struct CUDAService::CUDAEventCache {
+  explicit CUDAEventCache(int ndev): cache(ndev) {}
+
+  // Separate caches for each device for fast lookup
+  std::vector<edm::ReusableObjectHolder<cuda::event_t>> cache;
+};
+
+std::shared_ptr<cuda::event_t> CUDAService::getCUDAEvent() {
+  return cudaEventCache_->cache[getCurrentDevice()].makeOrGet([](){
+    auto current_device = cuda::device::current::get();
+    // We should not return a recorded, but not-yet-occurred event
+    return std::make_unique<cuda::event_t>(current_device.create_event(cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much?
+                                                                       cuda::event::dont_record_timings)); // it should be a bit faster to ignore timings
+  });
+}
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
index d0a1afcc8203f..4764d74c427c2 100644
--- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -179,8 +179,8 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
     ps.addUntrackedParameter("allocator", alloc);
     auto cs = makeCUDAService(ps, ar);
     cs.setCurrentDevice(0);
-    auto current_device = cuda::device::current::get();
-    auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
+    auto cudaStreamPtr = cs.getCUDAStream();
+    auto& cudaStream = *cudaStreamPtr;
     SECTION("Destructor") {
       auto ptr = cs.make_device_unique(cudaStream);
@@ -214,8 +214,8 @@
     ps.addUntrackedParameter("allocator", alloc);
     auto cs = makeCUDAService(ps, ar);
     cs.setCurrentDevice(0);
-    auto current_device = cuda::device::current::get();
-    auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
+    auto cudaStreamPtr = cs.getCUDAStream();
+    auto& cudaStream = *cudaStreamPtr;
     SECTION("Destructor") {
       auto ptr = cs.make_host_unique(cudaStream);
diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml
new file mode 100644
index 0000000000000..112c200812d98
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/BuildFile.xml
@@ -0,0 +1,3 @@
+
+
diff --git a/HeterogeneousCore/CUDATest/interface/CUDAThing.h b/HeterogeneousCore/CUDATest/interface/CUDAThing.h
new file mode 100644
index 0000000000000..ecda1f2aafdf6
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/interface/CUDAThing.h
@@ -0,0 +1,19 @@
+#ifndef HeterogeneousCore_CUDATest_CUDAThing_H
+#define HeterogeneousCore_CUDATest_CUDAThing_H
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+class CUDAThing {
+public:
+  CUDAThing() = default;
+  CUDAThing(cudautils::device::unique_ptr<float[]> ptr):
+    ptr_(std::move(ptr))
+  {}
+
+  const float *get() const { return ptr_.get(); }
+
+private:
+  cudautils::device::unique_ptr<float[]> ptr_;
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
new file mode 100644
index
0000000000000..09a8fb844d4c9 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc new file mode 100644 index 0000000000000..30131f796a32c --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc @@ -0,0 +1,65 @@ +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include +#include +#include + +class TestCUDAProducerCPU: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerCPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerCPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; +private: + std::string label_; + edm::EDGetTokenT srcToken_; + edm::EDPutTokenT dstToken_; +}; + +TestCUDAProducerCPU::TestCUDAProducerCPU(const edm::ParameterSet& iConfig): + label_{iConfig.getParameter("@module_label")}, + dstToken_{produces()} +{ + auto srcTag = iConfig.getParameter("src"); + if(!srcTag.label().empty()) { + srcToken_ = consumes(srcTag); + } +} + +void TestCUDAProducerCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Optional source of another TestCUDAProducerCPU."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. 
It models a CPU algorithm."); +} + +void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce begin event " << iEvent.id().event() << " stream " << id; + + int input = 0; + if(!srcToken_.isUninitialized()) { + input = iEvent.get(srcToken_); + } + + std::random_device r; + std::mt19937 gen(r()); + auto dist = std::uniform_real_distribution<>(0.2, 1.5); + auto dur = dist(gen); + edm::LogVerbatim("TestCUDAProducerCPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << id << " will take " << dur << " seconds"; + std::this_thread::sleep_for(std::chrono::seconds(1)*dur); + + const unsigned int output = input + id*100 + iEvent.id().event(); + + iEvent.emplace(dstToken_, output); + + edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce end event " << iEvent.id().event() << " stream " << id << " result " << output; +} + +DEFINE_FWK_MODULE(TestCUDAProducerCPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc new file mode 100644 index 0000000000000..fc74b714f22c5 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -0,0 +1,54 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPU: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerGPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; +private: + std::string label_; + edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; + TestCUDAProducerGPUKernel gpuAlgo_; +}; + +TestCUDAProducerGPU::TestCUDAProducerGPU(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")), + srcToken_(consumes>(iConfig.getParameter("src"))), + dstToken_(produces>()) +{} + +void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Source of CUDAProduct."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first algorithm in the chain of the GPU EDProducers. 
Produces CUDAProduct."); +} + +void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in}; + const CUDAThing& input = ctx.get(in); + + ctx.emplace(iEvent, dstToken_, CUDAThing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())}); + + edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc new file mode 100644 index 0000000000000..b084ad8920c20 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -0,0 +1,74 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUEW: public edm::stream::EDProducer { +public: + explicit TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUEW() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; +private: + std::string label_; + edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; + TestCUDAProducerGPUKernel gpuAlgo_; + CUDAContextToken ctxTmp_; + cudautils::device::unique_ptr devicePtr_; + float hostData_ = 0.f; +}; + +TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig): + label_{iConfig.getParameter("@module_label")}, + srcToken_{consumes>(iConfig.getParameter("src"))}, + dstToken_{produces>()} +{} + +void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag()); + descriptions.addWithDefaultLabel(desc); +} + +void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; + const CUDAThing& input = ctx.get(in); + + devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); + // Mimick the need to transfer some of the GPU data back to CPU to + // be used for something within this module, or to be put in the + // event. 
+ cuda::memory::async::copy(&hostData_, devicePtr_.get()+10, sizeof(float), ctx.stream().id()); + + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + ctxTmp_ = ctx.toToken(); +} + +void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; + + CUDAScopedContext ctx{std::move(ctxTmp_)}; + + ctx.emplace(iEvent, dstToken_, std::move(devicePtr_)); + + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUEW); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc new file mode 100644 index 0000000000000..2c78e2730be8b --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -0,0 +1,50 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUFirst: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUFirst() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID stream, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; +private: + std::string label_; + TestCUDAProducerGPUKernel gpuAlgo_; +}; + +TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")) +{ + produces>(); +} + +void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers. 
Produces CUDAProduct<CUDAThing>."); +} + +void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + CUDAScopedContext ctx{streamID}; + + cudautils::device::unique_ptr<float[]> output = gpuAlgo_.runAlgo(label_, ctx.stream()); + iEvent.put(ctx.wrap(CUDAThing(std::move(output)))); + + edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUFirst); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu new file mode 100644 index 0000000000000..0bffb6656f31c --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -0,0 +1,112 @@ +#include "TestCUDAProducerGPUKernel.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +namespace { + template <typename T> + __global__ + void vectorAdd(const T *a, const T *b, T *c, int numElements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < numElements) { c[i] = a[i] + b[i]; } + } + + template <typename T> + __global__ + void vectorProd(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + int col = blockIdx.x*blockDim.x + threadIdx.x; + + if(row < numElements && col < numElements) { + c[row*numElements + col] = a[row]*b[col]; + } + } + + template <typename T> + __global__ + void matrixMul(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + int col = blockIdx.x*blockDim.x + threadIdx.x; + + if(row < numElements && col < numElements) { + T tmp = 0; + for(int i=0; i<numElements; ++i) { + tmp += a[row*numElements + i] * b[i*numElements + col]; + } + c[row*numElements + col] = tmp; + } + } + + template <typename T> + __global__ + void matrixMulVector(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + + if(row < numElements) { + T tmp = 0; + for(int i=0; i<numElements; ++i) { + tmp += a[row*numElements + i] * b[i]; + } + c[row] = tmp; + } + } +} + +cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const { + // First make the sanity check + if(d_input != nullptr) { + auto h_check = std::make_unique<float[]>(NUM_VALUES); + cuda::memory::copy(h_check.get(), d_input, NUM_VALUES*sizeof(float)); + for(int i=0; i<NUM_VALUES; ++i) { + if(h_check[i] != i) { + throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got " << h_check[i]; + } + } + } + + edm::Service<CUDAService> cs; + + auto h_a = cs->make_host_unique<float[]>(NUM_VALUES, stream); + auto h_b = cs->make_host_unique<float[]>(NUM_VALUES, stream); + + for (auto i=0; i<NUM_VALUES; i++) { + h_a[i] = i; + h_b[i] = i*i; + } + + auto d_a = cs->make_device_unique<float[]>(NUM_VALUES, stream); + auto d_b = cs->make_device_unique<float[]>(NUM_VALUES, stream); + + cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES*sizeof(float), stream.id()); + cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES*sizeof(float), stream.id()); + + int threadsPerBlock {32}; + int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; + + auto d_c = cs->make_device_unique<float[]>(NUM_VALUES, stream); + auto current_device = cuda::device::current::get(); + edm::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU launching kernels device " << current_device.id() << " CUDA stream " << stream.id(); + vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream.id()>>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); + + auto d_ma = cs->make_device_unique<float[]>(NUM_VALUES*NUM_VALUES, stream); + auto d_mb = cs->make_device_unique<float[]>(NUM_VALUES*NUM_VALUES, stream);
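  // The three NUM_VALUES x NUM_VALUES matrices allocated here and just
  // below are deliberate busywork: vectorProd fills d_ma and d_mb from the
  // input vectors, and the matrixMul/matrixMulVector launches that follow
  // grind through them so that the asynchronous execution shows up clearly
  // in the debug printouts.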
+ auto d_mc = cs->make_device_unique<float[]>(NUM_VALUES*NUM_VALUES, stream); + dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES}; + dim3 blocksPerGrid3{1,1}; + if(NUM_VALUES*NUM_VALUES > 32) { + threadsPerBlock3.x = 32; + threadsPerBlock3.y = 32; + blocksPerGrid3.x = ceil(double(NUM_VALUES)/double(threadsPerBlock3.x)); + blocksPerGrid3.y = ceil(double(NUM_VALUES)/double(threadsPerBlock3.y)); + } + vectorProd<<<blocksPerGrid3, threadsPerBlock3, 0, stream.id()>>>(d_a.get(), d_b.get(), d_ma.get(), NUM_VALUES); + vectorProd<<<blocksPerGrid3, threadsPerBlock3, 0, stream.id()>>>(d_a.get(), d_c.get(), d_mb.get(), NUM_VALUES); + matrixMul<<<blocksPerGrid3, threadsPerBlock3, 0, stream.id()>>>(d_ma.get(), d_mb.get(), d_mc.get(), NUM_VALUES); + + matrixMulVector<<<blocksPerGrid, threadsPerBlock, 0, stream.id()>>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES); + + edm::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU kernels launched, returning device pointer, device " << current_device.id() << " CUDA stream " << stream.id(); + return d_a; +} diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h new file mode 100644 index 0000000000000..d03c4c1e0dc70 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h @@ -0,0 +1,31 @@ +#ifndef HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h +#define HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +#include <cuda/api_wrappers.h> + +/** + * This class models the actual CUDA implementation of an algorithm. + * + * Memory is allocated dynamically with the allocator in CUDAService. + * + * The algorithm is intended to waste time with large matrix + * operations so that the asynchronous nature of the CUDA integration + * becomes visible with debug prints. + */ +class TestCUDAProducerGPUKernel { +public: + static constexpr int NUM_VALUES = 4000; + + TestCUDAProducerGPUKernel() = default; + ~TestCUDAProducerGPUKernel() = default; + + // returns (owning) pointer to device memory + cudautils::device::unique_ptr<float[]> runAlgo(const std::string& label, cuda::stream_t<>& stream) const { + return runAlgo(label, nullptr, stream); + } + cudautils::device::unique_ptr<float[]> runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const; +}; + +#endif diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc new file mode 100644 index 0000000000000..b5653d0af366d --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -0,0 +1,75 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUtoCPU: public edm::stream::EDProducer<edm::ExternalWork> { +public: + explicit TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUtoCPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; +private: + std::string label_; + edm::EDGetTokenT<CUDAProduct<CUDAThing>> srcToken_; + edm::EDPutTokenT<int> dstToken_; + cudautils::host::unique_ptr<float[]> buffer_; +}; + +TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig): + label_{iConfig.getParameter<std::string>("@module_label")}, + srcToken_{consumes<CUDAProduct<CUDAThing>>(iConfig.getParameter<edm::InputTag>("src"))}, + dstToken_{produces<int>()} +{} + +void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Source for CUDAProduct<CUDAThing>."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models the GPU->CPU data transfer and formatting of the data to legacy data format. Produces int, to be compatible with TestCUDAProducerCPU."); +} + +void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; + const CUDAThing& device = ctx.get(in); + + edm::Service<CUDAService> cs; + buffer_ = cs->make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); + // Enqueue async copy, continue in produce once finished + cuda::memory::async::copy(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES*sizeof(float), ctx.stream().id()); + + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + int counter = 0; + for(int i=0; i<TestCUDAProducerGPUKernel::NUM_VALUES; ++i) { + counter += buffer_[i]; + } + buffer_.reset(); + + iEvent.emplace(dstToken_, counter); + + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << counter; +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUtoCPU); diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml new file mode 100644 index 0000000000000..3287d65c14470 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/HeterogeneousCore/CUDATest/test/TestCUDATest.cc b/HeterogeneousCore/CUDATest/test/TestCUDATest.cc new file mode 100644 index 0000000000000..b2991bd18ae57 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/TestCUDATest.cc @@ -0,0 +1,3 @@ +#include "FWCore/Utilities/interface/TestHelper.h" + +RUNTEST() diff --git a/HeterogeneousCore/CUDATest/test/runtests.sh b/HeterogeneousCore/CUDATest/test/runtests.sh new file mode 100755 index 0000000000000..6817aa8d7ffab --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/runtests.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +function die { echo Failure $1: status $2 ; exit $2 ; } + +pushd ${LOCAL_TMP_DIR} + + echo "*************************************************" + echo "CUDA producer configuration with SwitchProducer" + cmsRun ${LOCAL_TEST_DIR}/testCUDASwitch_cfg.py || die "cmsRun testCUDASwitch_cfg.py 1" $?
+ +popd diff --git a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py new file mode 100644 index 0000000000000..8bac73608065d --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py @@ -0,0 +1,92 @@ +import FWCore.ParameterSet.Config as cms + +silent = True +#silent = False + +from Configuration.ProcessModifiers.gpu_cff import gpu +process = cms.Process("Test") +process.load("FWCore.MessageService.MessageLogger_cfi") +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(3) ) +if not silent: + process.maxEvents.input = 10 + process.MessageLogger.cerr.threshold = cms.untracked.string("INFO") + process.MessageLogger.cerr.INFO.limit = process.MessageLogger.cerr.default.limit + + +process.options = cms.untracked.PSet( +# numberOfThreads = cms.untracked.uint32(4), + numberOfStreams = cms.untracked.uint32(0) +) +#process.Tracer = cms.Service("Tracer") + +# Flow diagram of the modules +# +# 1 5 +# / \ | +# 2 4 6 +# | +# 3 + +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + +process.load("HeterogeneousCore.CUDATest.prod1Switch_cff") +process.load("HeterogeneousCore.CUDATest.prod5Switch_cff") +process.load("HeterogeneousCore.CUDATest.prod6Switch_cff") + +# GPU producers +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst +from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU +from HeterogeneousCore.CUDATest.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU + +process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") +process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") +process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") + +# CPU producers, switched with modules to copy data from GPU to CPU +# (as "on demand" as any other EDProducer, i.e. according to +# consumes() and prefetching). If a separate conversion step is needed +# to get the same data formats as the CPU modules, those are then ones +# that should be replaced-with here. 
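# (Hedged note) SwitchProducerCUDA declares one alternative per technology;
# when the job starts, the "cuda" branch is selected if the CUDAService
# reports a usable GPU, and the "cpu" branch otherwise. Downstream modules
# consume only the switch labels (prod2, prod3, prod4) and are insulated
# from that choice.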
+from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU +process.prod2 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod1"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA") +) +process.prod3 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod2"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA") +) +process.prod4 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod1"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA") +) + +process.out = cms.OutputModule("AsciiOutputModule", + outputCommands = cms.untracked.vstring( + "keep *_prod3_*_*", + "keep *_prod4_*_*", + "keep *_prod5_*_*", + ), + verbosity = cms.untracked.uint32(0), +) + +process.prod2Task = cms.Task(process.prod2, process.prod2CUDA) +process.prod3Task = cms.Task(process.prod3, process.prod3CUDA) +process.prod4Task = cms.Task(process.prod4, process.prod4CUDA) + +process.t = cms.Task( + process.prod1Task, + process.prod2Task, + process.prod3Task, + process.prod4Task, + process.prod5Task, + process.prod6Task +) +process.p = cms.Path() +process.p.associate(process.t) +process.ep = cms.EndPath(process.out) diff --git a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py new file mode 100644 index 0000000000000..5cb6c678402b4 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py @@ -0,0 +1,79 @@ +import FWCore.ParameterSet.Config as cms + +enableGPU = True + +from Configuration.ProcessModifiers.gpu_cff import gpu +process = cms.Process("Test", gpu) if enableGPU else cms.Process("Test") +process.load("FWCore.MessageService.MessageLogger_cfi") +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(10) ) + +process.options = cms.untracked.PSet( +# numberOfThreads = cms.untracked.uint32(4), + numberOfStreams = cms.untracked.uint32(0) +) +#process.Tracer = cms.Service("Tracer") + +# Flow diagram of the modules +# +# 1 5 +# / \ | +# 2 4 6 +# | +# 3 + +process.load("HeterogeneousCore.CUDATest.prod1_cff") +process.load("HeterogeneousCore.CUDATest.prod5_cff") +process.load("HeterogeneousCore.CUDATest.prod6_cff") + +# CPU producers +from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU +process.prod2 = testCUDAProducerCPU.clone(src = "prod1") +process.prod3 = testCUDAProducerCPU.clone(src = "prod2") +process.prod4 = testCUDAProducerCPU.clone(src = "prod1") + +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst +from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU +from HeterogeneousCore.CUDATest.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU + +# GPU producers +process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") +process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") +process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") + +# Modules to copy data from GPU to CPU (as "on demand" as any other +# EDProducer, i.e. according to consumes() and prefetching). If a +# separate conversion step is needed to get the same data formats as +# the CPU modules, those are then ones that should be replaced-with here. 
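# (Hedged note) In contrast to the SwitchProducer configuration above,
# gpu.toReplaceWith rewires the process statically: the replacements below
# take effect only when the "gpu" process modifier is active, so this same
# file runs CPU-only by default.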
+gpu.toReplaceWith(process.prod2, testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA")) +gpu.toReplaceWith(process.prod3, testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA")) +gpu.toReplaceWith(process.prod4, testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA")) + +process.out = cms.OutputModule("AsciiOutputModule", + outputCommands = cms.untracked.vstring( + "keep *_prod3_*_*", + "keep *_prod4_*_*", + "keep *_prod5_*_*", + ), + verbosity = cms.untracked.uint32(0), +) + +process.prod2Task = cms.Task(process.prod2, process.prod2CUDA) +process.prod3Task = cms.Task(process.prod3, process.prod3CUDA) +process.prod4Task = cms.Task(process.prod4, process.prod4CUDA) + +process.t = cms.Task( + process.prod1Task, + process.prod2Task, + process.prod3Task, + process.prod4Task, + process.prod5Task, + process.prod6Task +) +process.p = cms.Path() +process.p.associate(process.t) +process.ep = cms.EndPath(process.out) diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc new file mode 100644 index 0000000000000..f19bd24813fbf --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -0,0 +1,91 @@ +#include "catch.hpp" +#include "FWCore/TestProcessor/interface/TestProcessor.h" +#include "FWCore/Utilities/interface/Exception.h" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +#include + +static constexpr auto s_tag = "[TestCUDAProducerGPUFirst]"; + +TEST_CASE("Standard checks of TestCUDAProducerGPUFirst", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") +process.moduleToTest(process.toTest) +)_" + }; + + edm::test::TestProcessor::Config config{ baseConfig }; + SECTION("base configuration is OK") { + REQUIRE_NOTHROW(edm::test::TestProcessor(config)); + } + + SECTION("No event data") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.test()); + } + + SECTION("beginJob and endJob only") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); + } + + SECTION("Run with no LuminosityBlocks") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); + } + + SECTION("LuminosityBlock with no Events") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); + } + +} + +TEST_CASE("TestCUDAProducerGPUFirst operation", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") +process.moduleToTest(process.toTest) +)_" + }; + edm::test::TestProcessor::Config config{ baseConfig }; + + exitSansCUDADevices(); + + constexpr int defaultDevice = 0; + + SECTION("Produce") { + edm::test::TestProcessor tester{config}; + auto event = tester.test(); + auto prod = event.get >(); + REQUIRE(prod->device() == defaultDevice); + auto ctx = CUDAScopedContext(*prod); + const CUDAThing& thing = ctx.get(*prod); + const float *data = thing.get(); + REQUIRE(data != nullptr); + + float 
firstElements[10]; + cuda::memory::async::copy(firstElements, data, sizeof(float)*10, prod->stream().id()); + + std::cout << "Synchronizing with CUDA stream" << std::endl; + auto stream = prod->stream(); + stream.synchronize(); + std::cout << "Synchronized" << std::endl; + REQUIRE(firstElements[0] == 0.f); + REQUIRE(firstElements[1] == 1.f); + REQUIRE(firstElements[9] == 9.f); + } +}; diff --git a/HeterogeneousCore/CUDATest/test/test_main.cc b/HeterogeneousCore/CUDATest/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h new file mode 100644 index 0000000000000..fa0db60aea592 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -0,0 +1,43 @@ +#ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h +#define HeterogeneousCore_CUDAUtilities_copyAsync_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +#include <cuda/api_wrappers.h> + +#include <type_traits> + +namespace cudautils { + // Single element + template <typename T> + inline + void copyAsync(cudautils::device::unique_ptr<T>& dst, const cudautils::host::unique_ptr<T>& src, cuda::stream_t<>& stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); + } + + template <typename T> + inline + void copyAsync(cudautils::host::unique_ptr<T>& dst, const cudautils::device::unique_ptr<T>& src, cuda::stream_t<>& stream) { + static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); + } + + // Multiple elements + template <typename T> + inline + void copyAsync(cudautils::device::unique_ptr<T[]>& dst, const cudautils::host::unique_ptr<T[]>& src, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); + } + + template <typename T> + inline + void copyAsync(cudautils::host::unique_ptr<T[]>& dst, const cudautils::device::unique_ptr<T[]>& src, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h new file mode 100644 index 0000000000000..06a0424450983 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h @@ -0,0 +1,27 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h + +#include <memory> +#include <functional> + +namespace cudautils { + namespace device { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class DeviceDeleter { + public: + DeviceDeleter() = default; + explicit DeviceDeleter(std::function<void(void*)> f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function<void(void*)> f_; + }; + } + + template <typename T> + using unique_ptr = std::unique_ptr<T, impl::DeviceDeleter>; + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h
b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h new file mode 100644 index 0000000000000..2a39c475cbb91 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h @@ -0,0 +1,27 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h + +#include <memory> +#include <functional> + +namespace cudautils { + namespace host { + namespace impl { + // Additional layer of types to distinguish from device::unique_ptr + class HostDeleter { + public: + HostDeleter() = default; + explicit HostDeleter(std::function<void(void*)> f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function<void(void*)> f_; + }; + } + + template <typename T> + using unique_ptr = std::unique_ptr<T, impl::HostDeleter>; + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h new file mode 100644 index 0000000000000..d87c50b666b61 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -0,0 +1,33 @@ +#ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h +#define HeterogeneousCore_CUDAUtilities_memsetAsync_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +#include <cuda/api_wrappers.h> + +#include <type_traits> + +namespace cudautils { + template <typename T> + inline + void memsetAsync(cudautils::device::unique_ptr<T>& ptr, T value, cuda::stream_t<>& stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream.id()); + } + + /** + * The type of `value` is `int` because `cudaMemsetAsync()` takes + * it as an `int`. Note that `cudaMemsetAsync()` sets the value of + * each **byte** to `value`. This may lead to unexpected results if + * `sizeof(T) > 1` and `value != 0`.
+ */ + template + inline + void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::device::async::set(ptr.get(), value, nelements*sizeof(T), stream.id()); + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index d3f8d77a35d32..f6d6a4bb7e594 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -66,3 +66,7 @@ + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp new file mode 100644 index 0000000000000..adfe833c5dcb4 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -0,0 +1,132 @@ +#include "catch.hpp" + +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +namespace { + CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { + auto desc = edm::ConfigurationDescriptions("Service", "CUDAService"); + CUDAService::fillDescriptions(desc); + desc.validate(ps, "CUDAService"); + return CUDAService(ps, ar); + } +} + +TEST_CASE("copyAsync", "[cudaMemTools]") { + exitSansCUDADevices(); + + edm::ActivityRegistry ar; + edm::ParameterSet ps; + auto cs = makeCUDAService(ps, ar); + + auto current_device = cuda::device::current::get(); + auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Host to device") { + SECTION("Single element") { + auto host_orig = cs.make_host_unique(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + + cudautils::copyAsync(device, host_orig, stream); + cuda::memory::async::copy(host.get(), device.get(), sizeof(int), stream.id()); + stream.synchronize(); + + REQUIRE(*host == 42); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + + SECTION("Copy all") { + cudautils::copyAsync(device, host_orig, N, stream); + cuda::memory::async::copy(host.get(), device.get(), N*sizeof(int), stream.id()); + stream.synchronize(); + for(int i=0; i(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + + cuda::memory::async::copy(device.get(), host_orig.get(), sizeof(int), stream.id()); + cudautils::copyAsync(host, device, stream); + stream.synchronize(); + + REQUIRE(*host == 42); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + + SECTION("Copy all") { + cuda::memory::async::copy(device.get(), host_orig.get(), N*sizeof(int), stream.id()); + cudautils::copyAsync(host, device, N, stream); + stream.synchronize(); + for(int i=0; i(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + cudautils::copyAsync(device, host_orig, stream); + cudautils::memsetAsync(device, 0, stream); + 
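      // Zero is the safe choice here: as documented in memsetAsync.h the
      // fill is byte-wise, so value = 1 on an int buffer would yield
      // 0x01010101 (16843009) per element rather than 1; only 0 (or -1)
      // gives an obvious pattern for multi-byte types. The device-to-host
      // copy that follows is queued on the same stream, so after
      // synchronize() the host buffer must hold the zeroed values.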
cudautils::copyAsync(host, device, stream); + stream.synchronize(); + + REQUIRE(*host == 0); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + cudautils::copyAsync(device, host_orig, N, stream); + cudautils::memsetAsync(device, 0, N, stream); + cudautils::copyAsync(host, device, N, stream); + stream.synchronize(); + + for(int i=0; i < N; ++i) { + CHECK(host[i] == 0); + } + } + + //Fake the end-of-job signal. + ar.postEndJobSignal_(); +} + diff --git a/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp b/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py index b75e75e000d48..a486a83d178f4 100644 --- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py +++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py @@ -9,11 +9,11 @@ from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import * from RecoLocalTracker.SiStripZeroSuppression.SiStripZeroSuppression_cfi import * from RecoLocalTracker.SiStripClusterizer.SiStripClusterizer_cfi import * -from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import * +from RecoLocalTracker.SiPixelClusterizer.siPixelClustersPreSplitting_cff import * from RecoLocalTracker.SiPixelRecHits.SiPixelRecHits_cfi import * from RecoLocalTracker.SubCollectionProducers.clustersummaryproducer_cfi import * -pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplitting,siPixelRecHitsPreSplitting) +pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplitting) striptrackerlocalrecoTask = cms.Task(siStripZeroSuppression,siStripClusters,siStripMatchedRecHits) trackerlocalrecoTask = cms.Task(pixeltrackerlocalrecoTask,striptrackerlocalrecoTask,clusterSummaryProducer) diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index e5e5f41053e3d..f0d996bd7310b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -1,10 +1,9 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" #include @@ -30,7 +29,7 @@ class SiPixelFedCablingMapGPUWrapper { // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cuda::stream_t<>& cudaStream) const; - edm::cuda::device::unique_ptr getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const; + cudautils::device::unique_ptr getModToUnpRegionalAsync(std::set const& modules, 
cuda::stream_t<>& cudaStream) const; private: const SiPixelFedCablingMap *cablingMap_; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc new file mode 100644 index 0000000000000..4c405a8c85afd --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc @@ -0,0 +1,158 @@ +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetId.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" + +namespace { + struct AccretionCluster { + typedef unsigned short UShort; + static constexpr UShort MAXSIZE = 256; + UShort adc[MAXSIZE]; + UShort x[MAXSIZE]; + UShort y[MAXSIZE]; + UShort xmin=16000; + UShort ymin=16000; + unsigned int isize=0; + int charge=0; + + void clear() { + isize=0; + charge=0; + xmin=16000; + ymin=16000; + } + + bool add(SiPixelCluster::PixelPos const & p, UShort const iadc) { + if (isize==MAXSIZE) return false; + xmin=std::min(xmin,(unsigned short)(p.row())); + ymin=std::min(ymin,(unsigned short)(p.col())); + adc[isize]=iadc; + x[isize]=p.row(); + y[isize++]=p.col(); + charge+=iadc; + return true; + } + }; + + constexpr uint32_t dummydetid = 0xffffffff; +} + +class SiPixelDigisClustersFromSoA: public edm::global::EDProducer<> { +public: + explicit SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelDigisClustersFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT digiGetToken_; + + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT clusterPutToken_; + +}; + +SiPixelDigisClustersFromSoA::SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig): + digiGetToken_(consumes(iConfig.getParameter("src"))), + digiPutToken_(produces>()), + clusterPutToken_(produces()) +{} + +void SiPixelDigisClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelDigisSoA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + const auto& digis = iEvent.get(digiGetToken_); + + edm::ESHandle trackerTopologyHandle; + iSetup.get().get(trackerTopologyHandle); + const auto& ttopo = *trackerTopologyHandle; + + auto collection = std::make_unique>(); + auto outputClusters = std::make_unique(); + + const uint32_t nDigis = digis.size(); + edm::DetSet * detDigis=nullptr; + for (uint32_t i = 0; i < nDigis; i++) { + if (digis.pdigi(i)==0) continue; + detDigis = &collection->find_or_insert(digis.rawIdArr(i)); 
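    // This first pass only primes detDigis: it locates the first genuine
    // digi, creates the corresponding DetSet (the next lines reserve some
    // space and break out of the loop). The full loop over all digis below
    // re-seeds detDigis whenever the raw detector id changes.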
+ if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations + break; + } + + int32_t nclus=-1; + std::vector aclusters(1024); + auto totCluseFilled=0; + + auto fillClusters = [&](uint32_t detId){ + if (nclus<0) return; // this in reality should never happen + edmNew::DetSetVector::FastFiller spc(*outputClusters, detId); + auto layer = (DetId(detId).subdetId()==1) ? ttopo.pxbLayer(detId) : 0; + auto clusterThreshold = (layer==1) ? 2000 : 4000; + for (int32_t ic=0; ic9000) continue; // not in cluster; TODO add an assert for the size + assert(digis.rawIdArr(i) > 109999); + if ( (*detDigis).detId() != digis.rawIdArr(i)) + { + fillClusters((*detDigis).detId()); + assert(nclus==-1); + detDigis = &collection->find_or_insert(digis.rawIdArr(i)); + if ( (*detDigis).empty() ) + (*detDigis).data.reserve(32); // avoid the first relocations + else { std::cout << "Problem det present twice in input! " << (*detDigis).detId() << std::endl; } + } + (*detDigis).data.emplace_back(digis.pdigi(i)); + auto const & dig = (*detDigis).data.back(); + // fill clusters + assert(digis.clus(i)>=0); + assert(digis.clus(i)<1024); + nclus = std::max(digis.clus(i),nclus); + auto row = dig.row(); + auto col = dig.column(); + SiPixelCluster::PixelPos pix(row,col); + aclusters[digis.clus(i)].add(pix, digis.adc(i)); + } + + // fill final clusters + fillClusters((*detDigis).detId()); + //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; + + iEvent.put(digiPutToken_, std::move(collection)); + iEvent.put(clusterPutToken_, std::move(outputClusters)); +} + +DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoA); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc new file mode 100644 index 0000000000000..5dc04009f4832 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -0,0 +1,243 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" +#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/FEDRawData/interface/FEDNumbering.h" +#include "DataFormats/FEDRawData/interface/FEDRawData.h" +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include 
"FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" +#include "RecoTracker/Record/interface/CkfComponentsRecord.h" + +#include "SiPixelRawToClusterGPUKernel.h" + +#include +#include +#include + +class SiPixelRawToClusterCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig); + ~SiPixelRawToClusterCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT rawGetToken_; + + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> digiErrorPutToken_; + edm::EDPutTokenT> clusterPutToken_; + + CUDAContextToken ctxTmp_; + + edm::ESWatcher recordWatcher; + + std::string cablingMapLabel_; + std::unique_ptr cabling_; + std::vector fedIds_; + const SiPixelFedCablingMap *cablingMap_ = nullptr; + std::unique_ptr regions_; + + pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; + PixelDataFormatter::Errors errors_; + + const bool includeErrors_; + const bool useQuality_; + const bool usePilotBlade_; + const bool convertADCtoElectrons_; +}; + +SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig): + rawGetToken_(consumes(iConfig.getParameter("InputLabel"))), + digiPutToken_(produces>()), + clusterPutToken_(produces>()), + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + includeErrors_(iConfig.getParameter("IncludeErrors")), + useQuality_(iConfig.getParameter("UseQualityInfo")), + usePilotBlade_(iConfig.getParameter ("UsePilotBlade")), // Control the usage of pilot-blade data, FED=40 + convertADCtoElectrons_(iConfig.getParameter("ConvertADCtoElectrons")) +{ + if(includeErrors_) { + digiErrorPutToken_ = produces>(); + } + + // regions + if(!iConfig.getParameter("Regions").getParameterNames().empty()) { + regions_ = std::make_unique(iConfig, consumesCollector()); + } + + if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; +} + +void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("IncludeErrors",true); + desc.add("UseQualityInfo",false); + desc.add("UsePilotBlade",false)->setComment("## Use pilot blades"); + desc.add("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering"); + desc.add("InputLabel",edm::InputTag("rawDataCollector")); + { + edm::ParameterSetDescription psd0; + psd0.addOptional>("inputs"); + psd0.addOptional>("deltaPhi"); + psd0.addOptional>("maxZ"); + psd0.addOptional("beamSpot"); + desc.add("Regions",psd0)->setComment("## Empty Regions PSet means complete unpacking"); + } + desc.add("CablingMapLabel","")->setComment("CablingMap label"); //Tav + descriptions.addWithDefaultLabel(desc); +} + + +void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + edm::ESHandle hgpuMap; + iSetup.get().get(hgpuMap); + 
if(hgpuMap->hasQuality() != useQuality_) { + throw cms::Exception("LogicError") << "UseQuality of the module (" << useQuality_ << ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."; + } + // get the GPU product already here so that the async transfer can begin + const auto *gpuMap = hgpuMap->getGPUProductAsync(ctx.stream()); + + edm::ESHandle hgains; + iSetup.get().get(hgains); + // get the GPU product already here so that the async transfer can begin + const auto *gpuGains = hgains->getGPUProductAsync(ctx.stream()); + + cudautils::device::unique_ptr modulesToUnpackRegional; + const unsigned char *gpuModulesToUnpack; + + if(regions_) { + regions_->run(iEvent, iSetup); + LogDebug("SiPixelRawToCluster") << "region2unpack #feds: "<nFEDs(); + LogDebug("SiPixelRawToCluster") << "region2unpack #modules (BPIX,EPIX,total): "<nBarrelModules()<<" "<nForwardModules()<<" "<nModules(); + modulesToUnpackRegional = hgpuMap->getModToUnpRegionalAsync(*(regions_->modulesToUnpack()), ctx.stream()); + gpuModulesToUnpack = modulesToUnpackRegional.get(); + } + else { + gpuModulesToUnpack = hgpuMap->getModToUnpAllAsync(ctx.stream()); + } + + // initialize cabling map or update if necessary + if (recordWatcher.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); //Tav + cablingMap_ = cablingMap.product(); + fedIds_ = cablingMap->fedIds(); + cabling_ = cablingMap->cablingTree(); + LogDebug("map version:")<< cabling_->version(); + } + + const auto& buffers = iEvent.get(rawGetToken_); + + errors_.clear(); + + // GPU specific: Data extraction for RawToDigi GPU + unsigned int wordCounterGPU = 0; + unsigned int fedCounter = 0; + bool errorsInEvent = false; + + // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() + ErrorChecker errorcheck; + auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream()); + for(int fedId: fedIds_) { + if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data + if (regions_ && !regions_->mayUnpackFED(fedId)) continue; + + // for GPU + // first 150 index stores the fedId and next 150 will store the + // start index of word in that fed + assert(fedId>=1200); + fedCounter++; + + // get event data for this fed + const FEDRawData& rawData = buffers.FEDData( fedId ); + + // GPU specific + int nWords = rawData.size()/sizeof(cms_uint64_t); + if (nWords == 0) { + continue; + } + + // check CRC bit + const cms_uint64_t* trailer = reinterpret_cast(rawData.data())+(nWords-1); + if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) { + continue; + } + + // check headers + const cms_uint64_t* header = reinterpret_cast(rawData.data()); header--; + bool moreHeaders = true; + while (moreHeaders) { + header++; + bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_); + moreHeaders = headerStatus; + } + + // check trailers + bool moreTrailers = true; + trailer++; + while (moreTrailers) { + trailer--; + bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_); + moreTrailers = trailerStatus; + } + + const cms_uint32_t * bw = (const cms_uint32_t *)(header+1); + const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); + + assert(0 == (ew-bw)%2); + wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + wordCounterGPU+=(ew-bw); + + } // end of for 
loop + + gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains, + wordFedAppender, + std::move(errors_), + wordCounterGPU, fedCounter, convertADCtoElectrons_, + useQuality_, includeErrors_, + edm::MessageDrop::instance()->debugEnabled, + ctx.stream()); + + ctxTmp_ = ctx.toToken(); +} + +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + CUDAScopedContext ctx{std::move(ctxTmp_)}; + + auto tmp = gpuAlgo_.getResults(); + ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); + ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); + if(includeErrors_) { + ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + } +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelRawToClusterCUDA); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 1388ed4852b25..fead8e59a0db3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -44,13 +44,8 @@ namespace pixelgpudetails { - // data structures size - constexpr uint32_t vsize = sizeof(GPU::SimpleVector); - constexpr uint32_t esize = sizeof(pixelgpudetails::error_obj); - // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { edm::Service cs; @@ -397,7 +392,7 @@ namespace pixelgpudetails { const uint32_t wordCounter, const uint32_t *word, const uint8_t *fedIds, uint16_t *xx, uint16_t *yy, uint16_t *adc, uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId, - GPU::SimpleVector *err, + GPU::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) { //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, begin, end); @@ -432,7 +427,7 @@ namespace pixelgpudetails { if (includeErrors and skipROC) { uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); - err->push_back(pixelgpudetails::error_obj{rID, ww, errorType, fedId}); + err->push_back(PixelErrorCompact{rID, ww, errorType, fedId}); continue; } @@ -476,7 +471,7 @@ namespace pixelgpudetails { if (includeErrors) { if (not rocRowColIsValid(row, col)) { uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays - err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId}); + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); if(debug) printf("BPIX1 Error status: %i\n", error); continue; } @@ -491,7 +486,7 @@ namespace pixelgpudetails { localPix.col = col; if (includeErrors and not dcolIsValid(dcol, pxid)) { uint8_t error = conversionError(fedId, 3, debug); - err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId}); + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); continue; } @@ -514,19 +509,22 @@ namespace pixelgpudetails { const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, + bool useQualityInfo, bool 
includeErrors, bool debug, cuda::stream_t<>& stream) { nDigis = wordCounter; - constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); - clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); + digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream); + if(includeErrors) { + digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); + } + clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); edm::Service cs; - digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream); + nModules_Clusters_h = cs->make_host_unique(2, stream); { const int threadsPerBlock = 512; @@ -537,20 +535,8 @@ namespace pixelgpudetails { auto word_d = cs->make_device_unique(wordCounter, stream); auto fedId_d = cs->make_device_unique(wordCounter, stream); - auto error_d = cs->make_device_unique>(stream); - auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); - auto error_h_tmp = cs->make_host_unique>(stream); - GPU::make_SimpleVector(error_h_tmp.get(), MAX_FED_WORDS, data_d.get()); - assert(error_h_tmp->size() == 0); - assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); - cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); - - auto pdigi_d = cs->make_device_unique(wordCounter, stream); - auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); // Launch rawToDigi kernel RawToDigi_kernel<<>>( @@ -560,43 +546,17 @@ namespace pixelgpudetails { word_d.get(), fedId_d.get(), digis_d.xx(), digis_d.yy(), digis_d.adc(), - pdigi_d.get(), - rawIdArr_d.get(), + digis_d.pdigi(), + digis_d.rawIdArr(), digis_d.moduleInd(), - error_d.get(), + digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors, debug); cudaCheck(cudaGetLastError()); - // copy data to host variable - if(transferToCPU) { - digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream); - digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - digis_clusters_h.data = cs->make_host_unique(MAX_FED_WORDS, stream); - digis_clusters_h.error = cs->make_host_unique>(stream); - GPU::make_SimpleVector(digis_clusters_h.error.get(), MAX_FED_WORDS, digis_clusters_h.data.get()); - assert(digis_clusters_h.error->size() == 0); - assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS)); - - cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. 
A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). - // - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = digis_clusters_h.error->size(); - //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id())); - } + if(includeErrors) { + digiErrors_d.copyErrorToHostAsync(stream); } } // End of Raw2Digi and passing data for cluserisation @@ -614,12 +574,6 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); - // calibrated adc - if(transferToCPU) { - digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - } - #ifdef GPU_DEBUG std::cout << "CUDA countModules kernel launch with " << blocks @@ -628,11 +582,11 @@ namespace pixelgpudetails { cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), wordCounter); + countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -646,7 +600,7 @@ namespace pixelgpudetails { digis_d.c_xx(), digis_d.c_yy(), clusters_d.c_moduleStart(), clusters_d.clusInModule(), clusters_d.moduleId(), - clusters_d.clus(), + digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -656,12 +610,11 @@ namespace pixelgpudetails { digis_d.c_adc(), clusters_d.c_moduleStart(), clusters_d.clusInModule(), clusters_d.c_moduleId(), - clusters_d.clus(), + digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); - // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of @@ -681,15 +634,7 @@ namespace pixelgpudetails { clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - - // clusters - if(transferToCPU) { - digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - } + cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); } // end clusterizer scope } - } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 
44bed9abc1e68..1ab8bc3fa5998 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -5,10 +5,13 @@ #include #include "cuda/api_wrappers.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "siPixelRawToClusterHeterogeneousProduct.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" struct SiPixelFedCablingMapGPU; class SiPixelGainForHLTonGPU; @@ -152,34 +155,8 @@ namespace pixelgpudetails { } - using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj; - - class SiPixelRawToClusterGPUKernel { public: - - using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - - struct CPUData { - CPUData() = default; - ~CPUData() = default; - - CPUData(const CPUData&) = delete; - CPUData& operator=(const CPUData&) = delete; - CPUData(CPUData&&) = default; - CPUData& operator=(CPUData&&) = default; - - edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product - - edm::cuda::host::unique_ptr data; - edm::cuda::host::unique_ptr> error; - - edm::cuda::host::unique_ptr pdigi; - edm::cuda::host::unique_ptr rawIdArr; - edm::cuda::host::unique_ptr adc; - edm::cuda::host::unique_ptr clus; - }; - class WordFedAppender { public: WordFedAppender(cuda::stream_t<>& cudaStream); @@ -191,8 +168,8 @@ namespace pixelgpudetails { const unsigned char *fedId() const { return fedId_.get(); } private: - edm::cuda::host::unique_ptr word_; - edm::cuda::host::unique_ptr fedId_; + cudautils::host::unique_ptr word_; + cudautils::host::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; @@ -207,62 +184,38 @@ namespace pixelgpudetails { void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, + bool useQualityInfo, bool includeErrors, bool debug, cuda::stream_t<>& stream); - siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { - return siPixelRawToClusterHeterogeneousProduct::GPUProduct( - std::move(digis_d), std::move(clusters_d), - nDigis, - digis_clusters_h.nModules_Clusters[0], - digis_clusters_h.nModules_Clusters[1] - ); + std::pair getResults() { + digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis); + clusters_d.setNClusters(nModules_Clusters_h[1]); + // need to explicitly deallocate while the associated CUDA + // stream is still alive + // + // technically the statement above is not true anymore now that + // the CUDA streams are cached within the CUDAService, but it is + // still better to release as early as possible + nModules_Clusters_h.reset(); + return std::make_pair(std::move(digis_d), std::move(clusters_d)); } - CPUData&& getCPUData() { - // Set the vector data pointer to point to CPU - digis_clusters_h.error->set_data(digis_clusters_h.data.get()); - return 
std::move(digis_clusters_h); + SiPixelDigiErrorsCUDA&& getErrors() { + return std::move(digiErrors_d); } private: uint32_t nDigis = 0; - // CPU data - CPUData digis_clusters_h; - // Data to be put in the event + cudautils::host::unique_ptr nModules_Clusters_h; SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; + SiPixelDigiErrorsCUDA digiErrors_d; }; - // configuration and memory buffers allocated on the GPU - struct context { - uint32_t * word_d; - uint8_t * fedId_d; - uint32_t * pdigi_d; - uint16_t * xx_d; - uint16_t * yy_d; - uint16_t * adc_d; - uint16_t * moduleInd_d; - uint32_t * rawIdArr_d; - - GPU::SimpleVector * error_d; - error_obj * data_d; - - // these are for the clusterizer (to be moved) - uint32_t * moduleStart_d; - int32_t * clus_d; - uint32_t * clusInModule_d; - uint32_t * moduleId_d; - uint32_t * debug_d; - }; - - // void initCablingMap(); - context initDeviceMemory(); - void freeMemory(context &); - // see RecoLocalTracker/SiPixelClusterizer // all are runtime const, should be specified in python _cfg.py struct ADCThreshold { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc deleted file mode 100644 index 905bc297b394d..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ /dev/null @@ -1,739 +0,0 @@ -// C++ includes -#include -#include -#include -#include - -// CUDA includes -#include -#include - -// CMSSW includes -#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" -#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" -#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTService.h" -#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" -#include "CondFormats/DataRecord/interface/SiPixelQualityRcd.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelQuality.h" -#include "DataFormats/Common/interface/DetSetVector.h" -#include "DataFormats/Common/interface/Handle.h" -#include "DataFormats/DetId/interface/DetIdCollection.h" -#include "DataFormats/FEDRawData/interface/FEDNumbering.h" -#include "DataFormats/FEDRawData/interface/FEDRawData.h" -#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" -#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" -#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" -#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" -#include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h" -#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" -#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" -#include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h" -#include "FWCore/Framework/interface/ConsumesCollector.h" -#include "FWCore/Framework/interface/ESHandle.h" -#include "FWCore/Framework/interface/ESTransientHandle.h" -#include "FWCore/Framework/interface/ESWatcher.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include
"FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/PluginManager/interface/ModuleDef.h" -#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" -#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h" -#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" -#include "RecoTracker/Record/interface/CkfComponentsRecord.h" - -#include "SiPixelRawToClusterGPUKernel.h" -#include "siPixelRawToClusterHeterogeneousProduct.h" -#include "PixelThresholdClusterizer.h" - -namespace { - struct AccretionCluster { - typedef unsigned short UShort; - static constexpr UShort MAXSIZE = 256; - UShort adc[MAXSIZE]; - UShort x[MAXSIZE]; - UShort y[MAXSIZE]; - UShort xmin=16000; - UShort ymin=16000; - unsigned int isize=0; - int charge=0; - - void clear() { - isize=0; - charge=0; - xmin=16000; - ymin=16000; - } - - bool add(SiPixelCluster::PixelPos const & p, UShort const iadc) { - if (isize==MAXSIZE) return false; - xmin=std::min(xmin,(unsigned short)(p.row())); - ymin=std::min(ymin,(unsigned short)(p.col())); - adc[isize]=iadc; - x[isize]=p.row(); - y[isize++]=p.col(); - charge+=iadc; - return true; - } - }; - - constexpr uint32_t dummydetid = 0xffffffff; -} - -class SiPixelRawToClusterHeterogeneous: public HeterogeneousEDProducer > { -public: - using CPUProduct = siPixelRawToClusterHeterogeneousProduct::CPUProduct; - using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - using Output = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster; - - explicit SiPixelRawToClusterHeterogeneous(const edm::ParameterSet& iConfig); - ~SiPixelRawToClusterHeterogeneous() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - // CPU implementation - void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; - - // GPU implementation - void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& ev, unsigned int nDigis, pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; - - // Commonalities - const FEDRawDataCollection *initialize(const edm::Event& ev, const edm::EventSetup& es); - - std::unique_ptr cabling_; - const SiPixelQuality *badPixelInfo_ = nullptr; - const SiPixelFedCablingMap *cablingMap_ = nullptr; -std::unique_ptr regions_; - edm::EDGetTokenT tFEDRawDataCollection; - - bool includeErrors; - bool useQuality; - bool debug; - std::vector tkerrorlist; - std::vector usererrorlist; - std::vector fedIds; - - edm::ESWatcher recordWatcher; - edm::ESWatcher qualityWatcher; - - bool usePilotBlade; - bool usePhase1; - bool convertADCtoElectrons; - std::string cablingMapLabel; - - // clusterizer - PixelThresholdClusterizer clusterizer_; - const TrackerGeometry *geom_ = nullptr; - const TrackerTopology *ttopo_ = nullptr; - - // gain calib - SiPixelGainCalibrationForHLTService theSiPixelGainCalibration_; - - // GPU algo - 
pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; - PixelDataFormatter::Errors errors_; - - bool enableTransfer_; - bool enableConversion_; -}; - -SiPixelRawToClusterHeterogeneous::SiPixelRawToClusterHeterogeneous(const edm::ParameterSet& iConfig): - HeterogeneousEDProducer(iConfig), - clusterizer_(iConfig), - theSiPixelGainCalibration_(iConfig) { - includeErrors = iConfig.getParameter("IncludeErrors"); - useQuality = iConfig.getParameter("UseQualityInfo"); - tkerrorlist = iConfig.getParameter > ("ErrorList"); - usererrorlist = iConfig.getParameter > ("UserErrorList"); - tFEDRawDataCollection = consumes (iConfig.getParameter("InputLabel")); - - enableConversion_ = iConfig.getParameter("gpuEnableConversion"); - enableTransfer_ = enableConversion_ || iConfig.getParameter("gpuEnableTransfer"); - - clusterizer_.setSiPixelGainCalibrationService(&theSiPixelGainCalibration_); - - // Products in GPU - produces(); - // Products in CPU - if(enableConversion_) { - produces>(); - if(includeErrors) { - produces>(); - produces(); - produces("UserErrorModules"); - produces(); - produces>(); - } - } - - // regions - if(!iConfig.getParameter("Regions").getParameterNames().empty()) { - regions_ = std::make_unique(iConfig, consumesCollector()); - } - - // Control the usage of pilot-blade data, FED=40 - usePilotBlade = iConfig.getParameter ("UsePilotBlade"); - if(usePilotBlade) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; - - // Control the usage of phase1 - usePhase1 = iConfig.getParameter ("UsePhase1"); - if(usePhase1) edm::LogInfo("SiPixelRawToCluster") << " Using phase1"; - - //CablingMap could have a label //Tav - cablingMapLabel = iConfig.getParameter ("CablingMapLabel"); - - convertADCtoElectrons = iConfig.getParameter("ConvertADCtoElectrons"); -} - -void SiPixelRawToClusterHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("IncludeErrors",true); - desc.add("UseQualityInfo",false); - { - std::vector temp1; - temp1.reserve(1); - temp1.push_back(29); - desc.add >("ErrorList",temp1)->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); - } - { - std::vector temp1; - temp1.reserve(1); - temp1.push_back(40); - desc.add >("UserErrorList",temp1)->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); - } - desc.add("InputLabel",edm::InputTag("rawDataCollector")); - { - edm::ParameterSetDescription psd0; - psd0.addOptional>("inputs"); - psd0.addOptional>("deltaPhi"); - psd0.addOptional>("maxZ"); - psd0.addOptional("beamSpot"); - desc.add("Regions",psd0)->setComment("## Empty Regions PSet means complete unpacking"); - } - desc.add("UsePilotBlade",false)->setComment("## Use pilot blades"); - desc.add("UsePhase1",false)->setComment("## Use phase1"); - desc.add("CablingMapLabel","")->setComment("CablingMap label"); //Tav - desc.addOptional("CheckPixelOrder"); // never used, kept for back-compatibility - - desc.add("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, required for clustering"); - - // clusterizer - desc.add("ChannelThreshold", 1000); - desc.add("SeedThreshold", 1000); - desc.add("ClusterThreshold", 4000); - desc.add("ClusterThreshold_L1", 4000); - desc.add("VCaltoElectronGain", 65); - desc.add("VCaltoElectronGain_L1", 65); - desc.add("VCaltoElectronOffset", -414); - desc.add("VCaltoElectronOffset_L1", -414); - desc.add("MissCalibrate", true); -
desc.add("SplitClusters", false); - desc.add("ElectronPerADCGain", 135.); - // Phase 2 clusterizer - desc.add("Phase2Calibration", false); - desc.add("Phase2ReadoutMode", -1); - desc.add("Phase2DigiBaseline", 1200.); - desc.add("Phase2KinkADC", 8); - - desc.add("gpuEnableTransfer", true); - desc.add("gpuEnableConversion", true); - - HeterogeneousEDProducer::fillPSetDescription(desc); - - descriptions.add("siPixelClustersHeterogeneousDefault",desc); -} - -const FEDRawDataCollection *SiPixelRawToClusterHeterogeneous::initialize(const edm::Event& ev, const edm::EventSetup& es) { - debug = edm::MessageDrop::instance()->debugEnabled; - - // setup gain calibration service - theSiPixelGainCalibration_.setESObjects( es ); - - // initialize cabling map or update if necessary - if (recordWatcher.check( es )) { - // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) - edm::ESTransientHandle cablingMap; - es.get().get( cablingMapLabel, cablingMap ); //Tav - cablingMap_ = cablingMap.product(); - fedIds = cablingMap->fedIds(); - cabling_ = cablingMap->cablingTree(); - LogDebug("map version:")<< cabling_->version(); - } - // initialize quality record or update if necessary - if (qualityWatcher.check( es )&&useQuality) { - // quality info for dead pixel modules or ROCs - edm::ESHandle qualityInfo; - es.get().get( qualityInfo ); - badPixelInfo_ = qualityInfo.product(); - if (!badPixelInfo_) { - edm::LogError("SiPixelQualityNotPresent")<<" Configured to use SiPixelQuality, but SiPixelQuality not present"; - } - } - - // tracker geometry: to make sure numbering of DetId is consistent... - edm::ESHandle geom; - es.get().get(geom); - geom_ = geom.product(); - - edm::ESHandle trackerTopologyHandle; - es.get().get(trackerTopologyHandle); - ttopo_ = trackerTopologyHandle.product(); - - if (regions_) { - regions_->run(ev, es); - LogDebug("SiPixelRawToCluster") << "region2unpack #feds: "<nFEDs(); - LogDebug("SiPixelRawToCluster") << "region2unpack #modules (BPIX,EPIX,total): "<nBarrelModules()<<" "<nForwardModules()<<" "<nModules(); - } - - edm::Handle buffers; - ev.getByToken(tFEDRawDataCollection, buffers); - return buffers.product(); -} - - -// ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, const edm::EventSetup& es) -{ - const auto buffers = initialize(ev.event(), es); - - // create product (digis & errors) - auto collection = std::make_unique>(); - auto errorcollection = std::make_unique>(); - auto tkerror_detidcollection = std::make_unique(); - auto usererror_detidcollection = std::make_unique(); - auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); - auto outputClusters = std::make_unique(); - // output->collection.reserve(8*1024); - - - PixelDataFormatter formatter(cabling_.get(), usePhase1); // for phase 1 & 0 - formatter.setErrorStatus(includeErrors); - if (useQuality) formatter.setQualityStatus(useQuality, badPixelInfo_); - - bool errorsInEvent = false; - PixelDataFormatter::DetErrors nodeterrors; - - if (regions_) { - formatter.setModulesToUnpack(regions_->modulesToUnpack()); - } - - for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { - int fedId = *aFed; - - if(!usePilotBlade && (fedId==40) ) continue; // skip pilot blade data - - if (regions_ && !regions_->mayUnpackFED(fedId)) continue; - - if(debug) LogDebug("SiPixelRawToCluster")<< " PRODUCE DIGI FOR FED: " << fedId; - - PixelDataFormatter::Errors 
errors; - - //get event data for this fed - const FEDRawData& fedRawData = buffers->FEDData( fedId ); - - //convert data to digi and strip off errors - formatter.interpretRawData( errorsInEvent, fedId, fedRawData, *collection, errors); - - //pack errors into collection - if(includeErrors) { - typedef PixelDataFormatter::Errors::iterator IE; - for (IE is = errors.begin(); is != errors.end(); is++) { - uint32_t errordetid = is->first; - if (errordetid==dummydetid) { // errors given dummy detId must be sorted by Fed - nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); - } else { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); - errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); - // Fill detid of the detectors where there is error AND the error number is listed - // in the configurable error list in the job option cfi. - // Code needs to be here, because there can be a set of errors for each - // entry in the for loop over PixelDataFormatter::Errors - - std::vector disabledChannelsDetSet; - - for (auto const& aPixelError : errorDetSet) { - // For the time being, we extend the error handling functionality with ErrorType 25 - // In the future, we should sort out how the usage of tkerrorlist can be generalized - if (aPixelError.getType()==25) { - assert(aPixelError.getFedId()==fedId); - const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); - if (fed) { - cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); - const sipixelobjects::PixelFEDLink* link = fed->link(linkId); - if (link) { - // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it - // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme - PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; - for (unsigned int iRoc=1; iRoc<=link->numberOfROCs(); iRoc++) { - const sipixelobjects::PixelROC * roc = link->roc(iRoc); - if (roc->idInDetUnit()idInDetUnit(); - if (roc->idInDetUnit()>ch.roc_last) ch.roc_last=roc->idInDetUnit(); - } - disabledChannelsDetSet.push_back(ch); - } - } - } else { - // fill list of detIds to be turned off by tracking - if(!tkerrorlist.empty()) { - auto it_find = std::find(tkerrorlist.begin(), tkerrorlist.end(), aPixelError.getType()); - if(it_find != tkerrorlist.end()){ - tkerror_detidcollection->push_back(errordetid); - } - } - } - - // fill list of detIds with errors to be studied - if(!usererrorlist.empty()) { - auto it_find = std::find(usererrorlist.begin(), usererrorlist.end(), aPixelError.getType()); - if(it_find != usererrorlist.end()){ - usererror_detidcollection->push_back(errordetid); - } - } - - } // loop on DetSet of errors - - if (!disabledChannelsDetSet.empty()) { - disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); - } - - } // if error assigned to a real DetId - } // loop on errors in event for this FED - } // if errors to be included in the event - } // loop on FED data to be unpacked - - if(includeErrors) { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); - errorDetSet.data = nodeterrors; - } - if (errorsInEvent) LogDebug("SiPixelRawToCluster") << "Error words were stored in this event"; - - // clusterize, originally from SiPixelClusterProducer - for(const auto detset: *collection) { - const auto detId = DetId(detset.detId()); - - std::vector badChannels; // why do we need this? 
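Stepping back from the deleted legacy code for a moment: the error-packing loop above separates errors filed under the dummy detid (0xffffffff), which are only meaningful per event, from errors tied to a real DetId, which go into per-detector sets. Below is a condensed, self-contained sketch of just that separation, using simplified stand-in types; RawDataError, Errors, packErrors and perDetector are illustrative names, not the CMSSW classes, and the PixelFEDChannel handling for ErrorType 25 done by the real code is omitted.

#include <cstdint>
#include <map>
#include <vector>

// Simplified stand-ins for SiPixelRawDataError and PixelDataFormatter::Errors.
struct RawDataError {
  uint32_t word;
  int type;
  int fedId;
};
using Errors = std::map<uint32_t, std::vector<RawDataError>>;

constexpr uint32_t dummyDetId = 0xffffffff;

// Split the per-DetId error map the way produceCPU does: errors recorded
// under the dummy detid are appended to a single per-event list
// (nodeterrors), everything else to a per-detector collection.
void packErrors(const Errors& errors,
                std::vector<RawDataError>& nodeterrors,
                std::map<uint32_t, std::vector<RawDataError>>& perDetector) {
  for (const auto& [detid, errs] : errors) {
    if (detid == dummyDetId) {
      nodeterrors.insert(nodeterrors.end(), errs.begin(), errs.end());
    } else {
      auto& detSet = perDetector[detid];  // plays the role of find_or_insert
      detSet.insert(detSet.end(), errs.begin(), errs.end());
    }
  }
}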
- - // Comment: At the moment the clusterizer depends on geometry - // to access information as the pixel topology (number of columns - // and rows in a detector module). - // In the future the geometry service will be replaced with - // a ES service. - const GeomDetUnit * geoUnit = geom_->idToDetUnit( detId ); - const PixelGeomDetUnit * pixDet = dynamic_cast(geoUnit); - edmNew::DetSetVector::FastFiller spc(*outputClusters, detset.detId()); - clusterizer_.clusterizeDetUnit(detset, pixDet, ttopo_, badChannels, spc); - if ( spc.empty() ) { - spc.abort(); - } - } - outputClusters->shrink_to_fit(); - - //send digis and errors back to framework - ev.put(std::move(collection)); - if(includeErrors){ - ev.put(std::move(errorcollection)); - ev.put(std::move(tkerror_detidcollection)); - ev.put(std::move(usererror_detidcollection), "UserErrorModules"); - ev.put(std::move(disabled_channelcollection)); - } - ev.put(std::move(outputClusters)); -} - -// ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - const auto buffers = initialize(ev.event(), es); - - edm::ESHandle hgpuMap; - es.get().get(hgpuMap); - if(hgpuMap->hasQuality() != useQuality) { - throw cms::Exception("LogicError") << "UseQuality of the module (" << useQuality<< ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."; - } - // get the GPU product already here so that the async transfer can begin - const auto *gpuMap = hgpuMap->getGPUProductAsync(cudaStream); - - edm::cuda::device::unique_ptr modulesToUnpackRegional; - const unsigned char *gpuModulesToUnpack; - if (regions_) { - modulesToUnpackRegional = hgpuMap->getModToUnpRegionalAsync(*(regions_->modulesToUnpack()), cudaStream); - gpuModulesToUnpack = modulesToUnpackRegional.get(); - } - else { - gpuModulesToUnpack = hgpuMap->getModToUnpAllAsync(cudaStream); - } - - - edm::ESHandle hgains; - es.get().get(hgains); - - errors_.clear(); - - // GPU specific: Data extraction for RawToDigi GPU - unsigned int wordCounterGPU = 0; - unsigned int fedCounter = 0; - bool errorsInEvent = false; - - // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() - ErrorChecker errorcheck; - auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(cudaStream); - for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { - int fedId = *aFed; - - if (!usePilotBlade && (fedId==40) ) continue; // skip pilot blade data - if (regions_ && !regions_->mayUnpackFED(fedId)) continue; - - // for GPU - // first 150 index stores the fedId and next 150 will store the - // start index of word in that fed - assert(fedId>=1200); - fedCounter++; - - // get event data for this fed - const FEDRawData& rawData = buffers->FEDData( fedId ); - - // GPU specific - int nWords = rawData.size()/sizeof(cms_uint64_t); - if (nWords == 0) { - continue; - } - - // check CRC bit - const cms_uint64_t* trailer = reinterpret_cast(rawData.data())+(nWords-1); - if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) { - continue; - } - - // check headers - const cms_uint64_t* header = reinterpret_cast(rawData.data()); header--; - bool moreHeaders = true; - while (moreHeaders) { - header++; - bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_); - moreHeaders = headerStatus; - } - - // check trailers - bool moreTrailers = true; - trailer++; - 
while (moreTrailers) { - trailer--; - bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_); - moreTrailers = trailerStatus; - } - - const cms_uint32_t * bw = (const cms_uint32_t *)(header+1); - const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); - - assert(0 == (ew-bw)%2); - wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); - wordCounterGPU+=(ew-bw); - - } // end of for loop - - gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, hgains->getGPUProductAsync(cudaStream), - wordFedAppender, - wordCounterGPU, fedCounter, convertADCtoElectrons, - useQuality, includeErrors, enableTransfer_, debug, cudaStream); -} - -void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_.getProduct()); - - if(enableConversion_) { - convertGPUtoCPU(ev.event(), output->nDigis, gpuAlgo_.getCPUData()); - } - - ev.put(std::move(output), heterogeneous::DisableTransfer{}); -} - -void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, - unsigned int nDigis, - pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData digis_clusters_h) const { - // TODO: add the transfers here as well? - - auto collection = std::make_unique>(); - auto errorcollection = std::make_unique>(); - auto tkerror_detidcollection = std::make_unique(); - auto usererror_detidcollection = std::make_unique(); - auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); - auto outputClusters = std::make_unique(); - - edm::DetSet * detDigis=nullptr; - for (uint32_t i = 0; i < nDigis; i++) { - if (digis_clusters_h.pdigi[i]==0) continue; - detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); - if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations - break; - } - - int32_t nclus=-1; - std::vector aclusters(1024); - auto totCluseFilled=0; - - auto fillClusters = [&](uint32_t detId){ - if (nclus<0) return; // this in reality should never happen - edmNew::DetSetVector::FastFiller spc(*outputClusters, detId); - auto layer = (DetId(detId).subdetId()==1) ? ttopo_->pxbLayer(detId) : 0; - auto clusterThreshold = (layer==1) ? 2000 : 4000; - for (int32_t ic=0; ic9000) continue; // not in cluster - assert(digis_clusters_h.rawIdArr[i] > 109999); - if ( (*detDigis).detId() != digis_clusters_h.rawIdArr[i]) - { - fillClusters((*detDigis).detId()); - assert(nclus==-1); - detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); - if ( (*detDigis).empty() ) - (*detDigis).data.reserve(32); // avoid the first relocations - else { std::cout << "Problem det present twice in input! 
" << (*detDigis).detId() << std::endl; } - } - (*detDigis).data.emplace_back(digis_clusters_h.pdigi[i]); - auto const & dig = (*detDigis).data.back(); - // fill clusters - assert(digis_clusters_h.clus[i]>=0); - assert(digis_clusters_h.clus[i]<1024); - nclus = std::max(digis_clusters_h.clus[i],nclus); - auto row = dig.row(); - auto col = dig.column(); - SiPixelCluster::PixelPos pix(row,col); - aclusters[digis_clusters_h.clus[i]].add(pix, digis_clusters_h.adc[i]); - } - - // fill final clusters - fillClusters((*detDigis).detId()); - //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; - - PixelDataFormatter formatter(cabling_.get(), usePhase1); // for phase 1 & 0 - auto errors = errors_; // make a copy - PixelDataFormatter::DetErrors nodeterrors; - - auto size = digis_clusters_h.error->size(); - for (auto i = 0; i < size; i++) { - pixelgpudetails::error_obj err = (*digis_clusters_h.error)[i]; - if (err.errorType != 0) { - SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); - errors[err.rawId].push_back(error); - } - } - - // pack errors into collection - if (includeErrors) { - - typedef PixelDataFormatter::Errors::iterator IE; - for (IE is = errors.begin(); is != errors.end(); is++) { - - uint32_t errordetid = is->first; - if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed - nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); - } - else { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); - errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); - // Fill detid of the detectors where there is error AND the error number is listed - // in the configurable error list in the job option cfi. - // Code needs to be here, because there can be a set of errors for each - // entry in the for loop over PixelDataFormatter::Errors - - std::vector disabledChannelsDetSet; - - for (auto const& aPixelError : errorDetSet) { - // For the time being, we extend the error handling functionality with ErrorType 25 - // In the future, we should sort out how the usage of tkerrorlist can be generalized - if (aPixelError.getType() == 25) { - int fedId = aPixelError.getFedId(); - const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); - if (fed) { - cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); - const sipixelobjects::PixelFEDLink* link = fed->link(linkId); - if (link) { - // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it - // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme - PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; - for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { - const sipixelobjects::PixelROC * roc = link->roc(iRoc); - if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); - if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); - } - if (ch.roc_firstpush_back(errordetid); - } - } - } - - // fill list of detIds with errors to be studied - if (!usererrorlist.empty()) { - auto it_find = std::find(usererrorlist.begin(), usererrorlist.end(), aPixelError.getType()); - if (it_find != usererrorlist.end()) { - usererror_detidcollection->push_back(errordetid); - } - } - - } // loop on DetSet of errors - - if (!disabledChannelsDetSet.empty()) { - disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); - } - - } // 
if error assigned to a real DetId - } // loop on errors in event for this FED - } // if errors to be included in the event - - if (includeErrors) { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); - errorDetSet.data = nodeterrors; - } - - ev.put(std::move(collection)); - if(includeErrors){ - ev.put(std::move(errorcollection)); - ev.put(std::move(tkerror_detidcollection)); - ev.put(std::move(usererror_detidcollection), "UserErrorModules"); - ev.put(std::move(disabled_channelcollection)); - } - ev.put(std::move(outputClusters)); -} - -// define as framework plugin -DEFINE_FWK_MODULE(SiPixelRawToClusterHeterogeneous); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h deleted file mode 100644 index 3b81e4a16f017..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h -#define EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h - -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "FWCore/Utilities/interface/typedefs.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -namespace siPixelRawToClusterHeterogeneousProduct { - using CPUProduct = int; // dummy... - - struct error_obj { - uint32_t rawId; - uint32_t word; - unsigned char errorType; - unsigned char fedId; - }; - - // FIXME split in two - struct GPUProduct { - GPUProduct() = default; - GPUProduct(const GPUProduct&) = delete; - GPUProduct& operator=(const GPUProduct&) = delete; - GPUProduct(GPUProduct&&) = default; - GPUProduct& operator=(GPUProduct&&) = default; - - GPUProduct(SiPixelDigisCUDA&& digis, - SiPixelClustersCUDA&& clusters, - uint32_t ndig, uint32_t nmod, uint32_t nclus): - digis_d(std::move(digis)), clusters_d(std::move(clusters)), - nDigis(ndig), nModules(nmod), nClusters(nclus) - {} - - SiPixelDigisCUDA digis_d; - SiPixelClustersCUDA clusters_d; - - uint32_t nDigis; - uint32_t nModules; - uint32_t nClusters; - }; - - using HeterogeneousDigiCluster = HeterogeneousProductImpl, - heterogeneous::GPUCudaProduct >; -} - -#endif diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py index bb0bb85697a99..b9c6862b015bf 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py @@ -2,10 +2,16 @@ from CondTools.SiPixel.SiPixelGainCalibrationService_cfi import * from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizer_cfi import siPixelClusters as _siPixelClusters -siPixelClustersPreSplitting = _siPixelClusters.clone() +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelClustersPreSplitting = SwitchProducerCUDA( + cpu = _siPixelClusters.clone() +) from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import siPixelClustersHeterogeneous as _siPixelClustersHeterogeneous -from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * 
-from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * -gpu.toReplaceWith(siPixelClustersPreSplitting, _siPixelClustersHeterogeneous.clone()) +gpu.toModify(siPixelClustersPreSplitting, + cuda = cms.EDAlias( + siPixelDigisClustersPreSplitting = cms.VPSet( + cms.PSet(type = cms.string("SiPixelClusteredmNewDetSetVector")) + ) + ) +) diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py new file mode 100644 index 0000000000000..c80f3b16b3a43 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py @@ -0,0 +1,21 @@ +import FWCore.ParameterSet.Config as cms + +from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import siPixelClustersPreSplitting +from RecoLocalTracker.SiPixelClusterizer.siPixelRawToClusterCUDA_cfi import siPixelRawToClusterCUDA as _siPixelRawToClusterCUDA +from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA +from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * +from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * + +siPixelClustersPreSplittingTask = cms.Task(siPixelClustersPreSplitting) + +siPixelClustersCUDAPreSplitting = _siPixelRawToClusterCUDA.clone() +siPixelDigisClustersPreSplitting = _siPixelDigisClustersFromSoA.clone() +siPixelClustersPreSplittingTaskCUDA = cms.Task( + siPixelClustersCUDAPreSplitting, + siPixelDigisClustersPreSplitting, +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelClustersPreSplittingTask_gpu = siPixelClustersPreSplittingTask.copy() +_siPixelClustersPreSplittingTask_gpu.add(siPixelClustersPreSplittingTaskCUDA) +gpu.toReplaceWith(siPixelClustersPreSplittingTask, _siPixelClustersPreSplittingTask_gpu) diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index b652100f69e9f..e8726100abe0e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -135,7 +135,7 @@ const unsigned char *SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cuda::s return data.modToUnpDefault; } -edm::cuda::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const { +cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const { edm::Service cs; auto modToUnpDevice = cs->make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); auto modToUnpHost = cs->make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 947cd20d97919..80be13dedd26b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -129,18 +129,19 @@ namespace pixelgpudetails { #endif } - void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input, + void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, float const * bs, pixelCPEforGPU::ParamsOnGPU 
const * cpeParams, bool transferToCPU, cuda::stream_t<>& stream) { cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusters_d.clusModuleStart(); + gpu_.hitsModuleStart_d = clusters_d.clusModuleStart(); gpu_.cpeParams = cpeParams; // copy it for use in clients cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; - int blocks = input.nModules; // active modules (with digis) + int blocks = digis_d.nModules(); // active modules (with digis) #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; @@ -148,12 +149,12 @@ namespace pixelgpudetails { gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, - input.digis_d.moduleInd(), - input.digis_d.xx(), input.digis_d.yy(), input.digis_d.adc(), - input.clusters_d.moduleStart(), - input.clusters_d.clusInModule(), input.clusters_d.moduleId(), - input.clusters_d.clus(), - input.nDigis, + digis_d.moduleInd(), + digis_d.xx(), digis_d.yy(), digis_d.adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), clusters_d.moduleId(), + digis_d.clus(), + digis_d.nDigis(), gpu_.hitsModuleStart_d, gpu_.charge_d, gpu_.detInd_d, @@ -170,7 +171,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // needed only if hits on CPU are required... - nhits_ = input.nClusters; + nhits_ = clusters_d.nClusters(); if(transferToCPU) { cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index dcc80308c4463..49164d24ab335 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -1,7 +1,8 @@ #ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h #define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" #include @@ -31,7 +32,8 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - void makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input, + void makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 68f53a47157d4..d8e07667f976b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -1,3 +1,6 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include 
"DataFormats/BeamSpot/interface/BeamSpot.h" @@ -13,25 +16,25 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" // TODO: we need a proper place for this header... - #include "PixelRecHits.h" // TODO : spit product from kernel +#include + class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer > { public: - using Input = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster; - using CPUProduct = siPixelRecHitsHeterogeneousProduct::CPUProduct; using GPUProduct = siPixelRecHitsHeterogeneousProduct::GPUProduct; using Output = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; @@ -62,7 +65,9 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer tBeamSpot; - edm::EDGetTokenT token_; + // The mess with inputs will be cleaned up when migrating to the new framework + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; edm::EDGetTokenT clusterToken_; std::string cpeName_; @@ -78,8 +83,8 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer(iConfig.getParameter("beamSpot"))), - token_(consumesHeterogeneous(iConfig.getParameter("heterogeneousSrc"))), - clusterToken_(consumes(iConfig.getParameter("src"))), + token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), + tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), cpeName_(iConfig.getParameter("CPE")) { enableConversion_ = iConfig.getParameter("gpuEnableConversion"); @@ -87,6 +92,7 @@ SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet& produces(); if(enableConversion_) { + clusterToken_ = consumes(iConfig.getParameter("src")); produces(); } } @@ -95,7 +101,7 @@ void SiPixelRecHitHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); - desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersPreSplitting")); + desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); desc.add("CPE", "PixelCPEFast"); @@ -118,16 +124,7 @@ void SiPixelRecHitHeterogeneous::initialize(const edm::EventSetup& es) { } void SiPixelRecHitHeterogeneous::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - initialize(iSetup); - - edm::Handle hclusters; - iEvent.getByToken(clusterToken_, hclusters); - - auto output = std::make_unique(); - run(hclusters, *output); - - output->shrink_to_fit(); - iEvent.put(std::move(output)); + throw cms::Exception("NotImplemented") << "CPU version is no longer implemented"; } void SiPixelRecHitHeterogeneous::run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output) const { @@ -174,8 +171,28 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const 
edm::HeterogeneousEvent& i throw cms::Exception("Configuration") << "too bad, not a fast CPE: GPU processing is not possible"; } - edm::Handle hinput; - iEvent.getByToken(token_, hinput); + edm::Handle> hclusters; + iEvent.getByToken(token_, hclusters); + // temporary check (until the migration) + edm::Service cs; + assert(hclusters->device() == cs->getCurrentDevice()); + CUDAScopedContext ctx{*hclusters}; + auto const& clusters = ctx.get(*hclusters); + + edm::Handle> hdigis; + iEvent.getByToken(tokenDigi_, hdigis); + auto const& digis = ctx.get(*hdigis); + + // We're processing in a stream given by base class, so need to + // synchronize explicitly (implementation is from + // CUDAScopedContext). In practice these should not be needed + // (because of synchronizations upstream), but let's play generic. + if(not hclusters->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event().id(), 0)); + } + if(not hdigis->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hdigis->event().id(), 0)); + } edm::Handle bsHandle; iEvent.getByToken( tBeamSpot, bsHandle); @@ -185,8 +202,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i bs[0]=bsh.x0(); bs[1]=bsh.y0(); bs[2]=bsh.z0(); } - - gpuAlgo_->makeHitsAsync(*hinput, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); + gpuAlgo_->makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); } diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 15224adb78cc3..58935e9a6991c 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) # Disable conversions to legacy - process.siPixelClustersPreSplitting.gpuEnableConversion = False process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False process.pixelTracks.gpuEnableConversion = False @@ -33,7 +32,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process = customizePixelTracksForProfilingDisableConversion(process) # Disable "unnecessary" transfers to CPU - process.siPixelClustersPreSplitting.gpuEnableTransfer = False process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False process.pixelVertices.gpuEnableTransfer = False diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 9f5d10ad020e9..767d140a5d5ed 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -18,6 +18,7 @@ + diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index b402daef07a05..dfa08c1fa2043 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -3,8 +3,6 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -14,7 +12,7 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; __global__ -void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const SiPixelClustersCUDA::DeviceConstView *cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) +void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { assert(slp == slp->me_d); @@ -35,7 +33,7 @@ void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i)); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + cc->clus(i); + auto cl = first + dd->clus(i); assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; @@ -162,7 +160,7 @@ namespace clusterSLOnGPU { cudaCheck(cudaMemsetAsync(slgpu.n2_d, 0, (ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream)); } - void Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) { + void Kernel::algo(SiPixelDigisCUDA const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) { zero(stream.id()); ClusterSLGPU const & sl = slgpu; @@ -177,7 +175,7 @@ namespace clusterSLOnGPU { blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; assert(sl.me_d); - simLink<<>>(dd.digis_d.view(), ndigis, dd.clusters_d.view(), hh.gpu_d, sl.me_d, n); + simLink<<>>(dd.view(), ndigis, hh.gpu_d, sl.me_d, n); cudaCheck(cudaGetLastError()); if (doDump) { diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h index 00b0e34b301c8..23976cb418e16 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h @@ -4,8 +4,8 @@ #include #include +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "trackerHitAssociationHeterogeneousProduct.h" @@ -15,7 +15,6 @@ namespace clusterSLOnGPU { using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct; - using DigisOnGPU = siPixelRawToClusterHeterogeneousProduct::GPUProduct; using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; @@ -23,7 +22,7 @@ namespace clusterSLOnGPU { public: Kernel(cuda::stream_t<>& stream, bool dump); ~Kernel() {deAlloc();} - void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream); + void algo(SiPixelDigisCUDA const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream); GPUProduct getProduct() { return GPUProduct{slgpu.me_d};} private: diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc 
b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc index a5e0f403adcad..e9e271e1e58cc 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc @@ -4,6 +4,8 @@ #include +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" @@ -22,15 +24,16 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "SimDataFormats/Track/interface/SimTrackContainer.h" #include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h" @@ -52,7 +55,6 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer> phase2OTClustersToken_; edm::EDGetTokenT trackingParticleToken_; - edm::EDGetTokenT tGpuDigis; + edm::EDGetTokenT> tGpuDigis; edm::EDGetTokenT tGpuHits; std::unique_ptr gpuAlgo; @@ -111,7 +113,7 @@ ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm:: stripClustersToken_(consumes>(cfg.getParameter("stripClusterSrc"))), phase2OTClustersToken_(consumes>(cfg.getParameter("phase2OTClusterSrc"))), trackingParticleToken_(consumes(cfg.getParameter("trackingParticleSrc"))), - tGpuDigis(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), + tGpuDigis(consumes>(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), tGpuHits(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelRecHitSrc"))), doDump(cfg.getParameter("dumpCSV")) { @@ -128,7 +130,7 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr desc.add("stripClusterSrc", edm::InputTag("siStripClusters")); desc.add("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters")); desc.add("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth")); - desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersPreSplitting")); + desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplitting")); desc.add("dumpCSV", false); @@ -184,13 +186,27 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE // gpu stuff ------------------------ - edm::Handle gd; + edm::Handle> gd; + iEvent.getByToken(tGpuDigis, gd); + // temporary check (until the migration) + edm::Service cs; + assert(gd->device() == 
cs->getCurrentDevice()); + + CUDAScopedContext ctx{*gd}; + auto const &gDigis = ctx.get(*gd); + + // We're processing in a stream given by base class, so need to + // synchronize explicitly (implementation is from + // CUDAScopedContext). In practice these should not be needed + // (because of synchronizations upstream), but let's play generic. + if(not gd->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), gd->event().id(), 0)); + } + edm::Handle gh; - iEvent.getByToken(tGpuDigis, gd); iEvent.getByToken(tGpuHits, gh); - auto const & gDigis = *gd; auto const & gHits = *gh; - auto ndigis = gDigis.nDigis; + auto ndigis = gDigis.nDigis(); auto nhits = gHits.nHits; digi2tp.clear();
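For reference, the stream-to-stream synchronization that both consumers above perform through the event owned by CUDAProductBase reduces to a plain CUDA runtime idiom: the producer records an event on its stream after queuing work, and the consumer makes its own stream wait on that event before queuing dependent work. Below is a minimal standalone sketch of this pattern using the raw CUDA runtime API rather than the cuda-api-wrappers and CMSSW services used in the code above; all names (produce, consume, ready, CUDA_CHECK) are illustrative.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                           \
  do {                                                             \
    cudaError_t err__ = (call);                                    \
    if (err__ != cudaSuccess) {                                    \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err__));  \
      std::exit(EXIT_FAILURE);                                     \
    }                                                              \
  } while (0)

__global__ void produce(int* data) { data[threadIdx.x] = threadIdx.x; }
__global__ void consume(int* data) { data[threadIdx.x] *= 2; }

int main() {
  cudaStream_t producerStream, consumerStream;
  cudaEvent_t ready;
  CUDA_CHECK(cudaStreamCreate(&producerStream));
  CUDA_CHECK(cudaStreamCreate(&consumerStream));
  // A timing-disabled event is cheaper; it serves synchronization only,
  // matching the role of the event held by CUDAProductBase.
  CUDA_CHECK(cudaEventCreateWithFlags(&ready, cudaEventDisableTiming));

  int* data = nullptr;
  CUDA_CHECK(cudaMalloc(&data, 32 * sizeof(int)));

  // Producer side: queue work, then record "product is ready" on its stream.
  produce<<<1, 32, 0, producerStream>>>(data);
  CUDA_CHECK(cudaEventRecord(ready, producerStream));

  // Consumer side: the analogue of the `if (not event().has_occurred())`
  // checks above. cudaEventQuery returns cudaSuccess once the event has
  // occurred, so the wait is queued only while the producer is still busy.
  if (cudaEventQuery(ready) != cudaSuccess) {
    CUDA_CHECK(cudaStreamWaitEvent(consumerStream, ready, 0));
  }
  consume<<<1, 32, 0, consumerStream>>>(data);

  CUDA_CHECK(cudaStreamSynchronize(consumerStream));
  CUDA_CHECK(cudaFree(data));
  CUDA_CHECK(cudaEventDestroy(ready));
  CUDA_CHECK(cudaStreamDestroy(producerStream));
  CUDA_CHECK(cudaStreamDestroy(consumerStream));
  return 0;
}

Querying the event first, as the producers above do via has_occurred(), merely avoids queuing a wait that would be a no-op; calling cudaStreamWaitEvent unconditionally would be equally correct.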