Rename CUDAContextToken to CUDAContextState, and change semantics

Now CUDAScopedContextAcquire takes it as a constructor parameter and saves its own state (device and CUDA stream) into it in its destructor (yielding RAII semantics).
cms-patatrack · Jun 14, 2019 · 48819a2 · 48819a2
1 parent 43d3f60
commit 48819a2
Show file tree

Hide file tree

Showing 9 changed files with 110 additions and 84 deletions.
diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md
@@ -146,31 +146,28 @@ private:
   ProducerOutputGPUAlgo gpuAlgo_;
   edm::EDGetTokenT<InputData> inputToken_;
   edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
-  CUDAContextToken ctxTmp_;
+  CUDAContextState ctxState_;
 };
 ...
 void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   // Sets the current device and creates a CUDA stream
-  CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+  CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
 
   auto const& inputData = iEvent.get(inputToken_);
 
   // Queues asynchronous data transfers and kernels to the CUDA stream
   // returned by CUDAScopedContextAcquire::stream()
   gpuAlgo.makeAsync(inputData, ctx.stream());
 
-  // Passes the current device and CUDA stream to produce()
-  // Feels a bit silly, and will hopefully get improved in the future
-  ctxTmp_ = ctx.toToken();
-
   // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished
+  // waitingTaskHolder when the queued asynchronous work has finished,
+  // and saves the device and CUDA stream to ctxState_
 }
 
 // Called after the asynchronous work has finished
 void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // Sets again the current device, uses the CUDA stream created in the acquire()
-  CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
+  CUDAScopedContextProduce ctx{ctxState_};
 
   // Now getResult() returns data in GPU memory that is passed to the
   // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the
@@ -254,15 +251,14 @@ private:
   ProducerInputGPUAlgo gpuAlgo_;
   edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
   edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
-  CUDAContextToken ctxTmp_;
 };
 ...
 void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
   CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
 
   // Set the current device to the same that was used to produce
   // InputData, and also use the same CUDA stream
-  CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
 
   // Grab the real input data. Checks that the input data is on the
   // current device. If the input data was produced in a different CUDA
@@ -274,18 +270,15 @@ void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup&
   // returned by CUDAScopedContextAcquire::stream()
   gpuAlgo.makeAsync(inputData, ctx.stream());
 
-  // Passes the current device and CUDA stream to produce()
-  // Feels a bit silly, and will hopefully get improved in the future
-  ctxTmp_ = ctx.toToken();
-
-// Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished
+  // Destructor of ctx queues a callback to the CUDA stream notifying
+  // waitingTaskHolder when the queued asynchronous work has finished,
+  // and saves the device and CUDA stream to ctxState_
 }
 
 // Called after the asynchronous work has finished
 void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
   // Sets again the current device, uses the CUDA stream created in the acquire()
-  CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
+  CUDAScopedContextProduce ctx{ctxState_};
 
   // Now getResult() returns data in GPU memory that is passed to the
   // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the
@@ -654,27 +647,30 @@ callback function to the CUDA stream in its destructor to call
 `waitingTaskHolder.doneWaiting()`.
 
 A GPU->GPU producer needs a `CUDAScopedContext` also in its
-`produce()`. Currently the best way is to store the state of
-`CUDAScopedContext` to `CUDAContextToken` member variable:
+`produce()`. The device and CUDA stream are transferred via a
+`CUDAContextState` member variable:
 
 ```cpp
 class FooProducerCUDA ... {
   ...
-  CUDAContextToken ctxTmp_;
+  CUDAContextState ctxState_;
 };
 
 void acquire(...) {
   ...
-  ctxTmp_ = ctx.toToken();
+  CUDAScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_};
+  ...
 }
 
 void produce(...) {
   ...
-  CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
+  CUDAScopedContextProduce ctx{ctxState_};
 }
 ```
 
-Ideas for improvements are welcome.
+`CUDAScopedContextAcquire` saves its state (the device and CUDA stream)
+to `ctxState_` in its destructor, and `CUDAScopedContextProduce` then
+restores the context from it.
 
 
 #### Transferring GPU data to CPU

diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextState.h b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h
@@ -0,0 +1,46 @@
+#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h
+#define HeterogeneousCore_CUDACore_CUDAContextState_h
+
+#include <cuda/api_wrappers.h>
+
+#include <memory>
+
+/**
+ * The purpose of this class is to deliver the device and CUDA stream
+ * information from ExternalWork's acquire() to produce() via a
+ * member/StreamCache variable.
+ */
+class CUDAContextState {
+public:
+  CUDAContextState() = default;
+  ~CUDAContextState() = default;
+
+  CUDAContextState(const CUDAContextState&) = delete;
+  CUDAContextState& operator=(const CUDAContextState&) = delete;
+  CUDAContextState(CUDAContextState&&) = delete;
+  CUDAContextState& operator=(CUDAContextState&& other) = delete;
+
+private:
+  friend class CUDAScopedContextAcquire;
+  friend class CUDAScopedContextProduce;
+
+  void set(int device, std::shared_ptr<cuda::stream_t<>> stream) {
+    throwIfStream();
+    device_ = device;
+    stream_ = std::move(stream);
+  }
+
+  int device() { return device_; }
+  std::shared_ptr<cuda::stream_t<>>&& streamPtr() {
+    throwIfNoStream();
+    return std::move(stream_);
+  }
+
+  void throwIfStream() const;
+  void throwIfNoStream() const;
+
+  std::shared_ptr<cuda::stream_t<>> stream_;
+  int device_;
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
@@ -7,7 +7,7 @@
 #include "FWCore/Utilities/interface/EDGetToken.h"
 #include "FWCore/Utilities/interface/EDPutToken.h"
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
-#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
 
 #include <cuda/api_wrappers.h>
 
@@ -46,6 +46,8 @@ class CUDAScopedContextBase {
 
   void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent);
 
+  std::shared_ptr<cuda::stream_t<>>& streamPtr() { return stream_; }
+
 private:
   int currentDevice_;
   cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
@@ -66,19 +68,28 @@ class CUDAScopedContextAcquire: public CUDAScopedContextBase {
     waitingTaskHolder_{std::move(waitingTaskHolder)}
   {}
 
+  explicit CUDAScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder, CUDAContextState& state):
+    CUDAScopedContextBase(streamID),
+    waitingTaskHolder_{std::move(waitingTaskHolder)},
+    contextState_{&state}
+  {}
+
   explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
     CUDAScopedContextBase(data),
     waitingTaskHolder_{std::move(waitingTaskHolder)}
   {}
 
-  ~CUDAScopedContextAcquire();
+  explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder, CUDAContextState& state):
+    CUDAScopedContextBase(data),
+    waitingTaskHolder_{std::move(waitingTaskHolder)},
+    contextState_{&state}
+  {}
 
-  CUDAContextToken toToken() {
-    return CUDAContextToken(device(), streamPtr());
-  }
+  ~CUDAScopedContextAcquire();
 
 private:
   edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+  CUDAContextState *contextState_ = nullptr;
 };
 
 /**
@@ -97,7 +108,7 @@ class CUDAScopedContextProduce: public CUDAScopedContextBase {
     CUDAScopedContextBase(data)
   {}
 
-  explicit CUDAScopedContextProduce(CUDAContextToken&& token):
+  explicit CUDAScopedContextProduce(CUDAContextState& token):
     CUDAScopedContextBase(token.device(), std::move(token.streamPtr()))
   {}
 

diff --git a/HeterogeneousCore/CUDACore/src/CUDAContextState.cc b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc
@@ -0,0 +1,14 @@
+#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
+#include "FWCore/Utilities/interface/Exception.h"
+
+void CUDAContextState::throwIfStream() const {
+  if(stream_) {
+    throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state";
+  }
+}
+
+void CUDAContextState::throwIfNoStream() const {
+  if(not stream_) {
+    throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state";
+  }
+}
diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
@@ -77,6 +77,9 @@ CUDAScopedContextAcquire::~CUDAScopedContextAcquire() {
                                 }
                               }
                             });
+  if(contextState_) {
+    contextState_->set(device(), std::move(streamPtr()));
+  }
 }
 
 ////////////////////

diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -69,18 +69,17 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
       REQUIRE(ctx3.stream().id() != data.stream().id());
     }
 
-    SECTION("Storing state as CUDAContextToken") {
-      CUDAContextToken ctxtok;
+    SECTION("Storing state in CUDAContextState") {
+      CUDAContextState ctxstate;
       { // acquire
         std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
         const auto& data = *dataPtr;
         edm::WaitingTaskWithArenaHolder dummy{edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr){})};
-        CUDAScopedContextAcquire ctx2{data, std::move(dummy)};
-        ctxtok = ctx2.toToken();
+        CUDAScopedContextAcquire ctx2{data, std::move(dummy), ctxstate};
       }
 
       { // produce
-        CUDAScopedContextProduce ctx2{std::move(ctxtok)};
+        CUDAScopedContextProduce ctx2{ctxstate};
         REQUIRE(cuda::device::current::get().id() == ctx.device());
         REQUIRE(ctx2.stream().id() == ctx.stream().id());
       }

diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
@@ -7,7 +7,7 @@
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
-#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
 #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h"
 
 #include "TestCUDAProducerGPUKernel.h"
@@ -26,7 +26,7 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer<edm::ExternalWork> {
   edm::EDGetTokenT<CUDAProduct<CUDAThing>> srcToken_;
   edm::EDPutTokenT<CUDAProduct<CUDAThing>> dstToken_;
   TestCUDAProducerGPUKernel gpuAlgo_;
-  CUDAContextToken ctxTmp_;
+  CUDAContextState ctxState_;
   cudautils::device::unique_ptr<float[]> devicePtr_;
   float hostData_ = 0.f;
 };
@@ -47,7 +47,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSe
   edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID();
 
   const auto& in = iEvent.get(srcToken_);
-  CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder)};
+  CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_};
   const CUDAThing& input = ctx.get(in);
 
   devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream());
@@ -57,14 +57,12 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSe
   cuda::memory::async::copy(&hostData_, devicePtr_.get()+10, sizeof(float), ctx.stream().id());
 
   edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID();
-
-  ctxTmp_ = ctx.toToken();
 }
 
 void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
   edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; 
 
-  CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
+  CUDAScopedContextProduce ctx{ctxState_};
 
   ctx.emplace(iEvent, dstToken_, std::move(devicePtr_));
 

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
@@ -53,7 +53,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
   edm::EDPutTokenT<CUDAProduct<SiPixelDigiErrorsCUDA>> digiErrorPutToken_;
   edm::EDPutTokenT<CUDAProduct<SiPixelClustersCUDA>> clusterPutToken_;
 
-  CUDAContextToken ctxTmp_;
+  CUDAContextState ctxState_;
 
   edm::ESWatcher<SiPixelFedCablingMapRcd> recordWatcher;
 
@@ -118,7 +118,7 @@ void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& d
 
 
 void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+  CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
 
   edm::ESHandle<SiPixelFedCablingMapGPUWrapper> hgpuMap;
   iSetup.get<CkfComponentsRecord>().get(hgpuMap);
@@ -228,12 +228,10 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
                              useQuality_, includeErrors_,
                              edm::MessageDrop::instance()->debugEnabled,
                              ctx.stream());
-
-  ctxTmp_ = ctx.toToken();
 }
 
 void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
-  CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
+  CUDAScopedContextProduce ctx{ctxState_};
 
   auto tmp = gpuAlgo_.getResults();
   ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first));