Skip to content

Commit

Permalink
Rename CUDAContextToken to CUDAContextState, and change semantics
Browse files Browse the repository at this point in the history
Now CUDAScopedContextAcquire takes it as a parameter to constructor,
and stores the state in its destructor (yielding RAII semantics).
  • Loading branch information
makortel committed Jun 14, 2019
1 parent 43d3f60 commit 48819a2
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 84 deletions.
42 changes: 19 additions & 23 deletions HeterogeneousCore/CUDACore/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,31 +146,28 @@ private:
ProducerOutputGPUAlgo gpuAlgo_;
edm::EDGetTokenT<InputData> inputToken_;
edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
CUDAContextToken ctxTmp_;
CUDAContextState ctxState_;
};
...
void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
// Sets the current device and creates a CUDA stream
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
auto const& inputData = iEvent.get(inputToken_);
// Queues asynchronous data transfers and kernels to the CUDA stream
// returned by CUDAScopedContextAcquire::stream()
gpuAlgo.makeAsync(inputData, ctx.stream());
// Passes the current device and CUDA stream to produce()
// Feels a bit silly, and will hopefully get improved in the future
ctxTmp_ = ctx.toToken();
// Destructor of ctx queues a callback to the CUDA stream notifying
// waitingTaskHolder when the queued asynchronous work has finished
// waitingTaskHolder when the queued asynchronous work has finished,
// and saves the device and CUDA stream to ctxState_
}
// Called after the asynchronous work has finished
void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
// Sets again the current device, uses the CUDA stream created in the acquire()
CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
CUDAScopedContextProduce ctx{ctxState_};
// Now getResult() returns data in GPU memory that is passed to the
// constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the
Expand Down Expand Up @@ -254,15 +251,14 @@ private:
ProducerInputGPUAlgo gpuAlgo_;
edm::EDGetTokenT<CUDAProduct<InputData>> inputToken_;
edm::EDPutTokenT<CUDAProduct<OutputData>> outputToken_;
CUDAContextToken ctxTmp_;
};
...
void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
// Set the current device to the same that was used to produce
// InputData, and also use the same CUDA stream
CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
// Grab the real input data. Checks that the input data is on the
// current device. If the input data was produced in a different CUDA
Expand All @@ -274,18 +270,15 @@ void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup&
// returned by CUDAScopedContextAcquire::stream()
gpuAlgo.makeAsync(inputData, ctx.stream());
// Passes the current device and CUDA stream to produce()
// Feels a bit silly, and will hopefully get improved in the future
ctxTmp_ = ctx.toToken();
// Destructor of ctx queues a callback to the CUDA stream notifying
// waitingTaskHolder when the queued asynchronous work has finished
// Destructor of ctx queues a callback to the CUDA stream notifying
// waitingTaskHolder when the queued asynchronous work has finished,
// and saves the device and CUDA stream to ctxState_
}
// Called after the asynchronous work has finished
void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
// Sets again the current device, uses the CUDA stream created in the acquire()
CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
CUDAScopedContextProduce ctx{ctxState_};
// Now getResult() returns data in GPU memory that is passed to the
// constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the
Expand Down Expand Up @@ -654,27 +647,30 @@ callback function to the CUDA stream in its destructor to call
`waitingTaskHolder.doneWaiting()`.
A GPU->GPU producer needs a `CUDAScopedContext` also in its
`produce()`. Currently the best way is to store the state of
`CUDAScopedContext` to `CUDAContextToken` member variable:
`produce()`. The device and CUDA stream are transferred via a
`CUDAContextState` member variable:
```cpp
class FooProducerCUDA ... {
...
CUDAContextToken ctxTmp_;
CUDAContextState ctxState_;
};
void acquire(...) {
...
ctxTmp_ = ctx.toToken();
CUDAScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_};
...
}
void produce(...) {
...
CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
CUDAScopedContextProduce ctx{ctxState_};
}
```

Ideas for improvements are welcome.
The `CUDAScopedContextAcquire` saves its state to the `ctxState_` in
the destructor, and `CUDAScopedContextProduce` then restores the
context.


#### Transferring GPU data to CPU
Expand Down
46 changes: 46 additions & 0 deletions HeterogeneousCore/CUDACore/interface/CUDAContextState.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h
#define HeterogeneousCore_CUDACore_CUDAContextState_h

#include <cuda/api_wrappers.h>

#include <memory>

/**
 * The purpose of this class is to deliver the device and CUDA stream
 * information from ExternalWork's acquire() to produce() via a
 * member/StreamCache variable.
 *
 * User code can only default-construct and destroy the object; the
 * state itself is written and read exclusively by the friend classes
 * CUDAScopedContextAcquire and CUDAScopedContextProduce.
 */
class CUDAContextState {
public:
  CUDAContextState() = default;
  ~CUDAContextState() = default;

  // Neither copyable nor movable: the friend classes refer to the
  // object by pointer/reference.
  CUDAContextState(const CUDAContextState&) = delete;
  CUDAContextState& operator=(const CUDAContextState&) = delete;
  CUDAContextState(CUDAContextState&&) = delete;
  CUDAContextState& operator=(CUDAContextState&& other) = delete;

private:
  friend class CUDAScopedContextAcquire;
  friend class CUDAScopedContextProduce;

  // Stores the device and the CUDA stream. Throws cms::Exception (via
  // throwIfStream()) if a stream is already stored, i.e. if the
  // previously stored state has not been taken out yet.
  void set(int device, std::shared_ptr<cuda::stream_t<>> stream) {
    throwIfStream();
    device_ = device;
    stream_ = std::move(stream);
  }

  // Device stored by the last set(); -1 if set() has never been called.
  int device() const { return device_; }

  // Hands out the stored stream as an rvalue reference so that the
  // caller can move it out. Throws cms::Exception (via
  // throwIfNoStream()) if no stream is stored. Note that stream_
  // becomes empty only when the caller actually move-constructs or
  // move-assigns from the returned reference.
  std::shared_ptr<cuda::stream_t<>>&& streamPtr() {
    throwIfNoStream();
    return std::move(stream_);
  }

  // Throws cms::Exception("LogicError") if stream_ is non-empty
  void throwIfStream() const;
  // Throws cms::Exception("LogicError") if stream_ is empty
  void throwIfNoStream() const;

  std::shared_ptr<cuda::stream_t<>> stream_;
  // Initialized to a value that is not a valid CUDA device ordinal, so
  // that device() never reads an indeterminate value on a state that
  // has not been set.
  int device_ = -1;
};

#endif
39 changes: 0 additions & 39 deletions HeterogeneousCore/CUDACore/interface/CUDAContextToken.h

This file was deleted.

23 changes: 17 additions & 6 deletions HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "FWCore/Utilities/interface/EDGetToken.h"
#include "FWCore/Utilities/interface/EDPutToken.h"
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"

#include <cuda/api_wrappers.h>

Expand Down Expand Up @@ -46,6 +46,8 @@ class CUDAScopedContextBase {

void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent);

std::shared_ptr<cuda::stream_t<>>& streamPtr() { return stream_; }

private:
int currentDevice_;
cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
Expand All @@ -66,19 +68,28 @@ class CUDAScopedContextAcquire: public CUDAScopedContextBase {
waitingTaskHolder_{std::move(waitingTaskHolder)}
{}

// Variant taking an edm::StreamID and a CUDAContextState. The StreamID
// is forwarded to CUDAScopedContextBase, and the address of `state` is
// kept in contextState_ so that the destructor can store the device
// and CUDA stream there. `state` must therefore outlive this object.
explicit CUDAScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder, CUDAContextState& state):
CUDAScopedContextBase(streamID),
waitingTaskHolder_{std::move(waitingTaskHolder)},
contextState_{&state}
{}

// Variant taking an input product and no CUDAContextState. The product
// is forwarded to CUDAScopedContextBase; contextState_ keeps its
// nullptr default, so the destructor does not save the device and
// CUDA stream anywhere.
explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
CUDAScopedContextBase(data),
waitingTaskHolder_{std::move(waitingTaskHolder)}
{}

~CUDAScopedContextAcquire();
// Variant taking an input product and a CUDAContextState. The product
// is forwarded to CUDAScopedContextBase, and the address of `state` is
// kept in contextState_ so that the destructor can store the device
// and CUDA stream there. `state` must therefore outlive this object.
explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder, CUDAContextState& state):
CUDAScopedContextBase(data),
waitingTaskHolder_{std::move(waitingTaskHolder)},
contextState_{&state}
{}

CUDAContextToken toToken() {
return CUDAContextToken(device(), streamPtr());
}
~CUDAScopedContextAcquire();

private:
edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
CUDAContextState *contextState_ = nullptr;
};

/**
Expand All @@ -97,7 +108,7 @@ class CUDAScopedContextProduce: public CUDAScopedContextBase {
CUDAScopedContextBase(data)
{}

explicit CUDAScopedContextProduce(CUDAContextToken&& token):
// Constructs the context from the device and CUDA stream held by a
// CUDAContextState, passing both on to CUDAScopedContextBase.
// CUDAContextState::streamPtr() throws cms::Exception if the state
// holds no stream.
// NOTE(review): the parameter is still named `token` although its type
// is now CUDAContextState, and the std::move() is redundant because
// streamPtr() already returns an rvalue reference.
explicit CUDAScopedContextProduce(CUDAContextState& token):
CUDAScopedContextBase(token.device(), std::move(token.streamPtr()))
{}

Expand Down
14 changes: 14 additions & 0 deletions HeterogeneousCore/CUDACore/src/CUDAContextState.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "FWCore/Utilities/interface/Exception.h"

// Guard for set(): it is a logic error to store a new state while a
// stream is still held.
void CUDAContextState::throwIfStream() const {
  const bool holdsStream = static_cast<bool>(stream_);
  if (!holdsStream) {
    return;
  }
  throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state";
}

// Guard for streamPtr(): it is a logic error to ask for the state
// when no stream is held.
void CUDAContextState::throwIfNoStream() const {
  const bool holdsStream = static_cast<bool>(stream_);
  if (holdsStream) {
    return;
  }
  throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state";
}
3 changes: 3 additions & 0 deletions HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ CUDAScopedContextAcquire::~CUDAScopedContextAcquire() {
}
}
});
if(contextState_) {
contextState_->set(device(), std::move(streamPtr()));
}
}

////////////////////
Expand Down
9 changes: 4 additions & 5 deletions HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,17 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
REQUIRE(ctx3.stream().id() != data.stream().id());
}

SECTION("Storing state as CUDAContextToken") {
CUDAContextToken ctxtok;
SECTION("Storing state in CUDAContextState") {
CUDAContextState ctxstate;
{ // acquire
std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
const auto& data = *dataPtr;
edm::WaitingTaskWithArenaHolder dummy{edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr){})};
CUDAScopedContextAcquire ctx2{data, std::move(dummy)};
ctxtok = ctx2.toToken();
CUDAScopedContextAcquire ctx2{data, std::move(dummy), ctxstate};
}

{ // produce
CUDAScopedContextProduce ctx2{std::move(ctxtok)};
CUDAScopedContextProduce ctx2{ctxstate};
REQUIRE(cuda::device::current::get().id() == ctx.device());
REQUIRE(ctx2.stream().id() == ctx.stream().id());
}
Expand Down
10 changes: 4 additions & 6 deletions HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h"

#include "TestCUDAProducerGPUKernel.h"
Expand All @@ -26,7 +26,7 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer<edm::ExternalWork> {
edm::EDGetTokenT<CUDAProduct<CUDAThing>> srcToken_;
edm::EDPutTokenT<CUDAProduct<CUDAThing>> dstToken_;
TestCUDAProducerGPUKernel gpuAlgo_;
CUDAContextToken ctxTmp_;
CUDAContextState ctxState_;
cudautils::device::unique_ptr<float[]> devicePtr_;
float hostData_ = 0.f;
};
Expand All @@ -47,7 +47,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSe
edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID();

const auto& in = iEvent.get(srcToken_);
CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_};
const CUDAThing& input = ctx.get(in);

devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream());
Expand All @@ -57,14 +57,12 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSe
cuda::memory::async::copy(&hostData_, devicePtr_.get()+10, sizeof(float), ctx.stream().id());

edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID();

ctxTmp_ = ctx.toToken();
}

void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_;

CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
CUDAScopedContextProduce ctx{ctxState_};

ctx.emplace(iEvent, dstToken_, std::move(devicePtr_));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
edm::EDPutTokenT<CUDAProduct<SiPixelDigiErrorsCUDA>> digiErrorPutToken_;
edm::EDPutTokenT<CUDAProduct<SiPixelClustersCUDA>> clusterPutToken_;

CUDAContextToken ctxTmp_;
CUDAContextState ctxState_;

edm::ESWatcher<SiPixelFedCablingMapRcd> recordWatcher;

Expand Down Expand Up @@ -118,7 +118,7 @@ void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& d


void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};

edm::ESHandle<SiPixelFedCablingMapGPUWrapper> hgpuMap;
iSetup.get<CkfComponentsRecord>().get(hgpuMap);
Expand Down Expand Up @@ -228,12 +228,10 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
useQuality_, includeErrors_,
edm::MessageDrop::instance()->debugEnabled,
ctx.stream());

ctxTmp_ = ctx.toToken();
}

void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
CUDAScopedContextProduce ctx{std::move(ctxTmp_)};
CUDAScopedContextProduce ctx{ctxState_};

auto tmp = gpuAlgo_.getResults();
ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first));
Expand Down

0 comments on commit 48819a2

Please sign in to comment.