Skip to content

Commit

Permalink
Reorganize CUDAScopedContext (#355)
Browse files Browse the repository at this point in the history
* Split CUDAScopedContext to *Acquire and *Produce

The motivation is that acquire() and produce() need different
functionality, and their contexts are constructed differently (e.g. the
acquire version always needs the edm::WaitingTaskWithArenaHolder). This
split should make it more difficult to make mistakes. It should also make
future evolution, e.g. towards chains of TBB tasks alternating between CPU
and GPU work, easier.

* Rename CUDAContextToken to CUDAContextState, and change semantics

Now CUDAScopedContextAcquire takes it as a parameter to constructor,
and stores the state in its destructor (yielding RAII semantics).

* Document the constructors.
  • Loading branch information
makortel authored and fwyzard committed Jun 20, 2019
1 parent 55b2510 commit 957e184
Show file tree
Hide file tree
Showing 24 changed files with 321 additions and 236 deletions.
3 changes: 2 additions & 1 deletion CUDADataFormats/Common/interface/CUDAProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ class CUDAProduct: public CUDAProductBase {
CUDAProduct& operator=(CUDAProduct&&) = default;

private:
friend class CUDAScopedContext;
friend class CUDAScopedContextBase;
friend class CUDAScopedContextProduce;
friend class edm::Wrapper<CUDAProduct<T>>;

explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, T data):
Expand Down
3 changes: 2 additions & 1 deletion CUDADataFormats/Common/interface/CUDAProductBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ class CUDAProductBase {
{}

private:
friend class CUDAScopedContext;
friend class CUDAScopedContextBase;
friend class CUDAScopedContextProduce;

// The following functions are intended to be used only from the CUDAScopedContext* classes
void setEvent(std::shared_ptr<cuda::event_t> event) {
Expand Down
8 changes: 4 additions & 4 deletions CUDADataFormats/Common/test/test_CUDAProduct.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ namespace cudatest {
class TestCUDAScopedContext {
public:
static
CUDAScopedContext make(int dev, bool createEvent) {
CUDAScopedContextProduce make(int dev, bool createEvent) {
auto device = cuda::device::get(dev);
std::unique_ptr<cuda::event_t> event;
if(createEvent) {
event = std::make_unique<cuda::event_t>(device.create_event());
}
return CUDAScopedContext(dev,
std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)),
std::move(event));
return CUDAScopedContextProduce(dev,
std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)),
std::move(event));
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptio

void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
// Do the transfer in a CUDA stream parallel to the computation CUDA stream
CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};

const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& d

void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
// Do the transfer in a CUDA stream parallel to the computation CUDA stream
CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};

const auto& gpuDigis = ctx.get(iEvent, digiGetToken_);

Expand Down
151 changes: 80 additions & 71 deletions HeterogeneousCore/CUDACore/README.md

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions HeterogeneousCore/CUDACore/interface/CUDAContextState.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h
#define HeterogeneousCore_CUDACore_CUDAContextState_h

#include <cuda/api_wrappers.h>

#include <memory>

/**
 * The purpose of this class is to deliver the device and CUDA stream
 * information from ExternalWork's acquire() to produce() via a
 * member/StreamCache variable.
 *
 * Every accessor is private: only the friend classes
 * CUDAScopedContextAcquire and CUDAScopedContextProduce can store or
 * retrieve the state. The object is neither copyable nor movable.
 */
class CUDAContextState {
public:
  CUDAContextState() = default;
  ~CUDAContextState() = default;

  CUDAContextState(const CUDAContextState&) = delete;
  CUDAContextState& operator=(const CUDAContextState&) = delete;
  CUDAContextState(CUDAContextState&&) = delete;
  CUDAContextState& operator=(CUDAContextState&&) = delete;

private:
  friend class CUDAScopedContextAcquire;
  friend class CUDAScopedContextProduce;

  /// Store the device and stream. throwIfStream() is called first, so a
  /// stream that is still held is never silently overwritten.
  void set(int device, std::shared_ptr<cuda::stream_t<>> stream) {
    throwIfStream();
    device_ = device;
    stream_ = std::move(stream);
  }

  /// Device passed to the most recent set(); 0 if set() was never called.
  int device() const { return device_; }

  /// Hand out the stored stream as an rvalue reference, after
  /// throwIfNoStream() has verified that one is held.
  ///
  /// NOTE: stream_ is emptied only if the caller actually moves from the
  /// returned reference (e.g. by initializing a std::shared_ptr from it).
  /// If the result is merely bound to a reference, the state stays set and
  /// a later set() will throw.
  std::shared_ptr<cuda::stream_t<>>&& streamPtr() {
    throwIfNoStream();
    return std::move(stream_);
  }

  void throwIfStream() const;
  void throwIfNoStream() const;

  std::shared_ptr<cuda::stream_t<>> stream_;
  // Initialized so that device() never reads an indeterminate value.
  int device_ = 0;
};

#endif
38 changes: 0 additions & 38 deletions HeterogeneousCore/CUDACore/interface/CUDAContextToken.h

This file was deleted.

134 changes: 91 additions & 43 deletions HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "FWCore/Utilities/interface/EDGetToken.h"
#include "FWCore/Utilities/interface/EDPutToken.h"
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"

#include <cuda/api_wrappers.h>

Expand All @@ -17,49 +17,15 @@ namespace cudatest {
class TestCUDAScopedContext;
}

/**
* The aim of this class is to do necessary per-event "initialization":
* - setting the current device
* - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
* - synchronizing between CUDA streams if necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class CUDAScopedContext {
// This class is intended to be derived by other CUDAScopedContext*, not for general use
class CUDAScopedContextBase {
public:
explicit CUDAScopedContext(edm::StreamID streamID);

explicit CUDAScopedContext(CUDAContextToken&& token):
currentDevice_(token.device()),
setDeviceForThisScope_(currentDevice_),
stream_(std::move(token.streamPtr()))
{}

explicit CUDAScopedContext(const CUDAProductBase& data);

explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
CUDAScopedContext(streamID)
{
waitingTaskHolder_ = std::move(waitingTaskHolder);
}

explicit CUDAScopedContext(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
CUDAScopedContext(data)
{
waitingTaskHolder_ = std::move(waitingTaskHolder);
}

~CUDAScopedContext();

int device() const { return currentDevice_; }

cuda::stream_t<>& stream() { return *stream_; }
const cuda::stream_t<>& stream() const { return *stream_; }
const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }

CUDAContextToken toToken() {
return CUDAContextToken(currentDevice_, stream_);
}

template <typename T>
const T& get(const CUDAProduct<T>& data) {
synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
Expand All @@ -71,6 +37,90 @@ class CUDAScopedContext {
return get(iEvent.get(token));
}

protected:
explicit CUDAScopedContextBase(edm::StreamID streamID);

explicit CUDAScopedContextBase(const CUDAProductBase& data);

explicit CUDAScopedContextBase(int device, std::shared_ptr<cuda::stream_t<>> stream);

void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent);

std::shared_ptr<cuda::stream_t<>>& streamPtr() { return stream_; }

private:
int currentDevice_;
cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
std::shared_ptr<cuda::stream_t<>> stream_;
};

/**
 * Scoped context for the acquire() method of an ExternalWork module.
 * It takes care of the per-event "initialization":
 * - setting the current device
 * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
 * - synchronizing between CUDA streams if necessary
 * and enforces that these are done properly, in RAII fashion.
 *
 * Two constructors create a new CUDA stream from an edm::StreamID and two
 * (possibly) re-use the stream of an existing product. Each pair has a
 * variant that additionally takes a CUDAContextState, for modules that
 * need the context after acquire() has returned.
 */
class CUDAScopedContextAcquire: public CUDAScopedContextBase {
public:
  /// New CUDA stream; the context is not needed beyond acquire()
  explicit CUDAScopedContextAcquire(edm::StreamID streamID,
                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder):
    CUDAScopedContextBase(streamID),
    waitingTaskHolder_(std::move(waitingTaskHolder))
  {}

  /// New CUDA stream; the context is needed after acquire(), so remember
  /// where the state is to be stored
  explicit CUDAScopedContextAcquire(edm::StreamID streamID,
                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
                                    CUDAContextState& state):
    CUDAScopedContextBase(streamID),
    waitingTaskHolder_(std::move(waitingTaskHolder)),
    contextState_(&state)
  {}

  /// (Possibly) re-used CUDA stream of `data`; the context is not needed
  /// beyond acquire()
  explicit CUDAScopedContextAcquire(const CUDAProductBase& data,
                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder):
    CUDAScopedContextBase(data),
    waitingTaskHolder_(std::move(waitingTaskHolder))
  {}

  /// (Possibly) re-used CUDA stream of `data`; the context is needed after
  /// acquire(), so remember where the state is to be stored
  explicit CUDAScopedContextAcquire(const CUDAProductBase& data,
                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
                                    CUDAContextState& state):
    CUDAScopedContextBase(data),
    waitingTaskHolder_(std::move(waitingTaskHolder)),
    contextState_(&state)
  {}

  // Defined out of line
  ~CUDAScopedContextAcquire();

private:
  edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
  // Non-owning; stays nullptr for the constructors without a CUDAContextState
  CUDAContextState *contextState_ = nullptr;
};

/**
* The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
* - setting the current device
* - synchronizing between CUDA streams if necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class CUDAScopedContextProduce: public CUDAScopedContextBase {
public:
/// Constructor to create a new CUDA stream (non-ExternalWork module)
explicit CUDAScopedContextProduce(edm::StreamID streamID):
CUDAScopedContextBase(streamID)
{}

/// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module)
explicit CUDAScopedContextProduce(const CUDAProductBase& data):
CUDAScopedContextBase(data)
{}

/// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
explicit CUDAScopedContextProduce(CUDAContextState& token):
CUDAScopedContextBase(token.device(), std::move(token.streamPtr()))
{}

~CUDAScopedContextProduce();

template <typename T>
std::unique_ptr<CUDAProduct<T> > wrap(T data) {
// make_unique doesn't work because of private constructor
Expand All @@ -96,15 +146,13 @@ class CUDAScopedContext {
friend class cudatest::TestCUDAScopedContext;

// This constructor is only meant for testing
explicit CUDAScopedContext(int device, std::unique_ptr<cuda::stream_t<>> stream, std::unique_ptr<cuda::event_t> event);
explicit CUDAScopedContextProduce(int device, std::unique_ptr<cuda::stream_t<>> stream, std::unique_ptr<cuda::event_t> event):
CUDAScopedContextBase(device, std::move(stream)),
event_{std::move(event)}
{}

void createEventIfStreamBusy();
void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent);

int currentDevice_;
std::optional<edm::WaitingTaskWithArenaHolder> waitingTaskHolder_;
cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
std::shared_ptr<cuda::stream_t<>> stream_;
std::shared_ptr<cuda::event_t> event_;
};

Expand Down
14 changes: 14 additions & 0 deletions HeterogeneousCore/CUDACore/src/CUDAContextState.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "FWCore/Utilities/interface/Exception.h"

// Precondition check used by set(): throws a cms::Exception of category
// "LogicError" if a stream is already held, and does nothing otherwise.
void CUDAContextState::throwIfStream() const {
  if (not stream_) {
    return;
  }
  throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state";
}

// Precondition check used by streamPtr(): throws a cms::Exception of
// category "LogicError" if no stream is held, and does nothing otherwise.
void CUDAContextState::throwIfNoStream() const {
  if (stream_) {
    return;
  }
  throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state";
}
Loading

0 comments on commit 957e184

Please sign in to comment.