Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reorganize CUDAScopedContext #355

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CUDADataFormats/Common/interface/CUDAProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ class CUDAProduct: public CUDAProductBase {
CUDAProduct& operator=(CUDAProduct&&) = default;

private:
friend class CUDAScopedContext;
friend class CUDAScopedContextBase;
friend class CUDAScopedContextProduce;
friend class edm::Wrapper<CUDAProduct<T>>;

explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, T data):
Expand Down
3 changes: 2 additions & 1 deletion CUDADataFormats/Common/interface/CUDAProductBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ class CUDAProductBase {
{}

private:
friend class CUDAScopedContext;
friend class CUDAScopedContextBase;
friend class CUDAScopedContextProduce;

// The following functions are intended to be used only from the CUDAScopedContext* classes
void setEvent(std::shared_ptr<cuda::event_t> event) {
Expand Down
8 changes: 4 additions & 4 deletions CUDADataFormats/Common/test/test_CUDAProduct.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ namespace cudatest {
class TestCUDAScopedContext {
public:
static
CUDAScopedContext make(int dev, bool createEvent) {
CUDAScopedContextProduce make(int dev, bool createEvent) {
auto device = cuda::device::get(dev);
std::unique_ptr<cuda::event_t> event;
if(createEvent) {
event = std::make_unique<cuda::event_t>(device.create_event());
}
return CUDAScopedContext(dev,
std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)),
std::move(event));
return CUDAScopedContextProduce(dev,
std::make_unique<cuda::stream_t<>>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)),
std::move(event));
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptio

void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
// Do the transfer in a CUDA stream parallel to the computation CUDA stream
CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};

const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& d

void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
// Do the transfer in a CUDA stream parallel to the computation CUDA stream
CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};

const auto& gpuDigis = ctx.get(iEvent, digiGetToken_);

Expand Down
151 changes: 80 additions & 71 deletions HeterogeneousCore/CUDACore/README.md

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions HeterogeneousCore/CUDACore/interface/CUDAContextState.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h
#define HeterogeneousCore_CUDACore_CUDAContextState_h

#include <cuda/api_wrappers.h>

#include <memory>

/**
 * The purpose of this class is to deliver the device and CUDA stream
 * information from ExternalWork's acquire() to produce() via a
 * member/StreamCache variable.
 *
 * The state is filled with set() and consumed with streamPtr(). Both
 * are private and reachable only through the friend classes
 * CUDAScopedContextAcquire and CUDAScopedContextProduce. Objects of
 * this class can be neither copied nor moved.
 */
class CUDAContextState {
public:
  CUDAContextState() = default;
  ~CUDAContextState() = default;

  CUDAContextState(const CUDAContextState&) = delete;
  CUDAContextState& operator=(const CUDAContextState&) = delete;
  CUDAContextState(CUDAContextState&&) = delete;
  CUDAContextState& operator=(CUDAContextState&& other) = delete;

private:
  friend class CUDAScopedContextAcquire;
  friend class CUDAScopedContextProduce;

  /// Stores the device and the stream. throwIfStream() is called
  /// first, so the state must not already hold a stream.
  void set(int device, std::shared_ptr<cuda::stream_t<>> stream) {
    throwIfStream();
    device_ = device;
    stream_ = std::move(stream);
  }

  /// Device passed to the latest set(), or -1 if set() was never called.
  /// Deliberately does not check for a stream: it may be evaluated in
  /// the same argument list as streamPtr(), in unspecified order.
  int device() const { return device_; }

  /// Hands the stored stream over to the caller and leaves the state
  /// empty. throwIfNoStream() is called first.
  ///
  /// Returned by value (instead of as an rvalue reference to the
  /// member) so that the state is emptied by this call itself, no
  /// matter what the caller does with the result; otherwise a stream
  /// left behind would make the next set() throw.
  std::shared_ptr<cuda::stream_t<>> streamPtr() {
    throwIfNoStream();
    // stream_ is a member, so the std::move is required here; a
    // moved-from shared_ptr is guaranteed to be empty.
    return std::move(stream_);
  }

  void throwIfStream() const;
  void throwIfNoStream() const;

  std::shared_ptr<cuda::stream_t<>> stream_;
  // Initialized so that device() never returns an indeterminate value.
  int device_ = -1;
};

#endif
38 changes: 0 additions & 38 deletions HeterogeneousCore/CUDACore/interface/CUDAContextToken.h

This file was deleted.

134 changes: 91 additions & 43 deletions HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "FWCore/Utilities/interface/EDGetToken.h"
#include "FWCore/Utilities/interface/EDPutToken.h"
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"

#include <cuda/api_wrappers.h>

Expand All @@ -17,49 +17,15 @@ namespace cudatest {
class TestCUDAScopedContext;
}

/**
* The aim of this class is to do necessary per-event "initialization":
* - setting the current device
* - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
* - synchronizing between CUDA streams if necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class CUDAScopedContext {
// This class is intended to be derived by other CUDAScopedContext*, not for general use
class CUDAScopedContextBase {
public:
explicit CUDAScopedContext(edm::StreamID streamID);

explicit CUDAScopedContext(CUDAContextToken&& token):
currentDevice_(token.device()),
setDeviceForThisScope_(currentDevice_),
stream_(std::move(token.streamPtr()))
{}

explicit CUDAScopedContext(const CUDAProductBase& data);

explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
CUDAScopedContext(streamID)
{
waitingTaskHolder_ = std::move(waitingTaskHolder);
}

explicit CUDAScopedContext(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
CUDAScopedContext(data)
{
waitingTaskHolder_ = std::move(waitingTaskHolder);
}

~CUDAScopedContext();

int device() const { return currentDevice_; }

cuda::stream_t<>& stream() { return *stream_; }
const cuda::stream_t<>& stream() const { return *stream_; }
const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }

CUDAContextToken toToken() {
return CUDAContextToken(currentDevice_, stream_);
}

template <typename T>
const T& get(const CUDAProduct<T>& data) {
synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
Expand All @@ -71,6 +37,90 @@ class CUDAScopedContext {
return get(iEvent.get(token));
}

protected:
explicit CUDAScopedContextBase(edm::StreamID streamID);

explicit CUDAScopedContextBase(const CUDAProductBase& data);

explicit CUDAScopedContextBase(int device, std::shared_ptr<cuda::stream_t<>> stream);

void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent);

std::shared_ptr<cuda::stream_t<>>& streamPtr() { return stream_; }

private:
int currentDevice_;
cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
std::shared_ptr<cuda::stream_t<>> stream_;
};

/**
 * Scoped context for the acquire() of an ExternalWork module. It takes
 * care of the necessary per-event "initialization":
 * - setting the current device
 * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
 * - synchronizing between CUDA streams if necessary
 * and enforces, in RAII fashion, that those get done in a proper way.
 *
 * Pass a CUDAContextState to the constructor when the context is
 * needed again after acquire().
 */
class CUDAScopedContextAcquire : public CUDAScopedContextBase {
public:
  /// Creates a new CUDA stream; the context is not needed beyond acquire()
  explicit CUDAScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
      : CUDAScopedContextBase(streamID),
        waitingTaskHolder_(std::move(waitingTaskHolder)) {}

  /// Creates a new CUDA stream; the context is needed after acquire(),
  /// so a pointer to `state` is kept
  explicit CUDAScopedContextAcquire(edm::StreamID streamID,
                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
                                    CUDAContextState& state)
      : CUDAScopedContextBase(streamID),
        waitingTaskHolder_(std::move(waitingTaskHolder)),
        contextState_(&state) {}

  /// (Possibly) re-uses the CUDA stream of `data`; the context is not
  /// needed beyond acquire()
  explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
      : CUDAScopedContextBase(data),
        waitingTaskHolder_(std::move(waitingTaskHolder)) {}

  /// (Possibly) re-uses the CUDA stream of `data`; the context is needed
  /// after acquire(), so a pointer to `state` is kept
  explicit CUDAScopedContextAcquire(const CUDAProductBase& data,
                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
                                    CUDAContextState& state)
      : CUDAScopedContextBase(data),
        waitingTaskHolder_(std::move(waitingTaskHolder)),
        contextState_(&state) {}

  // Defined out of line.
  ~CUDAScopedContextAcquire();

private:
  edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
  // Non-owning; stays null for the constructors that take no CUDAContextState.
  CUDAContextState* contextState_{nullptr};
};

/**
* The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
* - setting the current device
* - synchronizing between CUDA streams if necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class CUDAScopedContextProduce: public CUDAScopedContextBase {
public:
/// Constructor to create a new CUDA stream (non-ExternalWork module)
explicit CUDAScopedContextProduce(edm::StreamID streamID):
CUDAScopedContextBase(streamID)
{}

/// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module)
explicit CUDAScopedContextProduce(const CUDAProductBase& data):
CUDAScopedContextBase(data)
{}

/// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
explicit CUDAScopedContextProduce(CUDAContextState& token):
CUDAScopedContextBase(token.device(), std::move(token.streamPtr()))
{}

~CUDAScopedContextProduce();

template <typename T>
std::unique_ptr<CUDAProduct<T> > wrap(T data) {
// make_unique doesn't work because of private constructor
Expand All @@ -96,15 +146,13 @@ class CUDAScopedContext {
friend class cudatest::TestCUDAScopedContext;

// This constructor is only meant for testing
explicit CUDAScopedContext(int device, std::unique_ptr<cuda::stream_t<>> stream, std::unique_ptr<cuda::event_t> event);
explicit CUDAScopedContextProduce(int device, std::unique_ptr<cuda::stream_t<>> stream, std::unique_ptr<cuda::event_t> event):
CUDAScopedContextBase(device, std::move(stream)),
event_{std::move(event)}
{}

void createEventIfStreamBusy();
void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, bool available, const cuda::event_t *dataEvent);

int currentDevice_;
std::optional<edm::WaitingTaskWithArenaHolder> waitingTaskHolder_;
cuda::device::current::scoped_override_t<> setDeviceForThisScope_;
std::shared_ptr<cuda::stream_t<>> stream_;
std::shared_ptr<cuda::event_t> event_;
};

Expand Down
14 changes: 14 additions & 0 deletions HeterogeneousCore/CUDACore/src/CUDAContextState.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "FWCore/Utilities/interface/Exception.h"

// Precondition check for set(): throws cms::Exception("LogicError")
// when a stream is already stored, and does nothing otherwise.
void CUDAContextState::throwIfStream() const {
  if (stream_ == nullptr) {
    return;
  }
  throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state";
}

// Precondition check for streamPtr(): throws cms::Exception("LogicError")
// when no stream is stored, and does nothing otherwise.
void CUDAContextState::throwIfNoStream() const {
  if (stream_ == nullptr) {
    throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state";
  }
}
Loading