Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace CUDA API wrapper memory operations with native CUDA calls #395

Merged
merged 15 commits into from
Oct 29, 2019
Merged
3 changes: 2 additions & 1 deletion CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
data_d_ = cudautils::make_device_unique<Data>(stream);
cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream);
cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
}
2 changes: 1 addition & 1 deletion CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ cudautils::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync
template <>
cudautils::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cudautils::make_host_unique<uint32_t[]>(2001, stream);
cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream);
cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream));
return ret;
}
18 changes: 9 additions & 9 deletions DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <cuda/api_wrappers.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "DataFormats/GeometrySurface/interface/GloballyPositioned.h"
#include "DataFormats/GeometrySurface/interface/SOARotation.h"
#include "DataFormats/GeometrySurface/interface/TkRotation.h"
Expand Down Expand Up @@ -73,7 +74,7 @@ int main(void) {

// auto d_sf = cuda::memory::device::make_unique<SFrame[]>(current_device, 1);
auto d_sf = cuda::memory::device::make_unique<char[]>(current_device, sizeof(SFrame));
cuda::memory::copy(d_sf.get(), &sf1, sizeof(SFrame));
cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice));

for (auto i = 0U; i < size; ++i) {
xl[i] = yl[i] = 0.1f * float(i) - float(size / 2);
Expand All @@ -84,9 +85,9 @@ int main(void) {
std::random_shuffle(xl, xl + size);
std::random_shuffle(yl, yl + size);

cuda::memory::copy(d_xl.get(), xl, size32);
cuda::memory::copy(d_yl.get(), yl, size32);
cuda::memory::copy(d_le.get(), le, 3 * size32);
cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice));

toGlobalWrapper((SFrame const *)(d_sf.get()),
d_xl.get(),
Expand All @@ -97,11 +98,10 @@ int main(void) {
d_le.get(),
d_ge.get(),
size);

cuda::memory::copy(x, d_x.get(), size32);
cuda::memory::copy(y, d_y.get(), size32);
cuda::memory::copy(z, d_z.get(), size32);
cuda::memory::copy(ge, d_ge.get(), 6 * size32);
cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost));

float eps = 0.;
for (auto i = 0U; i < size; ++i) {
Expand Down
9 changes: 5 additions & 4 deletions DataFormats/Math/test/CholeskyInvert_t.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <cuda/api_wrappers.h>

#include "DataFormats/Math/interface/choleskyInversion.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

Expand Down Expand Up @@ -132,7 +133,7 @@ void go(bool soa) {
std::cout << mm[SIZE / 2](1, 1) << std::endl;

auto m_d = cuda::memory::device::make_unique<double[]>(current_device, DIM * DIM * stride());
cuda::memory::copy(m_d.get(), (double const *)(mm), stride() * sizeof(MX));
cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));

constexpr int NKK =
#ifdef DOPROF
Expand All @@ -151,7 +152,8 @@ void go(bool soa) {
else
cudautils::launch(invert<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);

cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));

delta += (std::chrono::high_resolution_clock::now() - start);

if (0 == kk)
Expand All @@ -162,8 +164,7 @@ void go(bool soa) {

#ifndef DOPROF
cudautils::launch(invertSeq<MX, DIM>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);

cuda::memory::copy(&mm, m_d.get(), stride() * sizeof(MX));
cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
#endif
delta1 += (std::chrono::high_resolution_clock::now() - start);

Expand Down
5 changes: 3 additions & 2 deletions DataFormats/Math/test/cudaAtan2Test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ end
#include "cuda/api_wrappers.h"

#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

Expand Down Expand Up @@ -72,7 +73,7 @@ void go() {
auto diff_d = cuda::memory::device::make_unique<int[]>(current_device, 3);

int diffs[3];
cuda::memory::device::zero(diff_d.get(), 3 * 4);
cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4));

// Launch the diff CUDA Kernel
dim3 threadsPerBlock(32, 32, 1);
Expand All @@ -83,7 +84,7 @@ void go() {

cudautils::launch(diffAtan<DEGREE>, {blocksPerGrid, threadsPerBlock}, diff_d.get());

cuda::memory::copy(diffs, diff_d.get(), 3 * 4);
cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost));
delta += (std::chrono::high_resolution_clock::now() - start);

float mdiff = diffs[0] * 1.e-7;
Expand Down
13 changes: 8 additions & 5 deletions DataFormats/Math/test/cudaMathTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ end
#include "DataFormats/Math/interface/approx_log.h"
#include "DataFormats/Math/interface/approx_exp.h"
#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"

Expand Down Expand Up @@ -106,8 +107,8 @@ void go() {
auto d_B = cuda::memory::device::make_unique<float[]>(current_device, numElements);
auto d_C = cuda::memory::device::make_unique<float[]>(current_device, numElements);

cuda::memory::copy(d_A.get(), h_A.get(), size);
cuda::memory::copy(d_B.get(), h_B.get(), size);
cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
Expand All @@ -118,19 +119,21 @@ void go() {
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";

delta -= (std::chrono::high_resolution_clock::now() - start);
cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
cudautils::launch(
vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;

delta -= (std::chrono::high_resolution_clock::now() - start);
cudautils::launch(vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
cudautils::launch(
vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;

delta -= (std::chrono::high_resolution_clock::now() - start);
cuda::memory::copy(h_C.get(), d_C.get(), size);
cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost));
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
Expand Down
7 changes: 4 additions & 3 deletions HeterogeneousCore/CUDACore/test/testStreamEvent.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"

namespace {
Expand Down Expand Up @@ -39,8 +40,8 @@ int main() {
cudaStream_t stream1, stream2;
cudaEvent_t event1, event2;

cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float));
cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float));
cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)));
cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)));
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
cudaEventCreate(&event1);
Expand All @@ -50,7 +51,7 @@ int main() {
host_points1[j] = static_cast<float>(j);
}

cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1);
cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1));
kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE);
if (debug)
std::cout << "Kernel launched on stream1" << std::endl;
Expand Down
10 changes: 5 additions & 5 deletions HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ namespace cudatest {
namespace {
std::unique_ptr<CUDAProduct<int*>> produce(int device, int* d, int* h) {
auto ctx = cudatest::TestCUDAScopedContext::make(device, true);

cuda::memory::async::copy(d, h, sizeof(int), ctx.stream());
cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
testCUDAScopedContextKernels_single(d, ctx.stream());
return ctx.wrap(d);
}
Expand Down Expand Up @@ -116,9 +115,10 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
h_a1 = 0;
h_a2 = 0;
int h_a3 = 0;
cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx.stream());
cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx.stream());
cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx.stream());

cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));

REQUIRE(h_a1 == 2);
REQUIRE(h_a2 == 4);
Expand Down
5 changes: 3 additions & 2 deletions HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "FWCore/ServiceRegistry/interface/Service.h"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
Expand Down Expand Up @@ -67,8 +68,8 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent,
// Mimick the need to transfer some of the GPU data back to CPU to
// be used for something within this module, or to be put in the
// event.
cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());

cudaCheck(
cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event "
<< iEvent.id().event() << " stream " << iEvent.streamID();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "FWCore/ServiceRegistry/interface/Service.h"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
Expand Down Expand Up @@ -75,8 +76,8 @@ void TestCUDAProducerGPUEWTask::acquire(const edm::Event& iEvent,
// Mimick the need to transfer some of the GPU data back to CPU to
// be used for something within this module, or to be put in the
// event.
cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());

cudaCheck(
cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
// Push a task to run addSimpleWork() after the asynchronous work
// (and acquire()) has finished instead of produce()
ctx.pushNextTask([iev = iEvent.id().event(), istr = iEvent.streamID(), this](CUDAScopedContextTask ctx) {
Expand All @@ -94,7 +95,8 @@ void TestCUDAProducerGPUEWTask::addSimpleWork(edm::EventNumber_t eventID,
edm::LogVerbatim("TestCUDAProducerGPUEWTask")
<< label_ << " TestCUDAProducerGPUEWTask::addSimpleWork begin event " << eventID << " stream " << streamID
<< " 10th element " << *hostData_ << " not satisfied, queueing more work";
cuda::memory::async::copy(hostData_.get(), devicePtr_.get() + 10, sizeof(float), ctx.stream());
cudaCheck(
cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));

ctx.pushNextTask([eventID, streamID, this](CUDAScopedContextTask ctx) { addSimpleWork(eventID, streamID, ctx); });
gpuAlgo_.runSimpleAlgo(devicePtr_.get(), ctx.stream());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
// First make the sanity check
if (d_input != nullptr) {
auto h_check = std::make_unique<float[]>(NUM_VALUES);
cuda::memory::copy(h_check.get(), d_input, NUM_VALUES * sizeof(float));
cudaCheck(cudaMemcpy(h_check.get(), d_input, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i < NUM_VALUES; ++i) {
if (h_check[i] != i) {
throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
Expand All @@ -89,8 +89,8 @@ cudautils::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const
auto d_a = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);
auto d_b = cudautils::make_device_unique<float[]>(NUM_VALUES, stream);

cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), stream);
cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), stream);
cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));

int threadsPerBlock{32};
int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
Expand Down Expand Up @@ -58,8 +59,11 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent,

buffer_ = cudautils::make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream());
// Enqueue async copy, continue in produce once finished
cuda::memory::async::copy(
buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float), ctx.stream());
cudaCheck(cudaMemcpyAsync(buffer_.get(),
device.get(),
TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float),
cudaMemcpyDeviceToHost,
ctx.stream()));

edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event "
<< iEvent.id().event() << " stream " << iEvent.streamID();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ process.moduleToTest(process.toTest)
REQUIRE(data != nullptr);

float firstElements[10];
cuda::memory::async::copy(firstElements, data, sizeof(float) * 10, prod->stream());
cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()));

std::cout << "Synchronizing with CUDA stream" << std::endl;
auto stream = prod->stream();
Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ namespace cudautils {
) {
uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
#ifdef __CUDACC__
cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream);
cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream));
#else
::memset(off, 0, 4 * Histo::totbins());
#endif
Expand Down
9 changes: 5 additions & 4 deletions HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h
#define HeterogeneousCore_CUDAUtilities_copyAsync_h

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

Expand All @@ -17,15 +18,15 @@ namespace cudautils {
// Shouldn't compile for array types because of sizeof(T), but
// let's add an assert with a more helpful message
static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(cudautils::host::unique_ptr<T>& dst,
const cudautils::device::unique_ptr<T>& src,
cudaStream_t stream) {
static_assert(std::is_array<T>::value == false, "For array types, use the other overload with the size parameter");
cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is device2host

Copy link

@makortel makortel Oct 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And "Calling cudaMemcpyAsync() with dst and src pointers that do not match the direction of the copy results in an undefined behavior." (*), so specifying the direction explicitly is actually harmful?

(*) https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed. I think we agreed to remove all explicit directions.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Calling cudaMemcpyAsync() with dst and src pointers that do not match the direction of the copy results in an undefined behavior.

I thought it was supposed to crash...

}

// Multiple elements
Expand All @@ -34,15 +35,15 @@ namespace cudautils {
const cudautils::host::unique_ptr<T[]>& src,
size_t nelements,
cudaStream_t stream) {
cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(cudautils::host::unique_ptr<T[]>& dst,
const cudautils::device::unique_ptr<T[]>& src,
size_t nelements,
cudaStream_t stream) {
cuda::memory::async::copy(dst.get(), src.get(), nelements * sizeof(T), stream);
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

}
} // namespace cudautils

Expand Down
Loading