Skip to content

Commit

Permalink
Remove dependency on NVIDIA cub
Browse files Browse the repository at this point in the history
Replace the use of the prefix scan from CUB with a home-brewed implementation,
using dynamic instead of static shared memory.

Annotate all CMS-specific changes to CachingDeviceAllocator.

No changes to physics or timing performance.
  • Loading branch information
VinInn authored and fwyzard committed May 19, 2020
1 parent 2464e28 commit e53e7bb
Show file tree
Hide file tree
Showing 9 changed files with 241 additions and 250 deletions.
1 change: 0 additions & 1 deletion HeterogeneousCore/CUDAUtilities/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<iftool name="cuda-gcc-support">
<use name="cub"/>
<use name="cuda"/>
<use name="eigen"/>
<use name="FWCore/Utilities"/>
Expand Down
75 changes: 29 additions & 46 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@
#include <cstdint>
#include <type_traits>

#ifdef __CUDACC__
#include <cub/cub.cuh>
#endif

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
Expand Down Expand Up @@ -55,62 +51,60 @@ namespace cms {
}

template <typename Histo>
inline void launchZero(Histo *__restrict__ h,
cudaStream_t stream
inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
= cudaStreamDefault
#endif
) {
uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
int32_t size = offsetof(Histo, bins) - offsetof(Histo, off);
assert(size >= int(sizeof(uint32_t) * Histo::totbins()));
#ifdef __CUDACC__
cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream));
cudaCheck(cudaMemsetAsync(poff, 0, size, stream));
#else
::memset(off, 0, 4 * Histo::totbins());
::memset(poff, 0, size);
#endif
}

template <typename Histo>
inline void launchFinalize(Histo *__restrict__ h,
uint8_t *__restrict__ ws
inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
= cudaStreamDefault
#endif
) {
#ifdef __CUDACC__
assert(ws);
uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
size_t wss = Histo::wsSize();
assert(wss > 0);
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws));
auto nthreads = 1024;
auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads;
multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
poff, poff, Histo::totbins(), ppsws);
cudaCheck(cudaGetLastError());
#else
h->finalize();
#endif
}

template <typename Histo, typename T>
inline void fillManyFromVector(Histo *__restrict__ h,
uint8_t *__restrict__ ws,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
uint32_t totSize,
int nthreads,
cudaStream_t stream
inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
uint32_t totSize,
int nthreads,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
= cudaStreamDefault
#endif
) {
launchZero(h, stream);
#ifdef __CUDACC__
auto nblocks = (totSize + nthreads - 1) / nthreads;
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
launchFinalize(h, ws, stream);
launchFinalize(h, stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
#else
Expand Down Expand Up @@ -186,18 +180,6 @@ namespace cms {

static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }

__host__ static size_t wsSize() {
#ifdef __CUDACC__
uint32_t *v = nullptr;
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
return temp_storage_bytes;
#else
return 0;
#endif
}

static constexpr UT bin(T t) {
constexpr uint32_t shift = sizeT() - nbits();
constexpr uint32_t mask = (1 << nbits()) - 1;
Expand All @@ -209,7 +191,7 @@ namespace cms {
i = 0;
}

__host__ __device__ void add(CountersOnly const &co) {
__host__ __device__ __forceinline__ void add(CountersOnly const &co) {
for (uint32_t i = 0; i < totbins(); ++i) {
#ifdef __CUDA_ARCH__
atomicAdd(off + i, co.off[i]);
Expand Down Expand Up @@ -325,6 +307,7 @@ namespace cms {
constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }

Counter off[totbins()];
int32_t psws; // prefix-scan working space
index_type bins[capacity()];
};

Expand Down
37 changes: 24 additions & 13 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,27 @@ namespace cms {
#endif
}

// limited to 1024*1024 elements....
#ifdef __CUDA_ARCH__
// see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087
__device__ __forceinline__ unsigned dynamic_smem_size() {
unsigned ret;
asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret));
return ret;
}
#endif

// in principle not limited....
template <typename T>
__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) {
__global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
__shared__ T ws[32];
// first each block does a scan of size 1024; (better be enough blocks....)
assert(1024 * gridDim.x >= size);
int off = 1024 * blockIdx.x;
#ifdef __CUDA_ARCH__
assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below
#endif
assert(blockDim.x * gridDim.x >= size);
// first each block does a scan
int off = blockDim.x * blockIdx.x;
if (size - off > 0)
blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws);
blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws);

// count blocks that finished
__shared__ bool isLastBlockDone;
Expand All @@ -149,25 +161,24 @@ namespace cms {
if (!isLastBlockDone)
return;

assert(int(gridDim.x) == *pc);

// good each block has done its work and now we are left in last block

// let's get the partial sums from each block
__shared__ T psum[1024];
extern __shared__ T psum[];
for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
auto j = 1024 * i + 1023;
auto j = blockDim.x * i + blockDim.x - 1;
psum[i] = (j < size) ? co[j] : T(0);
}
__syncthreads();
blockPrefixScan(psum, psum, gridDim.x, ws);

// now it would have been handy to have the other blocks around...
int first = threadIdx.x; // + blockDim.x * blockIdx.x
for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) {
auto k = i / 1024; // block
co[i] += psum[k - 1];
for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) {
co[i] += psum[k];
}
}

} // namespace cuda
} // namespace cms

Expand Down
Loading

0 comments on commit e53e7bb

Please sign in to comment.