Remove dependency on NVIDIA cub

Replace the use of the prefix scan from CUB with a home-brewed implementation, using dynamic instead of static shared memory. Annotate all CMS-specific changes to CachingDeviceAllocator. No changes to physics or timing performance.
cms-sw · May 19, 2020 · e53e7bb · e53e7bb
1 parent 2464e28
commit e53e7bb
Show file tree

Hide file tree

Showing 9 changed files with 241 additions and 250 deletions.
diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1,5 +1,4 @@
 <iftool name="cuda-gcc-support">
-  <use name="cub"/>
   <use name="cuda"/>
   <use name="eigen"/>
   <use name="FWCore/Utilities"/>

diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -9,10 +9,6 @@
 #include <cstdint>
 #include <type_traits>
 
-#ifdef __CUDACC__
-#include <cub/cub.cuh>
-#endif
-
 #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
@@ -55,62 +51,60 @@ namespace cms {
     }
 
     template <typename Histo>
-    inline void launchZero(Histo *__restrict__ h,
-                           cudaStream_t stream
+    inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h,
+                                                          cudaStream_t stream
 #ifndef __CUDACC__
-                           = cudaStreamDefault
+                                                          = cudaStreamDefault
 #endif
     ) {
-      uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
+      uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
+      int32_t size = offsetof(Histo, bins) - offsetof(Histo, off);
+      assert(size >= int(sizeof(uint32_t) * Histo::totbins()));
 #ifdef __CUDACC__
-      cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream));
+      cudaCheck(cudaMemsetAsync(poff, 0, size, stream));
 #else
-      ::memset(off, 0, 4 * Histo::totbins());
+      ::memset(poff, 0, size);
 #endif
     }
 
     template <typename Histo>
-    inline void launchFinalize(Histo *__restrict__ h,
-                               uint8_t *__restrict__ ws
+    inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h,
+                                                              cudaStream_t stream
 #ifndef __CUDACC__
-                               = cudaStreamDefault
-#endif
-                               ,
-                               cudaStream_t stream
-#ifndef __CUDACC__
-                               = cudaStreamDefault
+                                                              = cudaStreamDefault
 #endif
     ) {
 #ifdef __CUDACC__
-      assert(ws);
-      uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
-      size_t wss = Histo::wsSize();
-      assert(wss > 0);
-      CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
+      uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
+      int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws));
+      auto nthreads = 1024;
+      auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads;
+      multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
+          poff, poff, Histo::totbins(), ppsws);
+      cudaCheck(cudaGetLastError());
 #else
       h->finalize();
 #endif
     }
 
     template <typename Histo, typename T>
-    inline void fillManyFromVector(Histo *__restrict__ h,
-                                   uint8_t *__restrict__ ws,
-                                   uint32_t nh,
-                                   T const *__restrict__ v,
-                                   uint32_t const *__restrict__ offsets,
-                                   uint32_t totSize,
-                                   int nthreads,
-                                   cudaStream_t stream
+    inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
+                                                                  uint32_t nh,
+                                                                  T const *__restrict__ v,
+                                                                  uint32_t const *__restrict__ offsets,
+                                                                  uint32_t totSize,
+                                                                  int nthreads,
+                                                                  cudaStream_t stream
 #ifndef __CUDACC__
-                                   = cudaStreamDefault
+                                                                  = cudaStreamDefault
 #endif
     ) {
       launchZero(h, stream);
 #ifdef __CUDACC__
       auto nblocks = (totSize + nthreads - 1) / nthreads;
       countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
       cudaCheck(cudaGetLastError());
-      launchFinalize(h, ws, stream);
+      launchFinalize(h, stream);
       fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
       cudaCheck(cudaGetLastError());
 #else
@@ -186,18 +180,6 @@ namespace cms {
 
       static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
 
-      __host__ static size_t wsSize() {
-#ifdef __CUDACC__
-        uint32_t *v = nullptr;
-        void *d_temp_storage = nullptr;
-        size_t temp_storage_bytes = 0;
-        cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
-        return temp_storage_bytes;
-#else
-        return 0;
-#endif
-      }
-
       static constexpr UT bin(T t) {
         constexpr uint32_t shift = sizeT() - nbits();
         constexpr uint32_t mask = (1 << nbits()) - 1;
@@ -209,7 +191,7 @@ namespace cms {
           i = 0;
       }
 
-      __host__ __device__ void add(CountersOnly const &co) {
+      __host__ __device__ __forceinline__ void add(CountersOnly const &co) {
         for (uint32_t i = 0; i < totbins(); ++i) {
 #ifdef __CUDA_ARCH__
           atomicAdd(off + i, co.off[i]);
@@ -325,6 +307,7 @@ namespace cms {
       constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }
 
       Counter off[totbins()];
+      int32_t psws;  // prefix-scan working space
       index_type bins[capacity()];
     };
 

diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -127,15 +127,27 @@ namespace cms {
 #endif
     }
 
-    // limited to 1024*1024 elements....
+#ifdef __CUDA_ARCH__
+    // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087
+    __device__ __forceinline__ unsigned dynamic_smem_size() {
+      unsigned ret;
+      asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret));
+      return ret;
+    }
+#endif
+
+    // in principle not limited....
     template <typename T>
-    __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) {
+    __global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
       __shared__ T ws[32];
-      // first each block does a scan of size 1024; (better be enough blocks....)
-      assert(1024 * gridDim.x >= size);
-      int off = 1024 * blockIdx.x;
+#ifdef __CUDA_ARCH__
+      assert(sizeof(T) * gridDim.x <= dynamic_smem_size());  // size of psum below
+#endif
+      assert(blockDim.x * gridDim.x >= size);
+      // first each block does a scan
+      int off = blockDim.x * blockIdx.x;
       if (size - off > 0)
-        blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws);
+        blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws);
 
       // count blocks that finished
       __shared__ bool isLastBlockDone;
@@ -149,25 +161,24 @@ namespace cms {
       if (!isLastBlockDone)
         return;
 
+      assert(int(gridDim.x) == *pc);
+
       // good each block has done its work and now we are left in last block
 
       // let's get the partial sums from each block
-      __shared__ T psum[1024];
+      extern __shared__ T psum[];
       for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
-        auto j = 1024 * i + 1023;
+        auto j = blockDim.x * i + blockDim.x - 1;
         psum[i] = (j < size) ? co[j] : T(0);
       }
       __syncthreads();
       blockPrefixScan(psum, psum, gridDim.x, ws);
 
       // now it would have been handy to have the other blocks around...
-      int first = threadIdx.x;                                 // + blockDim.x * blockIdx.x
-      for (int i = first + 1024; i < size; i += blockDim.x) {  //  *gridDim.x) {
-        auto k = i / 1024;                                     // block
-        co[i] += psum[k - 1];
+      for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) {
+        co[i] += psum[k];
       }
     }
-
   }  // namespace cuda
 }  // namespace cms