Patatrack hackathon4 #162

Closed

Changes from all commits (32 commits)
4d3a0a5
use hit to sort (not faster)
VinInn Aug 13, 2018
b148a8e
reduce memory, optimize bin iteration
VinInn Aug 13, 2018
bc7615d
reduce memory occupancy, use jmax
VinInn Aug 15, 2018
5e272ec
remove redundant checks
VinInn Aug 15, 2018
13b741a
new pt cut
VinInn Aug 18, 2018
9512324
new interface
VinInn Aug 18, 2018
6214e0f
prototype of vertex finder on gpu
VinInn Aug 19, 2018
a8bdab1
initial commit
VinInn Aug 23, 2018
5047e6d
more code, does not compile yet
VinInn Aug 28, 2018
3dc134e
compiles
VinInn Aug 28, 2018
6e96759
works, disabled to run old divisive...
VinInn Aug 29, 2018
f0963c4
now gpu algo running
VinInn Aug 29, 2018
57b8db3
transfer reverse pointers
VinInn Aug 29, 2018
68fad3d
added tracks to vertex, printouts removed
VinInn Aug 30, 2018
1ce01f2
expose all params
VinInn Aug 30, 2018
fc770d6
added chi2, error rescaled (small effect)
VinInn Aug 31, 2018
10ad7e9
sort in pt2
VinInn Aug 31, 2018
3d67018
more doc, some cleanup
VinInn Sep 1, 2018
bcb7c44
as usual the test is more complex than the algo
VinInn Sep 1, 2018
cdb91a3
use new utility function
VinInn Sep 1, 2018
794a796
properly rescale error with chi2
VinInn Sep 1, 2018
aa8913e
support 8 and 64 bit int
VinInn Sep 2, 2018
80308a5
support unsigned
VinInn Sep 2, 2018
77dcfa6
SFINAE
VinInn Sep 2, 2018
f156524
const
VinInn Sep 2, 2018
fd5886b
derivation of fast dphi cut
VinInn Sep 2, 2018
4c16b56
python notebook to tune cuts for doublets and triplets
VinInn Sep 2, 2018
4973ac7
Add back standalone GPU fit test
rovere Sep 3, 2018
4f0b74f
transform errors to global
VinInn Sep 3, 2018
08467e6
fix copy to gpu
VinInn Sep 3, 2018
8000d07
Merge branch 'DoubletTuning' into PatatrackHackathon4
rovere Sep 3, 2018
03c4b00
Add Helix Fit to Quadruplet GPU Producer
rovere Sep 5, 2018
16 changes: 16 additions & 0 deletions DataFormats/GeometrySurface/interface/SOARotation.h
@@ -139,6 +139,22 @@ class SOAFrame {
ux+=px; uy+=py; uz+=pz;
}

constexpr inline
void toGlobal(
T cxx,
T cxy,
T cyy,
T * gl) const {

auto const & r = rot;
gl[0] = r.xx()*(r.xx()*cxx+r.yx()*cxy) + r.yx()*(r.xx()*cxy+r.yx()*cyy);
gl[1] = r.xx()*(r.xy()*cxx+r.yy()*cxy) + r.yx()*(r.xy()*cxy+r.yy()*cyy);
gl[2] = r.xy()*(r.xy()*cxx+r.yy()*cxy) + r.yy()*(r.xy()*cxy+r.yy()*cyy);
gl[3] = r.xx()*(r.xz()*cxx+r.yz()*cxy) + r.yx()*(r.xz()*cxy+r.yz()*cyy);
gl[4] = r.xy()*(r.xz()*cxx+r.yz()*cxy) + r.yy()*(r.xz()*cxy+r.yz()*cyy);
gl[5] = r.xz()*(r.xz()*cxx+r.yz()*cxy) + r.yz()*(r.xz()*cxy+r.yz()*cyy);
}


constexpr inline
T x() const { return px; }
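The six expressions above are the expansion of G = M^T * C * M, where C is the 2x2 local covariance (cxx, cxy; cxy, cyy) and M is the 2x3 block formed by the first two rows of the rotation; the result is the symmetric 3x3 global covariance packed as {xx, xy, yy, xz, yz, zz}. A host-side reference sketch (not part of this PR; the function name and the plain-array interface are illustrative) that can be used to cross-check the hand-expanded formulas:

#include <array>

// r[a][i]: element i of rotation row a, i.e. r[0] = {xx, xy, xz}, r[1] = {yx, yy, yz}
std::array<float, 6> toGlobalCovReference(std::array<std::array<float, 3>, 2> const& r,
                                          float cxx, float cxy, float cyy) {
  float const c[2][2] = {{cxx, cxy}, {cxy, cyy}};
  float g[3][3] = {};
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 3; ++j)
      for (int a = 0; a < 2; ++a)
        for (int b = 0; b < 2; ++b)
          g[i][j] += r[a][i] * c[a][b] * r[b][j];
  // same packing as gl[] above
  return {g[0][0], g[0][1], g[1][1], g[0][2], g[1][2], g[2][2]};
}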
6 changes: 4 additions & 2 deletions DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu
@@ -6,13 +6,14 @@ __global__
void toGlobal(SOAFrame<float> const * frame,
float const * xl, float const * yl,
float * x, float * y, float * z,
float const * le, float * ge,
uint32_t n)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= n) return;

frame[0].toGlobal(xl[i],yl[i],x[i],y[i],z[i]);

frame[0].toGlobal(le[3*i],le[3*i+1],le[3*i+2],ge+6*i);

}

@@ -24,6 +25,7 @@ void toGlobal(SOAFrame<float> const * frame,
void toGlobalWrapper(SOAFrame<float> const * frame,
float const * xl, float const * yl,
float * x, float * y, float * z,
float const * le, float * ge,
uint32_t n) {

int threadsPerBlock = 256;
@@ -35,7 +37,7 @@ void toGlobalWrapper(SOAFrame<float> const * frame,
cuda::launch(
toGlobal,
{ blocksPerGrid, threadsPerBlock },
frame, xl, yl, x, y, z,
frame, xl, yl, x, y, z, le, ge,
n
);

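For reference, the per-hit packing used by the kernel above: le carries the three independent elements of the 2x2 local covariance of hit i at le[3*i .. 3*i+2] (cxx, cxy, cyy), and ge receives the six packed elements of the global covariance at ge[6*i .. 6*i+5]. A minimal single-threaded equivalent of the kernel body, useful as a CPU cross-check (a sketch, not part of this PR; the function name is illustrative):

#include <cstdint>
#include "DataFormats/GeometrySurface/interface/SOARotation.h"

void toGlobalOnHost(SOAFrame<float> const& frame,
                    float const* xl, float const* yl,
                    float* x, float* y, float* z,
                    float const* le, float* ge,
                    uint32_t n) {
  for (uint32_t i = 0; i < n; ++i) {
    // position: local (x,y) on the surface -> global (x,y,z)
    frame.toGlobal(xl[i], yl[i], x[i], y[i], z[i]);
    // covariance: 3 packed local elements -> 6 packed global elements
    frame.toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], ge + 6 * i);
  }
}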
22 changes: 18 additions & 4 deletions DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -6,6 +6,7 @@
void toGlobalWrapper(SOAFrame<float> const * frame,
float const * xl, float const * yl,
float * x, float * y, float * z,
float const * le, float * ge,
uint32_t n);


@@ -58,9 +59,10 @@ int main(void)

float xl[size],yl[size];
float x[size],y[size],z[size];
// float xh[size],yh[size],zh[size];


// errors
float le[3*size];
float ge[6*size];


auto current_device = cuda::device::current::get();
@@ -71,6 +73,10 @@
auto d_y = cuda::memory::device::make_unique<float[]>(current_device, size);
auto d_z = cuda::memory::device::make_unique<float[]>(current_device, size);

auto d_le = cuda::memory::device::make_unique<float[]>(current_device, 3*size);
auto d_ge = cuda::memory::device::make_unique<float[]>(current_device, 6*size);


double a = 0.01;
double ca = std::cos(a);
double sa = std::sin(a);
@@ -95,21 +101,29 @@



for (auto i=0U; i<size; ++i) xl[i]=yl[i] = 0.1f*float(i)-float(size/2);
for (auto i=0U; i<size; ++i) {
xl[i]=yl[i] = 0.1f*float(i)-float(size/2);
le[3*i] = 0.01f; le[3*i+2] = (i>size/2) ? 1.f : 0.04f;
le[3*i+1] = 0.f; // off-diagonal local covariance term
}
std::random_shuffle(xl,xl+size);
std::random_shuffle(yl,yl+size);

cuda::memory::copy(d_xl.get(), xl, size32);
cuda::memory::copy(d_yl.get(), yl, size32);
cuda::memory::copy(d_le.get(), le, 3*size32);


toGlobalWrapper((SFrame const *)(d_sf.get()), d_xl.get(), d_yl.get(), d_x.get(), d_y.get(), d_z.get(),
size
d_le.get(), d_ge.get(), size
);

cuda::memory::copy(x,d_x.get(), size32);
cuda::memory::copy(y,d_y.get(), size32);
cuda::memory::copy(z,d_z.get(), size32);
cuda::memory::copy(ge,d_ge.get(), 6*size32);


float eps=0.;
for (auto i=0U; i<size; ++i) {
auto gp = f1.toGlobal(LocalPoint(xl[i],yl[i]));
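The check shown above compares only the transformed positions; a natural extension (not part of this PR) is to compare the covariances returned in ge against the same constexpr toGlobal overload evaluated on the host. In the sketch below, sf1 stands for the host-side SOAFrame<float> that was copied into d_sf; that name is not visible in the excerpt, so treat it as hypothetical, and <algorithm>, <cmath> and <iostream> are assumed to be included:

float epsErr = 0.f;
for (auto i = 0U; i < size; ++i) {
  float geRef[6];
  sf1.toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], geRef);   // host reference
  for (int k = 0; k < 6; ++k)
    epsErr = std::max(epsErr, std::abs(geRef[k] - ge[6 * i + k]));
}
std::cout << "max covariance deviation " << epsErr << std::endl;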
60 changes: 49 additions & 11 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -4,13 +4,15 @@
#include <cassert>
#include <cstdint>
#include <algorithm>
#include <type_traits>
#ifndef __CUDA_ARCH__
#include <atomic>
#endif // __CUDA_ARCH__

#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"


#ifdef __CUDACC__
namespace cudautils {

@@ -37,14 +39,14 @@ namespace cudautils {
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
h[ih].fill(v, i);
h[ih].fill(v[i], i);
}

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * h, T const * v, uint32_t size) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < size) h->fill(v, i);
if(i < size) h->fill(v[i], i);
}

template<typename Histo>
@@ -73,7 +75,32 @@ namespace cudautils {
} // namespace cudautils
#endif

template<typename T, uint32_t N, uint32_t M>

// iterate over the n bins left and right of the one containing "v",
// including the spill bin
template<typename Hist, typename V, typename Func>
__host__ __device__
void forEachInBins(Hist const & hist, V value, int n, Func func) {
int bs = hist.bin(value);
int be = std::min(int(hist.nbins()),bs+n+1);
bs = std::max(0,bs-n);
assert(be>bs);
for (auto b=bs; b<be; ++b){
for (auto pj=hist.begin(b);pj<hist.end(b);++pj) {
func(*pj);
}}
for (auto pj=hist.beginSpill();pj<hist.endSpill();++pj)
func(*pj);
}


template<
typename T, // the type of the discretized input values
uint32_t N, // number of bins, in bits (nbins() == 2^N)
uint32_t M, // max number of elements a bin can contain, in bits (binSize() == 2^M)
uint32_t S=sizeof(T) * 8, // number of significant bits in T
typename I=uint32_t // type stored in the container (usually an index in a vector of the input values)
>
class HistoContainer {
public:
#ifdef __CUDACC__
@@ -82,14 +109,16 @@ class HistoContainer {
using Counter = std::atomic<uint32_t>;
#endif

static constexpr uint32_t sizeT() { return sizeof(T) * 8; }
using index_type = I;
using UT = typename std::make_unsigned<T>::type;
static constexpr uint32_t sizeT() { return S; }
static constexpr uint32_t nbins() { return 1 << N; }
static constexpr uint32_t shift() { return sizeT() - N; }
static constexpr uint32_t mask() { return nbins() - 1; }
static constexpr uint32_t binSize() { return 1 << M; }
static constexpr uint32_t spillSize() { return 4 * binSize(); }
static constexpr uint32_t spillSize() { return 16 * binSize(); }

static constexpr uint32_t bin(T t) {
static constexpr UT bin(T t) {
return (t >> shift()) & mask();
}

@@ -109,16 +138,17 @@ class HistoContainer {
}

__host__ __device__
void fill(T const * t, uint32_t j) {
auto b = bin(t[j]);
void fill(T t, index_type j) {
UT b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(n[b]);
if (w < binSize()) {
bins[b * binSize() + w] = j;
} else {
auto w = atomicIncrement(nspills);
if (w < spillSize())
spillBin[w] = j;
}
}
}

constexpr bool fullSpill() const {
@@ -141,10 +171,18 @@
return n[b];
}

uint32_t bins[nbins()*binSize()];
constexpr auto const * beginSpill() const {
return spillBin;
}

constexpr auto const * endSpill() const {
return beginSpill() + std::min(spillSize(), uint32_t(nspills));
}

Counter n[nbins()];
uint32_t spillBin[spillSize()];
Counter nspills;
index_type bins[nbins()*binSize()];
index_type spillBin[spillSize()];
};

#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
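The two new template parameters make the container usable beyond full-width integer keys: S restricts the binning to the S significant bits actually carried by T, and I selects the index type stored in the bins and in the spill bin. A minimal host-side usage sketch (not part of this PR; the alias, function name and parameter choices are illustrative), histogramming 10-bit values into 2^7 bins of up to 2^8 uint16_t indices each, then visiting a value's bin, its immediate neighbours and the spill bin with forEachInBins:

#include <cassert>
#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

// T, N (bins, in bits), M (bin capacity, in bits), S (significant bits), I (stored index type)
using Hist10 = HistoContainer<uint16_t, 7, 8, 10, uint16_t>;

uint32_t countNeighbours(uint16_t const* values, uint16_t nValues) {  // values assumed < 2^10
  Hist10 h;
  h.zero();
  for (uint16_t i = 0; i < nValues; ++i)
    h.fill(values[i], i);              // fill() now takes the value itself, not the whole array
  uint32_t visited = 0;
  // bin of values[0], one bin on each side, plus whatever landed in the spill bin
  forEachInBins(h, values[0], 1, [&](uint16_t idx) {
    assert(idx < nValues);             // idx is an index into "values"
    ++visited;
  });
  return visited;
}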
75 changes: 47 additions & 28 deletions HeterogeneousCore/CUDAUtilities/interface/radixSort.h
@@ -4,17 +4,56 @@
#include<cstdint>
#include<cassert>


template<typename T,
typename std::enable_if<std::is_unsigned<T>::value,T>::type* = nullptr
>
__device__
void reorderSigned(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) {
}

template<typename T,
typename std::enable_if<std::is_signed<T>::value,T>::type* = nullptr
>
__device__
void reorderSigned(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) {

//move negative first...

int32_t first = threadIdx.x;
__shared__ uint32_t firstNeg;
firstNeg=0;
__syncthreads();

// find first negative (for float ^ will not work...)
for (auto i=first; i<size-1; i+=blockDim.x) {
// if ( (int(a[ind[i]])*int(a[ind[i+1]])) <0 ) firstNeg=i+1;
if ( (a[ind[i]]^a[ind[i+1]]) < 0 ) firstNeg=i+1;
}

__syncthreads();

auto ii=first;
for (auto i=firstNeg+threadIdx.x; i<size; i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
ii= size-firstNeg +threadIdx.x;
assert(ii>=0);
for (auto i=first;i<firstNeg;i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
for (auto i=first; i<size; i+=blockDim.x) ind[i]=ind2[i];

}

template<typename T>
__device__
void radixSort(T * a, uint16_t * ind, uint32_t size) {
void radixSort(T const * a, uint16_t * ind, uint32_t size) {

constexpr int d = 8, w = 8*sizeof(T);
constexpr int sb = 1<<d;

constexpr int MaxSize = 256*32;
__shared__ uint16_t ind2[MaxSize];
__shared__ int32_t c[sb], ct[sb], cu[sb];
__shared__ uint32_t firstNeg;

__shared__ int ibs;
__shared__ int p;
@@ -25,8 +64,6 @@ void radixSort(T * a, uint16_t * ind, uint32_t size) {

// bool debug = false; // threadIdx.x==0 && blockIdx.x==5;

firstNeg=0;

p = 0;

auto j = ind;
@@ -118,32 +155,14 @@ void radixSort(T * a, uint16_t * ind, uint32_t size) {

}

// w/d is even so ind is correct
assert(j==ind);
__syncthreads();


if (8==w) // int8
for (auto i=first; i<size; i+=blockDim.x) ind[i]=ind2[i];
else
assert(j==ind); // w/d is even so ind is correct

// now move negative first...
// find first negative (for float ^ will not work...)
for (auto i=first; i<size-1; i+=blockDim.x) {
// if ( (int(a[ind[i]])*int(a[ind[i+1]])) <0 ) firstNeg=i+1;
if ( (a[ind[i]]^a[ind[i+1]]) < 0 ) firstNeg=i+1;
}

__syncthreads();
// assert(firstNeg>0); not necessarily true if all positive !

auto ii=first;
for (auto i=firstNeg+threadIdx.x; i<size; i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
ii= size-firstNeg +threadIdx.x;
assert(ii>=0);
for (auto i=first;i<firstNeg;i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
for (auto i=first; i<size; i+=blockDim.x) ind[i]=ind2[i];
// now move negative first... (if signed)
reorderSigned(a,ind,ind2,size);


}


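The sign handling is now factored out of radixSort into reorderSigned, selected via SFINAE so the extra pass is only compiled for signed T (the unsigned overload above is an intentional no-op). After the unsigned radix passes, all entries pointing at negative values sit in one contiguous block at the end of the index array, and for signed integers a ^ b is negative exactly when a and b differ in their sign bit, so scanning adjacent sorted elements finds the boundary that reorderSigned then rotates to the front. A tiny host-side illustration of the sign-change test (not part of this PR):

#include <cassert>
#include <cstdint>

int main() {
  int16_t a = 12, b = 7, c = -3, d = -9;
  assert((a ^ c) < 0);    // opposite signs: the XOR has its sign bit set
  assert((a ^ b) >= 0);   // both positive: sign bit clear
  assert((c ^ d) >= 0);   // both negative: the sign bits cancel
  return 0;
}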
30 changes: 29 additions & 1 deletion HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp
@@ -22,7 +22,7 @@ void go() {
for (int it=0; it<5; ++it) {
for (long long j = 0; j < N; j++) v[j]=rgen(eng);
h.zero();
for (long long j = 0; j < N; j++) h.fill(v,j);
for (long long j = 0; j < N; j++) h.fill(v[j],j);

std::cout << "nspills " << h.nspills << std::endl;

@@ -44,6 +44,34 @@ void go() {
}
}

for (long long j = 0; j < N; j++) {
auto b0 = h.bin(v[j]);
auto stot = h.endSpill()-h.beginSpill();
int w=0;
int tot=0;
auto ftest = [&](int k) {
assert(k>=0 && k<N);
tot++;
};
forEachInBins(h,v[j],w,ftest);
int rtot = h.end(b0)-h.begin(b0) + stot;
assert(tot==rtot);
w=1; tot=0;
forEachInBins(h,v[j],w,ftest);
int bp = b0+1;
int bm = b0-1;
if (bp<int(h.nbins())) rtot += h.end(bp)-h.begin(bp);
if (bm>=0) rtot += h.end(bm)-h.begin(bm);
assert(tot==rtot);
w=2; tot=0;
forEachInBins(h,v[j],w,ftest);
bp++;
bm--;
if (bp<int(h.nbins())) rtot += h.end(bp)-h.begin(bp);
if (bm>=0) rtot += h.end(bm)-h.begin(bm);
assert(tot==rtot);
}

}

int main() {