Patatrack hackathon4 #162

Closed

Changes from all commits (32 commits)
4d3a0a5
use hit to sort (not faster)
VinInn Aug 13, 2018
b148a8e
reduce memory, optimize bin iteration
VinInn Aug 13, 2018
bc7615d
reduce memory occupancy, use jmax
VinInn Aug 15, 2018
5e272ec
remove redundant checks
VinInn Aug 15, 2018
13b741a
new pt cut
VinInn Aug 18, 2018
9512324
new interface
VinInn Aug 18, 2018
6214e0f
prototype of vertex finder on gpu
VinInn Aug 19, 2018
a8bdab1
initial commit
VinInn Aug 23, 2018
5047e6d
more code, does not compile yet
VinInn Aug 28, 2018
3dc134e
compiles
VinInn Aug 28, 2018
6e96759
works, disabled to run old divisive...
VinInn Aug 29, 2018
f0963c4
now gpu algo running
VinInn Aug 29, 2018
57b8db3
transfer reverse pointers
VinInn Aug 29, 2018
68fad3d
added tracks to vertex, printouts removed
VinInn Aug 30, 2018
1ce01f2
expose all params
VinInn Aug 30, 2018
fc770d6
added chi2, error rescaled (small effect)
VinInn Aug 31, 2018
10ad7e9
sort in pt2
VinInn Aug 31, 2018
3d67018
more doc, some cleanup
VinInn Sep 1, 2018
bcb7c44
as usual the test is more complex than the algo
VinInn Sep 1, 2018
cdb91a3
use new utility function
VinInn Sep 1, 2018
794a796
properly rescale error with chi2
VinInn Sep 1, 2018
aa8913e
support 8 and 64 bit int
VinInn Sep 2, 2018
80308a5
support unsigned
VinInn Sep 2, 2018
77dcfa6
SFINAE
VinInn Sep 2, 2018
f156524
const
VinInn Sep 2, 2018
fd5886b
derivation of fast dphi cut
VinInn Sep 2, 2018
4c16b56
python notebook to tune cuts for doublets and triplets
VinInn Sep 2, 2018
4973ac7
Add back standalone GPU fit test
rovere Sep 3, 2018
4f0b74f
transform errors to global
VinInn Sep 3, 2018
08467e6
fix copy to gpu
VinInn Sep 3, 2018
8000d07
Merge branch 'DoubletTuning' into PatatrackHackathon4
rovere Sep 3, 2018
03c4b00
Add Helix Fit to Quadruplet GPU Producer
rovere Sep 5, 2018
16 changes: 16 additions & 0 deletions DataFormats/GeometrySurface/interface/SOARotation.h
@@ -139,6 +139,22 @@ class SOAFrame {
ux+=px; uy+=py; uz+=pz;
}

constexpr inline
void toGlobal(
T cxx,
T cxy,
T cyy,
T * gl) const {

auto const & r = rot;
gl[0] = r.xx()*(r.xx()*cxx+r.yx()*cxy) + r.yx()*(r.xx()*cxy+r.yx()*cyy);
gl[1] = r.xx()*(r.xy()*cxx+r.yy()*cxy) + r.yx()*(r.xy()*cxy+r.yy()*cyy);
gl[2] = r.xy()*(r.xy()*cxx+r.yy()*cxy) + r.yy()*(r.xy()*cxy+r.yy()*cyy);
gl[3] = r.xx()*(r.xz()*cxx+r.yz()*cxy) + r.yx()*(r.xz()*cxy+r.yz()*cyy);
gl[4] = r.xy()*(r.xz()*cxx+r.yz()*cxy) + r.yy()*(r.xz()*cxy+r.yz()*cyy);
gl[5] = r.xz()*(r.xz()*cxx+r.yz()*cxy) + r.yz()*(r.xz()*cxy+r.yz()*cyy);
}


constexpr inline
T x() const { return px; }
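The six expressions above are the expansion of G = M^T * C * M, where C is the 2x2 local covariance (cxx, cxy; cxy, cyy) and M is the 2x3 block formed by the first two rows of the rotation; the result is the symmetric 3x3 global covariance packed as {xx, xy, yy, xz, yz, zz}. A host-side reference sketch (not part of this PR; the function name and the plain-array interface are illustrative) that can be used to cross-check the hand-expanded formulas:

#include <array>

// r[a][i]: element i of rotation row a, i.e. r[0] = {xx, xy, xz}, r[1] = {yx, yy, yz}
std::array<float, 6> toGlobalCovReference(std::array<std::array<float, 3>, 2> const& r,
                                          float cxx, float cxy, float cyy) {
  float const c[2][2] = {{cxx, cxy}, {cxy, cyy}};
  float g[3][3] = {};
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 3; ++j)
      for (int a = 0; a < 2; ++a)
        for (int b = 0; b < 2; ++b)
          g[i][j] += r[a][i] * c[a][b] * r[b][j];
  // same packing as gl[] above
  return {g[0][0], g[0][1], g[1][1], g[0][2], g[1][2], g[2][2]};
}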
6 changes: 4 additions & 2 deletions DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu
@@ -6,13 +6,14 @@ __global__
void toGlobal(SOAFrame<float> const * frame,
float const * xl, float const * yl,
float * x, float * y, float * z,
float const * le, float * ge,
uint32_t n)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= n) return;

frame[0].toGlobal(xl[i],yl[i],x[i],y[i],z[i]);

frame[0].toGlobal(le[3*i],le[3*i+1],le[3*i+2],ge+6*i);

}

@@ -24,6 +25,7 @@ void toGlobal(SOAFrame<float> const * frame,
void toGlobalWrapper(SOAFrame<float> const * frame,
float const * xl, float const * yl,
float * x, float * y, float * z,
float const * le, float * ge,
uint32_t n) {

int threadsPerBlock = 256;
@@ -35,7 +37,7 @@ void toGlobalWrapper(SOAFrame<float> const * frame,
cuda::launch(
toGlobal,
{ blocksPerGrid, threadsPerBlock },
frame, xl, yl, x, y, z,
frame, xl, yl, x, y, z, le, ge,
n
);

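For reference, the per-hit packing used by the kernel above: le carries the three independent elements of the 2x2 local covariance of hit i at le[3*i .. 3*i+2] (cxx, cxy, cyy), and ge receives the six packed elements of the global covariance at ge[6*i .. 6*i+5]. A minimal single-threaded equivalent of the kernel body, useful as a CPU cross-check (a sketch, not part of this PR; the function name is illustrative):

#include <cstdint>
#include "DataFormats/GeometrySurface/interface/SOARotation.h"

void toGlobalOnHost(SOAFrame<float> const& frame,
                    float const* xl, float const* yl,
                    float* x, float* y, float* z,
                    float const* le, float* ge,
                    uint32_t n) {
  for (uint32_t i = 0; i < n; ++i) {
    // position: local (x,y) on the surface -> global (x,y,z)
    frame.toGlobal(xl[i], yl[i], x[i], y[i], z[i]);
    // covariance: 3 packed local elements -> 6 packed global elements
    frame.toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], ge + 6 * i);
  }
}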
22 changes: 18 additions & 4 deletions DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp
@@ -6,6 +6,7 @@
void toGlobalWrapper(SOAFrame<float> const * frame,
float const * xl, float const * yl,
float * x, float * y, float * z,
float const * le, float * ge,
uint32_t n);


@@ -58,9 +59,10 @@ int main(void)

float xl[size],yl[size];
float x[size],y[size],z[size];
// float xh[size],yh[size],zh[size];


// errors
float le[3*size];
float ge[6*size];


auto current_device = cuda::device::current::get();
@@ -71,6 +73,10 @@
auto d_y = cuda::memory::device::make_unique<float[]>(current_device, size);
auto d_z = cuda::memory::device::make_unique<float[]>(current_device, size);

auto d_le = cuda::memory::device::make_unique<float[]>(current_device, 3*size);
auto d_ge = cuda::memory::device::make_unique<float[]>(current_device, 6*size);


double a = 0.01;
double ca = std::cos(a);
double sa = std::sin(a);
@@ -95,21 +101,29 @@



for (auto i=0U; i<size; ++i) xl[i]=yl[i] = 0.1f*float(i)-float(size/2);
for (auto i=0U; i<size; ++i) {
xl[i]=yl[i] = 0.1f*float(i)-float(size/2);
le[3*i] = 0.01f; le[3*i+2] = (i>size/2) ? 1.f : 0.04f;
le[3*i+1] = 0.f; // off-diagonal local covariance term
}
std::random_shuffle(xl,xl+size);
std::random_shuffle(yl,yl+size);

cuda::memory::copy(d_xl.get(), xl, size32);
cuda::memory::copy(d_yl.get(), yl, size32);
cuda::memory::copy(d_le.get(), le, 3*size32);


toGlobalWrapper((SFrame const *)(d_sf.get()), d_xl.get(), d_yl.get(), d_x.get(), d_y.get(), d_z.get(),
size
d_le.get(), d_ge.get(), size
);

cuda::memory::copy(x,d_x.get(), size32);
cuda::memory::copy(y,d_y.get(), size32);
cuda::memory::copy(z,d_z.get(), size32);
cuda::memory::copy(ge,d_ge.get(), 6*size32);


float eps=0.;
for (auto i=0U; i<size; ++i) {
auto gp = f1.toGlobal(LocalPoint(xl[i],yl[i]));
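The check shown above compares only the transformed positions; a natural extension (not part of this PR) is to compare the covariances returned in ge against the same constexpr toGlobal overload evaluated on the host. In the sketch below, sf1 stands for the host-side SOAFrame<float> that was copied into d_sf; that name is not visible in the excerpt, so treat it as hypothetical, and <algorithm>, <cmath> and <iostream> are assumed to be included:

float epsErr = 0.f;
for (auto i = 0U; i < size; ++i) {
  float geRef[6];
  sf1.toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], geRef);   // host reference
  for (int k = 0; k < 6; ++k)
    epsErr = std::max(epsErr, std::abs(geRef[k] - ge[6 * i + k]));
}
std::cout << "max covariance deviation " << epsErr << std::endl;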
60 changes: 49 additions & 11 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -4,13 +4,15 @@
#include <cassert>
#include <cstdint>
#include <algorithm>
#include <type_traits>
#ifndef __CUDA_ARCH__
#include <atomic>
#endif // __CUDA_ARCH__

#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"


#ifdef __CUDACC__
namespace cudautils {

@@ -37,14 +39,14 @@ namespace cudautils {
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
h[ih].fill(v, i);
h[ih].fill(v[i], i);
}

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * h, T const * v, uint32_t size) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < size) h->fill(v, i);
if(i < size) h->fill(v[i], i);
}

template<typename Histo>
@@ -73,7 +75,32 @@ namespace cudautils {
} // namespace cudautils
#endif

template<typename T, uint32_t N, uint32_t M>

// iterate over the n bins left and right of the one containing "v",
// including the spill bin
template<typename Hist, typename V, typename Func>
__host__ __device__
void forEachInBins(Hist const & hist, V value, int n, Func func) {
int bs = hist.bin(value);
int be = std::min(int(hist.nbins()),bs+n+1);
bs = std::max(0,bs-n);
assert(be>bs);
for (auto b=bs; b<be; ++b){
for (auto pj=hist.begin(b);pj<hist.end(b);++pj) {
func(*pj);
}}
for (auto pj=hist.beginSpill();pj<hist.endSpill();++pj)
func(*pj);
}


template<
typename T, // the type of the discretized input values
uint32_t N, // number of bins, in bits (nbins() == 2^N)
uint32_t M, // max number of elements a bin can contain, in bits (binSize() == 2^M)
uint32_t S=sizeof(T) * 8, // number of significant bits in T
typename I=uint32_t // type stored in the container (usually an index in a vector of the input values)
>
class HistoContainer {
public:
#ifdef __CUDACC__
@@ -82,14 +109,16 @@ class HistoContainer {
using Counter = std::atomic<uint32_t>;
#endif

static constexpr uint32_t sizeT() { return sizeof(T) * 8; }
using index_type = I;
using UT = typename std::make_unsigned<T>::type;
static constexpr uint32_t sizeT() { return S; }
static constexpr uint32_t nbins() { return 1 << N; }
static constexpr uint32_t shift() { return sizeT() - N; }
static constexpr uint32_t mask() { return nbins() - 1; }
static constexpr uint32_t binSize() { return 1 << M; }
static constexpr uint32_t spillSize() { return 4 * binSize(); }
static constexpr uint32_t spillSize() { return 16 * binSize(); }

static constexpr uint32_t bin(T t) {
static constexpr UT bin(T t) {
return (t >> shift()) & mask();
}

@@ -109,16 +138,17 @@ class HistoContainer {
}

__host__ __device__
void fill(T const * t, uint32_t j) {
auto b = bin(t[j]);
void fill(T t, index_type j) {
UT b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(n[b]);
if (w < binSize()) {
bins[b * binSize() + w] = j;
} else {
auto w = atomicIncrement(nspills);
if (w < spillSize())
spillBin[w] = j;
}
}
}

constexpr bool fullSpill() const {
@@ -141,10 +171,18 @@
return n[b];
}

uint32_t bins[nbins()*binSize()];
constexpr auto const * beginSpill() const {
return spillBin;
}

constexpr auto const * endSpill() const {
return beginSpill() + std::min(spillSize(), uint32_t(nspills));
}

Counter n[nbins()];
uint32_t spillBin[spillSize()];
Counter nspills;
index_type bins[nbins()*binSize()];
index_type spillBin[spillSize()];
};

#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
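The two new template parameters make the container usable beyond full-width integer keys: S restricts the binning to the S significant bits actually carried by T, and I selects the index type stored in the bins and in the spill bin. A minimal host-side usage sketch (not part of this PR; the alias, function name and parameter choices are illustrative), histogramming 10-bit values into 2^7 bins of up to 2^8 uint16_t indices each, then visiting a value's bin, its immediate neighbours and the spill bin with forEachInBins:

#include <cassert>
#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

// T, N (bins, in bits), M (bin capacity, in bits), S (significant bits), I (stored index type)
using Hist10 = HistoContainer<uint16_t, 7, 8, 10, uint16_t>;

uint32_t countNeighbours(uint16_t const* values, uint16_t nValues) {  // values assumed < 2^10
  Hist10 h;
  h.zero();
  for (uint16_t i = 0; i < nValues; ++i)
    h.fill(values[i], i);              // fill() now takes the value itself, not the whole array
  uint32_t visited = 0;
  // bin of values[0], one bin on each side, plus whatever landed in the spill bin
  forEachInBins(h, values[0], 1, [&](uint16_t idx) {
    assert(idx < nValues);             // idx is an index into "values"
    ++visited;
  });
  return visited;
}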
75 changes: 47 additions & 28 deletions HeterogeneousCore/CUDAUtilities/interface/radixSort.h
@@ -4,17 +4,56 @@
#include<cstdint>
#include<cassert>


template<typename T,
typename std::enable_if<std::is_unsigned<T>::value,T>::type* = nullptr
>
__device__
void reorderSigned(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) {
}

template<typename T,
typename std::enable_if<std::is_signed<T>::value,T>::type* = nullptr
>
__device__
void reorderSigned(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) {

//move negative first...

int32_t first = threadIdx.x;
__shared__ uint32_t firstNeg;
firstNeg=0;
__syncthreads();

// find first negative (for float ^ will not work...)
for (auto i=first; i<size-1; i+=blockDim.x) {
// if ( (int(a[ind[i]])*int(a[ind[i+1]])) <0 ) firstNeg=i+1;
if ( (a[ind[i]]^a[ind[i+1]]) < 0 ) firstNeg=i+1;
}

__syncthreads();

auto ii=first;
for (auto i=firstNeg+threadIdx.x; i<size; i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
ii= size-firstNeg +threadIdx.x;
assert(ii>=0);
for (auto i=first;i<firstNeg;i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
for (auto i=first; i<size; i+=blockDim.x) ind[i]=ind2[i];

}

template<typename T>
__device__
void radixSort(T * a, uint16_t * ind, uint32_t size) {
void radixSort(T const * a, uint16_t * ind, uint32_t size) {

constexpr int d = 8, w = 8*sizeof(T);
constexpr int sb = 1<<d;

constexpr int MaxSize = 256*32;
__shared__ uint16_t ind2[MaxSize];
__shared__ int32_t c[sb], ct[sb], cu[sb];
__shared__ uint32_t firstNeg;

__shared__ int ibs;
__shared__ int p;
@@ -25,8 +64,6 @@ void radixSort(T * a, uint16_t * ind, uint32_t size) {

// bool debug = false; // threadIdx.x==0 && blockIdx.x==5;

firstNeg=0;

p = 0;

auto j = ind;
@@ -118,32 +155,14 @@ void radixSort(T * a, uint16_t * ind, uint32_t size) {

}

// w/d is even so ind is correct
assert(j==ind);
__syncthreads();


if (8==w) // int8
for (auto i=first; i<size; i+=blockDim.x) ind[i]=ind2[i];
else
assert(j==ind); // w/d is even so ind is correct

// now move negative first...
// find first negative (for float ^ will not work...)
for (auto i=first; i<size-1; i+=blockDim.x) {
// if ( (int(a[ind[i]])*int(a[ind[i+1]])) <0 ) firstNeg=i+1;
if ( (a[ind[i]]^a[ind[i+1]]) < 0 ) firstNeg=i+1;
}

__syncthreads();
// assert(firstNeg>0); not necessarily true if all positive !

auto ii=first;
for (auto i=firstNeg+threadIdx.x; i<size; i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
ii= size-firstNeg +threadIdx.x;
assert(ii>=0);
for (auto i=first;i<firstNeg;i+=blockDim.x) { ind2[ii] = ind[i]; ii+=blockDim.x; }
__syncthreads();
for (auto i=first; i<size; i+=blockDim.x) ind[i]=ind2[i];
// now move negative first... (if signed)
reorderSigned(a,ind,ind2,size);


}


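The sign handling is now factored out of radixSort into reorderSigned, selected via SFINAE so the extra pass is only compiled for signed T (the unsigned overload above is an intentional no-op). After the unsigned radix passes, all entries pointing at negative values sit in one contiguous block at the end of the index array, and for signed integers a ^ b is negative exactly when a and b differ in their sign bit, so scanning adjacent sorted elements finds the boundary that reorderSigned then rotates to the front. A tiny host-side illustration of the sign-change test (not part of this PR):

#include <cassert>
#include <cstdint>

int main() {
  int16_t a = 12, b = 7, c = -3, d = -9;
  assert((a ^ c) < 0);    // opposite signs: the XOR has its sign bit set
  assert((a ^ b) >= 0);   // both positive: sign bit clear
  assert((c ^ d) >= 0);   // both negative: the sign bits cancel
  return 0;
}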
30 changes: 29 additions & 1 deletion HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp
@@ -22,7 +22,7 @@ void go() {
for (int it=0; it<5; ++it) {
for (long long j = 0; j < N; j++) v[j]=rgen(eng);
h.zero();
for (long long j = 0; j < N; j++) h.fill(v,j);
for (long long j = 0; j < N; j++) h.fill(v[j],j);

std::cout << "nspills " << h.nspills << std::endl;

@@ -44,6 +44,34 @@ void go() {
}
}

for (long long j = 0; j < N; j++) {
auto b0 = h.bin(v[j]);
auto stot = h.endSpill()-h.beginSpill();
int w=0;
int tot=0;
auto ftest = [&](int k) {
assert(k>=0 && k<N);
tot++;
};
forEachInBins(h,v[j],w,ftest);
int rtot = h.end(b0)-h.begin(b0) + stot;
assert(tot==rtot);
w=1; tot=0;
forEachInBins(h,v[j],w,ftest);
int bp = b0+1;
int bm = b0-1;
if (bp<int(h.nbins())) rtot += h.end(bp)-h.begin(bp);
if (bm>=0) rtot += h.end(bm)-h.begin(bm);
assert(tot==rtot);
w=2; tot=0;
forEachInBins(h,v[j],w,ftest);
bp++;
bm--;
if (bp<int(h.nbins())) rtot += h.end(bp)-h.begin(bp);
if (bm>=0) rtot += h.end(bm)-h.begin(bm);
assert(tot==rtot);
}

}

int main() {