Skip to content

Commit

Permalink
Merge pull request cms-sw#165 from osschar/avx2
Browse files Browse the repository at this point in the history
Add support for AVX2
  • Loading branch information
kmcdermo authored Oct 10, 2018
2 parents 9e2609b + fff2432 commit 825e264
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 29 deletions.
2 changes: 2 additions & 0 deletions Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,8 @@ namespace Config
#define MPT_SIZE 16
#elif defined USE_CUDA
#define MPT_SIZE 8
#elif defined(__AVX__) || defined(__AVX2__)
#define MPT_SIZE 8
#else
#define MPT_SIZE 8
#endif
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,6 @@ endif

echo:
-echo CXX = ${CXX}

echo_cc_defs:
${CXX} -dM -E -mavx2 - < /dev/null
9 changes: 7 additions & 2 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#KNC_BUILD := 1
# Define to build for AVX_512, the new mic (KNL) and latest generation Xeons.
#AVX_512 := 1
# Define to build for AVX2
#AVX2 := 1

# 0. Use gcc-5 from MacPorts on OSX
# OSXGCC5 := 1
Expand Down Expand Up @@ -71,9 +73,12 @@ OPT := -g -O3
# 4. Vectorization settings
ifdef AVX_512
VEC_GCC := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx
VEC_ICC := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
VEC_ICC := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
else ifdef AVX2
VEC_GCC := -mavx2
VEC_ICC := -mavx2
else
VEC_GCC := -msse3 # -mavx # -fopt-info-vec-all
VEC_GCC := -mavx # -fopt-info-vec-all
VEC_ICC := -mavx
endif
VEC_MIC := -mmic
Expand Down
20 changes: 20 additions & 0 deletions Matriplex/Matriplex.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,26 @@ class Matriplex
}
*/

#elif defined(AVX2_INTRINSICS)

void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N)
{
const __m256 src = { 0 };

__m256i k = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
__m256i k_sel = _mm256_set1_epi32(N_proc);
__m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

k = k_master;
for (int i = 0; i < kSize; ++i, arr += sizeof(T))
{
__m256 reg = _mm256_mask_i32gather_ps(src, (float*) arr, vi, (__m256) k, 1);
// Restore mask (docs say gather clears it but it doesn't seem to).
k = k_master;
_mm256_maskstore_ps((float*) &fArray[i*N], k, reg);
}
}

#else

void SlurpIn(const char *arr, int vi[N], const int N_proc = N)
Expand Down
21 changes: 19 additions & 2 deletions Matriplex/MatriplexCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,30 @@
#define MPLEX_INTRINSICS_WIDTH_BYTES 64
#define MPLEX_INTRINSICS_WIDTH_BITS 512
#define MIC_INTRINSICS
#define GATHER_INTRINSICS
#define GATHER_IDX_LOAD(name, arr) __m512i name = _mm512_load_epi32(arr);

#define LD(a, i) _mm512_load_ps(&a[i*N+n])
#define ST(a, i, r) _mm512_store_ps(&a[i*N+n], r)
#define ADD(a, b) _mm512_add_ps(a, b)
#define ADD(a, b) _mm512_add_ps(a, b)
#define MUL(a, b) _mm512_mul_ps(a, b)
#define FMA(a, b, v) _mm512_fmadd_ps(a, b, v)

#elif defined(__AVX2__)

typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX2_INTRINSICS
#define GATHER_INTRINSICS
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_epi32(arr);

#define LD(a, i) _mm256_load_ps(&a[i*N+n])
#define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r)
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)
#define FMA(a, b, v) _mm256_fmadd_ps(a, b, v)

#elif defined(__AVX__)

typedef __m256 IntrVec_t;
Expand All @@ -44,7 +61,7 @@

#define LD(a, i) _mm256_load_ps(&a[i*N+n])
#define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r)
#define ADD(a, b) _mm256_add_ps(a, b)
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)
// #define FMA(a, b, v) { __m256 temp = _mm256_mul_ps(a, b); v = _mm256_add_ps(temp, v); }
inline __m256 FMA(const __m256 &a, const __m256 &b, const __m256 &v)
Expand Down
20 changes: 20 additions & 0 deletions Matriplex/MatriplexSym.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,26 @@ class MatriplexSym
}
*/

#elif defined(AVX2_INTRINSICS)

void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N)
{
const __m256 src = { 0 };

__m256i k = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
__m256i k_sel = _mm256_set1_epi32(N_proc);
__m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

k = k_master;
for (int i = 0; i < kSize; ++i, arr += sizeof(T))
{
__m256 reg = _mm256_mask_i32gather_ps(src, (float *) arr, vi, (__m256) k, 1);
// Restore mask (docs say gather clears it but it doesn't seem to).
k = k_master;
_mm256_maskstore_ps((float*) &fArray[i*N], k, reg);
}
}

#else

void SlurpIn(const char *arr, int vi[N], const int N_proc = N)
Expand Down
26 changes: 13 additions & 13 deletions mkFit/MkFinder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -468,8 +468,8 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc,
idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
}
}
#if defined(MIC_INTRINSICS)
__m512i vi = _mm512_load_epi32(idx);
#if defined(GATHER_INTRINSICS)
GATHER_IDX_LOAD(vi, idx);
#endif

#ifndef NO_PREFETCH
Expand Down Expand Up @@ -499,7 +499,7 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc,

#else //NO_GATHER

#if defined(MIC_INTRINSICS)
#if defined(GATHER_INTRINSICS)
msErr.SlurpIn(varr + off_error, vi);
msPar.SlurpIn(varr + off_param, vi);
#else
Expand Down Expand Up @@ -672,8 +672,8 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits,
idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
}
}
#if defined(MIC_INTRINSICS)
__m512i vi = _mm512_load_epi32(idx);
#if defined(GATHER_INTRINSICS)
GATHER_IDX_LOAD(vi, idx);
#endif

// Prefetch to L2 the hits we'll (probably) process after two loops iterations.
Expand All @@ -686,7 +686,7 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits,
}
}

#if defined(MIC_INTRINSICS)
#if defined(GATHER_INTRINSICS)
msErr.SlurpIn(varr + off_error, vi);
msPar.SlurpIn(varr + off_param, vi);
#else
Expand Down Expand Up @@ -851,8 +851,8 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC
idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
}
}
#if defined(MIC_INTRINSICS)
__m512i vi = _mm512_load_epi32(idx);
#if defined(GATHER_INTRINSICS)
GATHER_IDX_LOAD(vi, idx);
#endif

// Prefetch to L2 the hits we'll (probably) process after two loops iterations.
Expand All @@ -865,7 +865,7 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC
}
}

#if defined(MIC_INTRINSICS)
#if defined(GATHER_INTRINSICS)
msErr.SlurpIn(varr + off_error, vi);
msPar.SlurpIn(varr + off_param, vi);
#else
Expand Down Expand Up @@ -1054,8 +1054,8 @@ void MkFinder::BkFitInputTracks(TrackVec& cands, int beg, int end)

Chi2.SetVal(0);

#ifdef MIC_INTRINSICS
__m512i vi = _mm512_load_epi32(idx);
#ifdef GATHER_INTRINSICS
GATHER_IDX_LOAD(vi, idx);
Err[iC].SlurpIn(varr + off_error, vi, N_proc);
Par[iC].SlurpIn(varr + off_param, vi, N_proc);
#else
Expand Down Expand Up @@ -1096,8 +1096,8 @@ void MkFinder::BkFitInputTracks(EventOfCombCandidates& eocss, int beg, int end)

Chi2.SetVal(0);

#ifdef MIC_INTRINSICS
__m512i vi = _mm512_load_epi32(idx);
#ifdef GATHER_INTRINSICS
GATHER_IDX_LOAD(vi, idx);
Err[iC].SlurpIn(varr + off_error, vi, N_proc);
Par[iC].SlurpIn(varr + off_param, vi, N_proc);
#else
Expand Down
6 changes: 2 additions & 4 deletions mkFit/MkFinderFV.cc
Original file line number Diff line number Diff line change
Expand Up @@ -314,9 +314,6 @@ void MkFinderFV<nseeds, ncands>::FindCandidates(const LayerOfHits &layer_of_hits
idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
}
}
#if defined(MIC_INTRINSICS)
__m512i vi = _mm512_load_epi32(idx);
#endif

// Prefetch to L2 the hits we'll (probably) process after two loops iterations.
// Ideally this would be initiated before coming here, for whole bunch_of_hits.m_hits vector.
Expand All @@ -328,7 +325,8 @@ void MkFinderFV<nseeds, ncands>::FindCandidates(const LayerOfHits &layer_of_hits
}
}

#if defined(MIC_INTRINSICS)
#if defined(GATHER_INTRINSICS)
GATHER_IDX_LOAD(vi, idx);
msErr.SlurpIn(varr + off_error, vi);
msPar.SlurpIn(varr + off_param, vi);
#else
Expand Down
16 changes: 8 additions & 8 deletions mkFit/MkFitter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>& tracks,
Chi2(itrack, 0, 0) = trk.chi2();
}

#ifdef MIC_INTRINSICS
__m512i vi = _mm512_load_epi32(idx);
#ifdef GATHER_INTRINSICS
GATHER_IDX_LOAD(vi, idx);
Err[iC].SlurpIn(varr + off_error, vi);
Par[iC].SlurpIn(varr + off_param, vi);
#else
Expand Down Expand Up @@ -228,8 +228,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>& tracks,
HoTArr[hi](itrack, 0, 0) = tracks[i].getHitOnTrack(hi);
}

#ifdef MIC_INTRINSICS
__m512i vi = _mm512_load_epi32(idx);
#ifdef GATHER_INTRINSICS
GATHER_IDX_LOAD(vi, idx);
msErr[hi].SlurpIn(varr + off_error, vi);
msPar[hi].SlurpIn(varr + off_param, vi);
#else
Expand Down Expand Up @@ -388,8 +388,8 @@ void MkFitter::InputTracksForFit(const std::vector<Track>& tracks,

// for ( ; itrack < NN; ++itrack) { idx[itrack] = idx[0]; }

#ifdef MIC_INTRINSICS
__m512i vi = _mm512_load_epi32(idx);
#ifdef GATHER_INTRINSICS
GATHER_IDX_LOAD(vi, idx);
Err[iC].SlurpIn(varr + off_error, vi, N_proc);
Par[iC].SlurpIn(varr + off_param, vi, N_proc);
for (int ll = 0; ll < Config::nLayers; ++ll)
Expand Down Expand Up @@ -440,8 +440,8 @@ void MkFitter::FitTracksWithInterSlurp(const std::vector<HitVec>& layersohits,
}
for (int i = N_proc; i < NN; ++i) { idx[i] = idx[0]; }

#ifdef MIC_INTRINSICS
__m512i vi = _mm512_load_epi32(idx);
#ifdef GATHER_INTRINSICS
GATHER_IDX_LOAD(vi, idx);
msPar[0].SlurpIn(varr + off_param, vi, N_proc);
msErr[0].SlurpIn(varr + off_error, vi, N_proc);
#else
Expand Down

0 comments on commit 825e264

Please sign in to comment.