Merge pull request cms-sw#165 from osschar/avx2

Add support for AVX2
slava77 · Oct 10, 2018 · 825e264 · 825e264
2 parents 9e2609b + fff2432
commit 825e264
Show file tree

Hide file tree

Showing 9 changed files with 94 additions and 29 deletions.
diff --git a/Config.h b/Config.h
@@ -406,6 +406,8 @@ namespace Config
       #define MPT_SIZE 16
     #elif defined USE_CUDA
       #define MPT_SIZE 8
+    #elif defined(__AVX__) || defined(__AVX2__)
+      #define MPT_SIZE 8
     #else
       #define MPT_SIZE 8
     #endif

diff --git a/Makefile b/Makefile
@@ -94,3 +94,6 @@ endif
 
 echo:
 	-echo CXX = ${CXX}
+
+echo_cc_defs:
+	${CXX} -dM -E -mavx2 - < /dev/null
diff --git a/Makefile.config b/Makefile.config
@@ -21,6 +21,8 @@
 #KNC_BUILD := 1
 # Define to build for AVX_512, the new mic (KNL) and latest generation Xeons.
 #AVX_512 := 1
+# Define to build for AVX2
+#AVX2    := 1
 
 # 0. Use gcc-5 from MacPorts on OSX
 # OSXGCC5    := 1
@@ -71,9 +73,12 @@ OPT := -g -O3
 # 4. Vectorization settings
 ifdef AVX_512
 VEC_GCC  := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx
-VEC_ICC  :=  -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
+VEC_ICC  := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
+else ifdef AVX2
+VEC_GCC  := -mavx2
+VEC_ICC  := -mavx2
 else
-VEC_GCC  := -msse3 # -mavx # -fopt-info-vec-all
+VEC_GCC  := -mavx # -fopt-info-vec-all
 VEC_ICC  := -mavx
 endif
 VEC_MIC  := -mmic

diff --git a/Matriplex/Matriplex.h b/Matriplex/Matriplex.h
@@ -157,6 +157,26 @@ class Matriplex
    }
    */
 
+#elif defined(AVX2_INTRINSICS)
+
+   void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N)
+   {
+      const __m256 src = { 0 };
+
+      __m256i k        = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
+      __m256i k_sel    = _mm256_set1_epi32(N_proc);
+      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
+
+      k = k_master;
+      for (int i = 0; i < kSize; ++i, arr += sizeof(T))
+      {
+         __m256 reg = _mm256_mask_i32gather_ps(src, (float*) arr, vi, (__m256) k, 1);
+         // Restore mask (docs say gather clears it but it doesn't seem to).
+         k = k_master;
+         _mm256_maskstore_ps((float*) &fArray[i*N], k, reg);
+      }
+   }
+
 #else
 
    void SlurpIn(const char *arr, int vi[N], const int N_proc = N)

diff --git a/Matriplex/MatriplexCommon.h b/Matriplex/MatriplexCommon.h
@@ -28,13 +28,30 @@
     #define MPLEX_INTRINSICS_WIDTH_BYTES  64
     #define MPLEX_INTRINSICS_WIDTH_BITS  512
     #define MIC_INTRINSICS
+    #define GATHER_INTRINSICS
+    #define GATHER_IDX_LOAD(name, arr)  __m512i name = _mm512_load_epi32(arr);
 
     #define LD(a, i)      _mm512_load_ps(&a[i*N+n])
     #define ST(a, i, r)   _mm512_store_ps(&a[i*N+n], r)
-    #define ADD(a, b)     _mm512_add_ps(a, b) 
+    #define ADD(a, b)     _mm512_add_ps(a, b)
     #define MUL(a, b)     _mm512_mul_ps(a, b)
     #define FMA(a, b, v)  _mm512_fmadd_ps(a, b, v)
 
+  #elif defined(__AVX2__)
+
+    typedef __m256 IntrVec_t;
+    #define MPLEX_INTRINSICS_WIDTH_BYTES  32
+    #define MPLEX_INTRINSICS_WIDTH_BITS  256
+    #define AVX2_INTRINSICS
+    #define GATHER_INTRINSICS
+    #define GATHER_IDX_LOAD(name, arr)  __m256i name = _mm256_load_epi32(arr);
+
+    #define LD(a, i)      _mm256_load_ps(&a[i*N+n])
+    #define ST(a, i, r)   _mm256_store_ps(&a[i*N+n], r)
+    #define ADD(a, b)     _mm256_add_ps(a, b)
+    #define MUL(a, b)     _mm256_mul_ps(a, b)
+    #define FMA(a, b, v)  _mm256_fmadd_ps(a, b, v)
+
   #elif defined(__AVX__)
 
     typedef __m256 IntrVec_t;
@@ -44,7 +61,7 @@
 
     #define LD(a, i)      _mm256_load_ps(&a[i*N+n])
     #define ST(a, i, r)   _mm256_store_ps(&a[i*N+n], r)
-    #define ADD(a, b)     _mm256_add_ps(a, b) 
+    #define ADD(a, b)     _mm256_add_ps(a, b)
     #define MUL(a, b)     _mm256_mul_ps(a, b)
     // #define FMA(a, b, v)  { __m256 temp = _mm256_mul_ps(a, b); v = _mm256_add_ps(temp, v); }
     inline __m256 FMA(const __m256 &a, const __m256 &b, const __m256 &v)

diff --git a/Matriplex/MatriplexSym.h b/Matriplex/MatriplexSym.h
@@ -177,6 +177,26 @@ class MatriplexSym
    }
    */
 
+#elif defined(AVX2_INTRINSICS)
+
+   void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N)
+   {
+      const __m256 src = { 0 };
+
+      __m256i k        = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
+      __m256i k_sel    = _mm256_set1_epi32(N_proc);
+      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
+
+      k = k_master;
+      for (int i = 0; i < kSize; ++i, arr += sizeof(T))
+      {
+         __m256 reg = _mm256_mask_i32gather_ps(src, (float *) arr, vi, (__m256) k, 1);
+         // Restore mask (docs say gather clears it but it doesn't seem to).
+         k = k_master;
+         _mm256_maskstore_ps((float*) &fArray[i*N], k, reg);
+      }
+   }
+
 #else
 
    void SlurpIn(const char *arr, int vi[N], const int N_proc = N)

diff --git a/mkFit/MkFinder.cc b/mkFit/MkFinder.cc
@@ -468,8 +468,8 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc,
         idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
 #endif
 
 #ifndef NO_PREFETCH
@@ -499,7 +499,7 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc,
 
 #else //NO_GATHER
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else
@@ -672,8 +672,8 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits,
 	idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
 #endif
 
     // Prefetch to L2 the hits we'll (probably) process after two loops iterations.
@@ -686,7 +686,7 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits,
       }
     }
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else
@@ -851,8 +851,8 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC
         idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
 #endif
 
     // Prefetch to L2 the hits we'll (probably) process after two loops iterations.
@@ -865,7 +865,7 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC
       }
     }
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else
@@ -1054,8 +1054,8 @@ void MkFinder::BkFitInputTracks(TrackVec& cands, int beg, int end)
 
   Chi2.SetVal(0);
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi, N_proc);
   Par[iC].SlurpIn(varr + off_param, vi, N_proc);
 #else
@@ -1096,8 +1096,8 @@ void MkFinder::BkFitInputTracks(EventOfCombCandidates& eocss, int beg, int end)
 
   Chi2.SetVal(0);
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi, N_proc);
   Par[iC].SlurpIn(varr + off_param, vi, N_proc);
 #else

diff --git a/mkFit/MkFinderFV.cc b/mkFit/MkFinderFV.cc
@@ -314,9 +314,6 @@ void MkFinderFV<nseeds, ncands>::FindCandidates(const LayerOfHits &layer_of_hits
         idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
-#endif
 
     // Prefetch to L2 the hits we'll (probably) process after two loops iterations.
     // Ideally this would be initiated before coming here, for whole bunch_of_hits.m_hits vector.
@@ -328,7 +325,8 @@ void MkFinderFV<nseeds, ncands>::FindCandidates(const LayerOfHits &layer_of_hits
       }
     }
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else

diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc
@@ -197,8 +197,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
     Chi2(itrack, 0, 0) = trk.chi2();
   }
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi);
   Par[iC].SlurpIn(varr + off_param, vi);
 #else
@@ -228,8 +228,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
       HoTArr[hi](itrack, 0, 0) = tracks[i].getHitOnTrack(hi);
     }
 
-#ifdef MIC_INTRINSICS
-    __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+    GATHER_IDX_LOAD(vi, idx);
     msErr[hi].SlurpIn(varr + off_error, vi);
     msPar[hi].SlurpIn(varr + off_param, vi);
 #else
@@ -388,8 +388,8 @@ void MkFitter::InputTracksForFit(const std::vector<Track>& tracks,
 
   // for ( ; itrack < NN; ++itrack)  {  idx[itrack] = idx[0];  }
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi, N_proc);
   Par[iC].SlurpIn(varr + off_param, vi, N_proc);
   for (int ll = 0; ll < Config::nLayers; ++ll)
@@ -440,8 +440,8 @@ void MkFitter::FitTracksWithInterSlurp(const std::vector<HitVec>& layersohits,
     }
     for (int i = N_proc; i < NN; ++i)  {  idx[i] = idx[0];  }
 
-#ifdef MIC_INTRINSICS
-    __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+    GATHER_IDX_LOAD(vi, idx);
     msPar[0].SlurpIn(varr + off_param, vi, N_proc);
     msErr[0].SlurpIn(varr + off_error, vi, N_proc);
 #else