Added kernelclip AVX, SSE

awxkee committed Aug 4, 2024
1 parent 7bd4541 commit 823a308
Showing 12 changed files with 530 additions and 26 deletions.
6 changes: 3 additions & 3 deletions benches/gauss_bench/main.rs
@@ -1,9 +1,9 @@
-use criterion::{Criterion, criterion_group, criterion_main};
-use image::GenericImageView;
+use criterion::{criterion_group, criterion_main, Criterion};
 use image::io::Reader as ImageReader;
+use image::GenericImageView;
 use libblur::{EdgeMode, FastBlurChannels, ThreadingPolicy};
 use opencv::core::{
-    BORDER_DEFAULT, find_file, Mat, MatTraitConst, MatTraitConstManual, mean, Size, split, Vector,
+    find_file, mean, split, Mat, MatTraitConst, MatTraitConstManual, Size, Vector, BORDER_DEFAULT,
 };
 use opencv::imgcodecs::{imread, IMREAD_COLOR};

247 changes: 247 additions & 0 deletions src/lib/gaussian/avx/filter_vertical_f32.rs
@@ -0,0 +1,247 @@
// Copyright (c) Radzivon Bartoshyk. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use crate::gaussian::gaussian_filter::GaussianFilter;
use crate::unsafe_slice::UnsafeSlice;
use erydanos::{_mm256_prefer_fma_ps, _mm_prefer_fma_ps};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

pub fn gaussian_blur_vertical_pass_filter_f32_avx<T, const CHANNEL_CONFIGURATION: usize>(
    undef_src: &[T],
    src_stride: u32,
    undef_unsafe_dst: &UnsafeSlice<T>,
    dst_stride: u32,
    width: u32,
    _: u32,
    filter: &Vec<GaussianFilter>,
    start_y: u32,
    end_y: u32,
) {
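    // Note: the generic `T` is reinterpreted as f32 below, so despite the generic
    // signature this path expects f32 pixel data.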
    let src: &[f32] = unsafe { std::mem::transmute(undef_src) };
    let unsafe_dst: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undef_unsafe_dst) };

    let zeros = unsafe { _mm256_setzero_ps() };

    let total_length = width as usize * CHANNEL_CONFIGURATION;
    for y in start_y..end_y {
        let y_dst_shift = y as usize * dst_stride as usize;

        let mut cx = 0usize;

        let current_filter = unsafe { filter.get_unchecked(y as usize) };
        let filter_start = current_filter.start;
        let filter_weights = &current_filter.filter;

        unsafe {
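            // Widest path: eight AVX accumulators cover 64 f32 values per step of `cx`.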
            while cx + 64 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;
                let mut store2 = zeros;
                let mut store3 = zeros;
                let mut store4 = zeros;
                let mut store5 = zeros;
                let mut store6 = zeros;
                let mut store7 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm256_loadu_ps(s_ptr);
                    let px_1 = _mm256_loadu_ps(s_ptr.add(8));
                    let px_2 = _mm256_loadu_ps(s_ptr.add(16));
                    let px_3 = _mm256_loadu_ps(s_ptr.add(24));
                    let px_4 = _mm256_loadu_ps(s_ptr.add(32));
                    let px_5 = _mm256_loadu_ps(s_ptr.add(40));
                    let px_6 = _mm256_loadu_ps(s_ptr.add(48));
                    let px_7 = _mm256_loadu_ps(s_ptr.add(56));
                    store0 = _mm256_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm256_prefer_fma_ps(store1, px_1, f_weight);
                    store2 = _mm256_prefer_fma_ps(store2, px_2, f_weight);
                    store3 = _mm256_prefer_fma_ps(store3, px_3, f_weight);
                    store4 = _mm256_prefer_fma_ps(store4, px_4, f_weight);
                    store5 = _mm256_prefer_fma_ps(store5, px_5, f_weight);
                    store6 = _mm256_prefer_fma_ps(store6, px_6, f_weight);
                    store7 = _mm256_prefer_fma_ps(store7, px_7, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);
                _mm256_storeu_ps(dst_ptr.add(8), store1);
                _mm256_storeu_ps(dst_ptr.add(16), store2);
                _mm256_storeu_ps(dst_ptr.add(24), store3);
                _mm256_storeu_ps(dst_ptr.add(32), store4);
                _mm256_storeu_ps(dst_ptr.add(40), store5);
                _mm256_storeu_ps(dst_ptr.add(48), store6);
                _mm256_storeu_ps(dst_ptr.add(56), store7);

                cx += 64;
            }

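            // Remainder paths: the same accumulation at halved widths (32, 16, 8,
            // then 4 lanes) until fewer than four values remain.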
            while cx + 32 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;
                let mut store2 = zeros;
                let mut store3 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm256_loadu_ps(s_ptr);
                    let px_1 = _mm256_loadu_ps(s_ptr.add(8));
                    let px_2 = _mm256_loadu_ps(s_ptr.add(16));
                    let px_3 = _mm256_loadu_ps(s_ptr.add(24));
                    store0 = _mm256_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm256_prefer_fma_ps(store1, px_1, f_weight);
                    store2 = _mm256_prefer_fma_ps(store2, px_2, f_weight);
                    store3 = _mm256_prefer_fma_ps(store3, px_3, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);
                _mm256_storeu_ps(dst_ptr.add(8), store1);
                _mm256_storeu_ps(dst_ptr.add(16), store2);
                _mm256_storeu_ps(dst_ptr.add(24), store3);

                cx += 32;
            }

            while cx + 16 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm256_loadu_ps(s_ptr);
                    let px_1 = _mm256_loadu_ps(s_ptr.add(8));
                    store0 = _mm256_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm256_prefer_fma_ps(store1, px_1, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);
                _mm256_storeu_ps(dst_ptr.add(8), store1);

                cx += 16;
            }

            while cx + 8 < total_length {
                let mut store0 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px = _mm256_loadu_ps(s_ptr);
                    store0 = _mm256_prefer_fma_ps(store0, px, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);

                cx += 8;
            }

            while cx + 4 < total_length {
                let mut store0 = _mm_setzero_ps();

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let lo_lo = _mm_loadu_ps(s_ptr);
                    store0 = _mm_prefer_fma_ps(store0, lo_lo, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm_storeu_ps(dst_ptr, store0);

                cx += 4;
            }

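            // Scalar tail: one f32 at a time through the low SSE lane.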
            while cx < total_length {
                let mut store0 = _mm_setzero_ps();

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let f_pixel = _mm_setr_ps(s_ptr.read_unaligned(), 0., 0., 0.);
                    store0 = _mm_prefer_fma_ps(store0, f_pixel, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                let pixel = _mm_extract_ps::<0>(store0);
                (dst_ptr as *mut i32).write_unaligned(pixel);

                cx += 1;
            }
        }
    }
}
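For orientation, every SIMD path in the new file computes the same per-row vertical convolution. The following scalar sketch (not part of the commit) shows that computation; the `GaussianFilter` fields (`start`, `size`, `filter`) are assumed from how the AVX code reads them.

// Scalar reference for the vertical filter pass (illustrative sketch only).
struct GaussianFilter {
    start: usize,     // first source row covered by this row's edge-clipped kernel
    size: usize,      // number of kernel taps after clipping
    filter: Vec<f32>, // kernel weights
}

fn vertical_pass_scalar(
    src: &[f32],
    src_stride: usize,
    dst: &mut [f32],
    dst_stride: usize,
    total_length: usize, // width * channel count
    filters: &[GaussianFilter],
    start_y: usize,
    end_y: usize,
) {
    for y in start_y..end_y {
        let current = &filters[y];
        for cx in 0..total_length {
            let mut acc = 0f32;
            for j in 0..current.size {
                let py = current.start + j;
                acc += src[py * src_stride + cx] * current.filter[j];
            }
            dst[y * dst_stride + cx] = acc;
        }
    }
}

The AVX version unrolls this inner work across up to eight 256-bit registers and accumulates with `_mm256_prefer_fma_ps` from erydanos, which, as its name suggests, prefers a fused multiply-add when one is available.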
6 changes: 4 additions & 2 deletions src/lib/gaussian/avx/mod.rs
@@ -1,6 +1,8 @@
-mod vertical;
 mod utils;
+mod vertical;
 mod vertical_f32;
+mod filter_vertical_f32;
 
 pub use vertical::gaussian_blur_vertical_pass_impl_avx;
-pub use vertical_f32::gaussian_blur_vertical_pass_impl_f32_avx;
\ No newline at end of file
+pub use vertical_f32::gaussian_blur_vertical_pass_impl_f32_avx;
+pub use filter_vertical_f32::gaussian_blur_vertical_pass_filter_f32_avx;
13 changes: 11 additions & 2 deletions src/lib/gaussian/avx/utils.rs
@@ -64,6 +64,15 @@ pub(crate) unsafe fn load_u8_f32_fast<const CHANNELS_COUNT: usize>(ptr: *const u
             4 => u32::from_le_bytes([ptr.add(3).read_unaligned(), 0, 0, 0]),
             _ => 0,
         };
-        let v_int = _mm256_setr_epi32(u_first as i32, u_second as i32, u_third as i32, u_fourth as i32, 0,0,0,0);
+        let v_int = _mm256_setr_epi32(
+            u_first as i32,
+            u_second as i32,
+            u_third as i32,
+            u_fourth as i32,
+            0,
+            0,
+            0,
+            0,
+        );
         return _mm256_cvtepi32_ps(v_int);
     }
 }
4 changes: 3 additions & 1 deletion src/lib/gaussian/avx/vertical.rs
@@ -32,7 +32,9 @@ use std::arch::x86_64::*;

 use erydanos::_mm256_prefer_fma_ps;
 
-use crate::gaussian::avx::utils::{avx2_pack_u16, avx2_pack_u32, load_u8_f32_fast, load_u8_u32_one};
+use crate::gaussian::avx::utils::{
+    avx2_pack_u16, avx2_pack_u32, load_u8_f32_fast, load_u8_u32_one,
+};
 use crate::unsafe_slice::UnsafeSlice;
 
 pub fn gaussian_blur_vertical_pass_impl_avx<T, const CHANNEL_CONFIGURATION: usize>(
4 changes: 2 additions & 2 deletions src/lib/gaussian/avx/vertical_f32.rs
@@ -25,12 +25,12 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+use crate::unsafe_slice::UnsafeSlice;
+use erydanos::{_mm256_prefer_fma_ps, _mm_prefer_fma_ps};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
-use erydanos::{_mm256_prefer_fma_ps, _mm_prefer_fma_ps};
-use crate::unsafe_slice::UnsafeSlice;
 
 pub fn gaussian_blur_vertical_pass_impl_f32_avx<T, const CHANNEL_CONFIGURATION: usize>(
     undef_src: &[T],