Refactor SSE f32

awxkee · Jul 27, 2024 · e9779a6
1 parent b2b4113
commit e9779a6
Show file tree

Hide file tree

Showing 3 changed files with 252 additions and 215 deletions.
diff --git a/src/sse/mod.rs b/src/sse/mod.rs
@@ -48,6 +48,7 @@ mod utils;
 mod vertical_f16;
 mod vertical_u16;
 mod vertical_u8;
+mod vertical_f32;
 
 #[cfg(all(feature = "half", target_feature = "f16c"))]
 pub use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16};
@@ -74,6 +75,7 @@ pub use utils::*;
 pub use vertical_f16::convolve_vertical_rgb_sse_row_f16;
 pub use vertical_u16::convolve_vertical_rgb_sse_row_u16;
 pub use vertical_u8::convolve_vertical_sse_row;
+pub use vertical_f32::convolve_vertical_rgb_sse_row_f32;
 
 pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
     ((z << 6) | (y << 4) | (x << 2) | w) as i32

diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs
@@ -27,14 +27,15 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-use crate::filter_weights::{FilterBounds, FilterWeights};
-use crate::load_4_weights;
-use crate::sse::{_mm_prefer_fma_ps, shuffle};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
+use crate::filter_weights::FilterWeights;
+use crate::load_4_weights;
+use crate::sse::{_mm_prefer_fma_ps, shuffle};
+
 #[inline(always)]
 pub(crate) unsafe fn convolve_horizontal_parts_4_rgb_f32(
     start_x: usize,
@@ -363,215 +364,3 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
         }
     }
 }
-
-#[inline(always)]
-pub(crate) unsafe fn convolve_vertical_part_sse_16_f32(
-    start_y: usize,
-    start_x: usize,
-    src: *const f32,
-    src_stride: usize,
-    dst: *mut f32,
-    filter: *const f32,
-    bounds: &FilterBounds,
-) {
-    let mut store_0 = _mm_setzero_ps();
-    let mut store_1 = _mm_setzero_ps();
-    let mut store_2 = _mm_setzero_ps();
-    let mut store_3 = _mm_setzero_ps();
-
-    let px = start_x;
-
-    for j in 0..bounds.size {
-        let py = start_y + j;
-        let weight = unsafe { filter.add(j).read_unaligned() };
-        let v_weight = _mm_set1_ps(weight);
-        let src_ptr = src.add(src_stride * py);
-
-        let s_ptr = src_ptr.add(px);
-        let item_row_0 = _mm_loadu_ps(s_ptr);
-        let item_row_1 = _mm_loadu_ps(s_ptr.add(4));
-        let item_row_2 = _mm_loadu_ps(s_ptr.add(8));
-        let item_row_3 = _mm_loadu_ps(s_ptr.add(12));
-
-        store_0 = _mm_prefer_fma_ps(store_0, item_row_0, v_weight);
-        store_1 = _mm_prefer_fma_ps(store_1, item_row_1, v_weight);
-        store_2 = _mm_prefer_fma_ps(store_2, item_row_2, v_weight);
-        store_3 = _mm_prefer_fma_ps(store_3, item_row_3, v_weight);
-    }
-
-    let dst_ptr = dst.add(px);
-    _mm_storeu_ps(dst_ptr, store_0);
-    _mm_storeu_ps(dst_ptr.add(4), store_1);
-    _mm_storeu_ps(dst_ptr.add(8), store_2);
-    _mm_storeu_ps(dst_ptr.add(12), store_3);
-}
-
-#[inline(always)]
-pub(crate) unsafe fn convolve_vertical_part_sse_8_f32(
-    start_y: usize,
-    start_x: usize,
-    src: *const f32,
-    src_stride: usize,
-    dst: *mut f32,
-    filter: *const f32,
-    bounds: &FilterBounds,
-) {
-    let mut store_0 = _mm_setzero_ps();
-    let mut store_1 = _mm_setzero_ps();
-
-    let px = start_x;
-
-    for j in 0..bounds.size {
-        let py = start_y + j;
-        let weight = unsafe { filter.add(j).read_unaligned() };
-        let v_weight = _mm_set1_ps(weight);
-        let src_ptr = src.add(src_stride * py);
-
-        let s_ptr = src_ptr.add(px);
-        let item_row_0 = _mm_loadu_ps(s_ptr);
-        let item_row_1 = _mm_loadu_ps(s_ptr.add(4));
-
-        store_0 = _mm_prefer_fma_ps(store_0, item_row_0, v_weight);
-        store_1 = _mm_prefer_fma_ps(store_1, item_row_1, v_weight);
-    }
-
-    let dst_ptr = dst.add(px);
-    _mm_storeu_ps(dst_ptr, store_0);
-    _mm_storeu_ps(dst_ptr.add(4), store_1);
-}
-
-#[inline(always)]
-pub(crate) unsafe fn convolve_vertical_part_sse_4_f32(
-    start_y: usize,
-    start_x: usize,
-    src: *const f32,
-    src_stride: usize,
-    dst: *mut f32,
-    filter: *const f32,
-    bounds: &FilterBounds,
-) {
-    let mut store_0 = _mm_setzero_ps();
-
-    let px = start_x;
-
-    for j in 0..bounds.size {
-        let py = start_y + j;
-        let weight = unsafe { filter.add(j).read_unaligned() };
-        let v_weight = _mm_set1_ps(weight);
-        let src_ptr = src.add(src_stride * py);
-
-        let s_ptr = src_ptr.add(px);
-        let item_row_0 = _mm_loadu_ps(s_ptr);
-
-        store_0 = _mm_prefer_fma_ps(store_0, item_row_0, v_weight);
-    }
-
-    let dst_ptr = dst.add(px);
-    _mm_storeu_ps(dst_ptr, store_0);
-}
-
-#[inline(always)]
-pub(crate) unsafe fn convolve_vertical_part_sse_f32(
-    start_y: usize,
-    start_x: usize,
-    src: *const f32,
-    src_stride: usize,
-    dst: *mut f32,
-    filter: *const f32,
-    bounds: &FilterBounds,
-) {
-    let mut store_0 = _mm_setzero_ps();
-
-    let px = start_x;
-
-    for j in 0..bounds.size {
-        let py = start_y + j;
-        let weight = unsafe { filter.add(j).read_unaligned() };
-        let v_weight = _mm_set1_ps(weight);
-        let src_ptr = src.add(src_stride * py);
-
-        let s_ptr = src_ptr.add(px);
-        let item_row_0 = _mm_set1_ps(s_ptr.read_unaligned());
-
-        store_0 = _mm_prefer_fma_ps(store_0, item_row_0, v_weight);
-    }
-
-    let dst_ptr = dst.add(px);
-    dst_ptr.write_unaligned(f32::from_bits(_mm_extract_ps::<0>(store_0) as u32));
-}
-
-#[inline(always)]
-pub(crate) fn convolve_vertical_rgb_sse_row_f32<const CHANNELS: usize>(
-    width: usize,
-    bounds: &FilterBounds,
-    unsafe_source_ptr_0: *const f32,
-    unsafe_destination_ptr_0: *mut f32,
-    src_stride: usize,
-    weight_ptr: *const f32,
-) {
-    let mut cx = 0usize;
-    let dst_width = CHANNELS * width;
-
-    while cx + 16 < dst_width {
-        unsafe {
-            convolve_vertical_part_sse_16_f32(
-                bounds.start,
-                cx,
-                unsafe_source_ptr_0,
-                src_stride,
-                unsafe_destination_ptr_0,
-                weight_ptr,
-                bounds,
-            );
-        }
-
-        cx += 16;
-    }
-
-    while cx + 8 < dst_width {
-        unsafe {
-            convolve_vertical_part_sse_8_f32(
-                bounds.start,
-                cx,
-                unsafe_source_ptr_0,
-                src_stride,
-                unsafe_destination_ptr_0,
-                weight_ptr,
-                bounds,
-            );
-        }
-
-        cx += 8;
-    }
-
-    while cx + 4 < dst_width {
-        unsafe {
-            convolve_vertical_part_sse_4_f32(
-                bounds.start,
-                cx,
-                unsafe_source_ptr_0,
-                src_stride,
-                unsafe_destination_ptr_0,
-                weight_ptr,
-                bounds,
-            );
-        }
-
-        cx += 4;
-    }
-
-    while cx < dst_width {
-        unsafe {
-            convolve_vertical_part_sse_f32(
-                bounds.start,
-                cx,
-                unsafe_source_ptr_0,
-                src_stride,
-                unsafe_destination_ptr_0,
-                weight_ptr,
-                bounds,
-            );
-        }
-        cx += 1;
-    }
-}