Skip to content

Commit

Permalink
Refactor SSE f32
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jul 27, 2024
1 parent b2b4113 commit e9779a6
Show file tree
Hide file tree
Showing 3 changed files with 252 additions and 215 deletions.
2 changes: 2 additions & 0 deletions src/sse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ mod utils;
mod vertical_f16;
mod vertical_u16;
mod vertical_u8;
mod vertical_f32;

#[cfg(all(feature = "half", target_feature = "f16c"))]
pub use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16};
Expand All @@ -74,6 +75,7 @@ pub use utils::*;
pub use vertical_f16::convolve_vertical_rgb_sse_row_f16;
pub use vertical_u16::convolve_vertical_rgb_sse_row_u16;
pub use vertical_u8::convolve_vertical_sse_row;
pub use vertical_f32::convolve_vertical_rgb_sse_row_f32;

pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
((z << 6) | (y << 4) | (x << 2) | w) as i32
Expand Down
219 changes: 4 additions & 215 deletions src/sse/rgb_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use crate::filter_weights::{FilterBounds, FilterWeights};
use crate::load_4_weights;
use crate::sse::{_mm_prefer_fma_ps, shuffle};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::filter_weights::FilterWeights;
use crate::load_4_weights;
use crate::sse::{_mm_prefer_fma_ps, shuffle};

#[inline(always)]
pub(crate) unsafe fn convolve_horizontal_parts_4_rgb_f32(
start_x: usize,
Expand Down Expand Up @@ -363,215 +364,3 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
}
}
}

/// Vertically convolves a strip of 16 consecutive `f32` values beginning at
/// element `start_x`.
///
/// Four zero-initialised SSE accumulators are updated once per filter tap:
/// for tap `j` in `0..bounds.size` the coefficient `filter[j]` is broadcast,
/// 16 values are loaded from row `start_y + j` of `src` (rows are `src_stride`
/// elements apart) and folded into the accumulators via `_mm_prefer_fma_ps`
/// (defined elsewhere; presumably `acc + value * coeff`). The four accumulators
/// are finally stored to `dst[start_x..start_x + 16]`.
///
/// # Safety
///
/// The caller must guarantee that:
/// - `filter` is valid for reading `bounds.size` values;
/// - each source row `start_y..start_y + bounds.size` is valid for reading
///   16 values starting at element `start_x`;
/// - `dst` is valid for writing 16 values starting at element `start_x`.
///
/// No alignment is required: all loads and stores are unaligned.
#[inline(always)]
pub(crate) unsafe fn convolve_vertical_part_sse_16_f32(
    start_y: usize,
    start_x: usize,
    src: *const f32,
    src_stride: usize,
    dst: *mut f32,
    filter: *const f32,
    bounds: &FilterBounds,
) {
    let mut acc_a = _mm_setzero_ps();
    let mut acc_b = _mm_setzero_ps();
    let mut acc_c = _mm_setzero_ps();
    let mut acc_d = _mm_setzero_ps();

    for tap in 0..bounds.size {
        // One coefficient per source row, broadcast to every lane.
        let coeff = _mm_set1_ps(filter.add(tap).read_unaligned());
        // First element of this tap's strip: row `start_y + tap`, column `start_x`.
        let row = src.add(src_stride * (start_y + tap)).add(start_x);

        acc_a = _mm_prefer_fma_ps(acc_a, _mm_loadu_ps(row), coeff);
        acc_b = _mm_prefer_fma_ps(acc_b, _mm_loadu_ps(row.add(4)), coeff);
        acc_c = _mm_prefer_fma_ps(acc_c, _mm_loadu_ps(row.add(8)), coeff);
        acc_d = _mm_prefer_fma_ps(acc_d, _mm_loadu_ps(row.add(12)), coeff);
    }

    let out = dst.add(start_x);
    _mm_storeu_ps(out, acc_a);
    _mm_storeu_ps(out.add(4), acc_b);
    _mm_storeu_ps(out.add(8), acc_c);
    _mm_storeu_ps(out.add(12), acc_d);
}

/// Vertically convolves a strip of 8 consecutive `f32` values beginning at
/// element `start_x`.
///
/// Two zero-initialised SSE accumulators are updated once per filter tap:
/// for tap `j` in `0..bounds.size` the coefficient `filter[j]` is broadcast,
/// 8 values are loaded from row `start_y + j` of `src` (rows are `src_stride`
/// elements apart) and folded into the accumulators via `_mm_prefer_fma_ps`
/// (defined elsewhere; presumably `acc + value * coeff`). The result is stored
/// to `dst[start_x..start_x + 8]`.
///
/// # Safety
///
/// The caller must guarantee that:
/// - `filter` is valid for reading `bounds.size` values;
/// - each source row `start_y..start_y + bounds.size` is valid for reading
///   8 values starting at element `start_x`;
/// - `dst` is valid for writing 8 values starting at element `start_x`.
///
/// No alignment is required: all loads and stores are unaligned.
#[inline(always)]
pub(crate) unsafe fn convolve_vertical_part_sse_8_f32(
    start_y: usize,
    start_x: usize,
    src: *const f32,
    src_stride: usize,
    dst: *mut f32,
    filter: *const f32,
    bounds: &FilterBounds,
) {
    let mut acc_lo = _mm_setzero_ps();
    let mut acc_hi = _mm_setzero_ps();

    for tap in 0..bounds.size {
        // One coefficient per source row, broadcast to every lane.
        let coeff = _mm_set1_ps(filter.add(tap).read_unaligned());
        // First element of this tap's strip: row `start_y + tap`, column `start_x`.
        let row = src.add(src_stride * (start_y + tap)).add(start_x);

        acc_lo = _mm_prefer_fma_ps(acc_lo, _mm_loadu_ps(row), coeff);
        acc_hi = _mm_prefer_fma_ps(acc_hi, _mm_loadu_ps(row.add(4)), coeff);
    }

    let out = dst.add(start_x);
    _mm_storeu_ps(out, acc_lo);
    _mm_storeu_ps(out.add(4), acc_hi);
}

/// Vertically convolves a strip of 4 consecutive `f32` values beginning at
/// element `start_x`.
///
/// A single zero-initialised SSE accumulator is updated once per filter tap:
/// for tap `j` in `0..bounds.size` the coefficient `filter[j]` is broadcast,
/// 4 values are loaded from row `start_y + j` of `src` (rows are `src_stride`
/// elements apart) and folded into the accumulator via `_mm_prefer_fma_ps`
/// (defined elsewhere; presumably `acc + value * coeff`). The result is stored
/// to `dst[start_x..start_x + 4]`.
///
/// # Safety
///
/// The caller must guarantee that:
/// - `filter` is valid for reading `bounds.size` values;
/// - each source row `start_y..start_y + bounds.size` is valid for reading
///   4 values starting at element `start_x`;
/// - `dst` is valid for writing 4 values starting at element `start_x`.
///
/// No alignment is required: all loads and stores are unaligned.
#[inline(always)]
pub(crate) unsafe fn convolve_vertical_part_sse_4_f32(
    start_y: usize,
    start_x: usize,
    src: *const f32,
    src_stride: usize,
    dst: *mut f32,
    filter: *const f32,
    bounds: &FilterBounds,
) {
    let mut acc = _mm_setzero_ps();

    for tap in 0..bounds.size {
        // One coefficient per source row, broadcast to every lane.
        let coeff = _mm_set1_ps(filter.add(tap).read_unaligned());
        // First element of this tap's strip: row `start_y + tap`, column `start_x`.
        let row = src.add(src_stride * (start_y + tap)).add(start_x);

        acc = _mm_prefer_fma_ps(acc, _mm_loadu_ps(row), coeff);
    }

    _mm_storeu_ps(dst.add(start_x), acc);
}

/// Vertically convolves a single `f32` value at element `start_x`.
///
/// This is the scalar tail kernel. For tap `j` in `0..bounds.size` the
/// coefficient `filter[j]` and the source value at row `start_y + j`, column
/// `start_x` (rows are `src_stride` elements apart) are each broadcast to all
/// lanes and folded into a zero-initialised accumulator via
/// `_mm_prefer_fma_ps` (defined elsewhere; presumably `acc + value * coeff`).
/// Lane 0 of the accumulator is then written to `dst[start_x]`.
///
/// # Safety
///
/// The caller must guarantee that:
/// - `filter` is valid for reading `bounds.size` values;
/// - each source row `start_y..start_y + bounds.size` is valid for reading
///   one value at element `start_x`;
/// - `dst` is valid for writing one value at element `start_x`.
///
/// No alignment is required: all reads and the write are unaligned.
#[inline(always)]
pub(crate) unsafe fn convolve_vertical_part_sse_f32(
    start_y: usize,
    start_x: usize,
    src: *const f32,
    src_stride: usize,
    dst: *mut f32,
    filter: *const f32,
    bounds: &FilterBounds,
) {
    let mut acc = _mm_setzero_ps();

    for tap in 0..bounds.size {
        let coeff = _mm_set1_ps(filter.add(tap).read_unaligned());
        let sample_ptr = src.add(src_stride * (start_y + tap)).add(start_x);
        // Broadcast the sample so the arithmetic is performed exactly as in
        // the vector kernels; only lane 0 is kept at the end.
        let sample = _mm_set1_ps(sample_ptr.read_unaligned());

        acc = _mm_prefer_fma_ps(acc, sample, coeff);
    }

    // `_mm_cvtss_f32` reads lane 0 directly and needs only SSE, unlike
    // `_mm_extract_ps`, which is an SSE4.1 instruction and returns raw bits
    // that must be cast back to `f32`.
    dst.add(start_x).write_unaligned(_mm_cvtss_f32(acc));
}

/// Vertically convolves one output row of interleaved `f32` samples.
///
/// The row holds `CHANNELS * width` values. It is processed left to right in
/// strips of 16, then 8, then 4 values, with any remainder handled one value
/// at a time. Every strip uses the same filter window: `bounds.size` taps
/// starting at source row `bounds.start`, with coefficients read from
/// `weight_ptr`.
///
/// Although this function is not marked `unsafe`, it dereferences the raw
/// pointers it is given. The caller must ensure that:
/// - `weight_ptr` is valid for reading `bounds.size` values;
/// - `unsafe_source_ptr_0` is valid for reading `CHANNELS * width` values from
///   each row `bounds.start..bounds.start + bounds.size`, where rows are
///   `src_stride` elements apart;
/// - `unsafe_destination_ptr_0` is valid for writing `CHANNELS * width` values.
#[inline(always)]
pub(crate) fn convolve_vertical_rgb_sse_row_f32<const CHANNELS: usize>(
    width: usize,
    bounds: &FilterBounds,
    unsafe_source_ptr_0: *const f32,
    unsafe_destination_ptr_0: *mut f32,
    src_stride: usize,
    weight_ptr: *const f32,
) {
    let mut cx = 0usize;
    let dst_width = CHANNELS * width;

    // The guards below use `<=`: a strip that ends exactly at `dst_width` is
    // still fully in bounds, so a remainder of exactly 16, 8 or 4 values takes
    // the matching kernel instead of falling through to the scalar tail.
    while cx + 16 <= dst_width {
        // SAFETY: `cx + 16 <= dst_width`, so the 16-wide strip lies inside the
        // row; pointer validity is the caller's responsibility (see above).
        unsafe {
            convolve_vertical_part_sse_16_f32(
                bounds.start,
                cx,
                unsafe_source_ptr_0,
                src_stride,
                unsafe_destination_ptr_0,
                weight_ptr,
                bounds,
            );
        }

        cx += 16;
    }

    while cx + 8 <= dst_width {
        // SAFETY: `cx + 8 <= dst_width`, so the 8-wide strip lies inside the row.
        unsafe {
            convolve_vertical_part_sse_8_f32(
                bounds.start,
                cx,
                unsafe_source_ptr_0,
                src_stride,
                unsafe_destination_ptr_0,
                weight_ptr,
                bounds,
            );
        }

        cx += 8;
    }

    while cx + 4 <= dst_width {
        // SAFETY: `cx + 4 <= dst_width`, so the 4-wide strip lies inside the row.
        unsafe {
            convolve_vertical_part_sse_4_f32(
                bounds.start,
                cx,
                unsafe_source_ptr_0,
                src_stride,
                unsafe_destination_ptr_0,
                weight_ptr,
                bounds,
            );
        }

        cx += 4;
    }

    while cx < dst_width {
        // SAFETY: `cx < dst_width`, so the single value lies inside the row.
        unsafe {
            convolve_vertical_part_sse_f32(
                bounds.start,
                cx,
                unsafe_source_ptr_0,
                src_stride,
                unsafe_destination_ptr_0,
                weight_ptr,
                bounds,
            );
        }

        cx += 1;
    }
}
Loading

0 comments on commit e9779a6

Please sign in to comment.