Added kernelclip AVX, SSE

awxkee committed Aug 4, 2024
1 parent 7bd4541 commit 823a308
Showing 12 changed files with 530 additions and 26 deletions.
6 changes: 3 additions & 3 deletions benches/gauss_bench/main.rs
@@ -1,9 +1,9 @@
-use criterion::{Criterion, criterion_group, criterion_main};
-use image::GenericImageView;
+use criterion::{criterion_group, criterion_main, Criterion};
 use image::io::Reader as ImageReader;
+use image::GenericImageView;
 use libblur::{EdgeMode, FastBlurChannels, ThreadingPolicy};
 use opencv::core::{
-    BORDER_DEFAULT, find_file, Mat, MatTraitConst, MatTraitConstManual, mean, Size, split, Vector,
+    find_file, mean, split, Mat, MatTraitConst, MatTraitConstManual, Size, Vector, BORDER_DEFAULT,
 };
 use opencv::imgcodecs::{imread, IMREAD_COLOR};

247 changes: 247 additions & 0 deletions src/lib/gaussian/avx/filter_vertical_f32.rs
@@ -0,0 +1,247 @@
// Copyright (c) Radzivon Bartoshyk. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use crate::gaussian::gaussian_filter::GaussianFilter;
use crate::unsafe_slice::UnsafeSlice;
use erydanos::{_mm256_prefer_fma_ps, _mm_prefer_fma_ps};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

pub fn gaussian_blur_vertical_pass_filter_f32_avx<T, const CHANNEL_CONFIGURATION: usize>(
    undef_src: &[T],
    src_stride: u32,
    undef_unsafe_dst: &UnsafeSlice<T>,
    dst_stride: u32,
    width: u32,
    _: u32,
    filter: &Vec<GaussianFilter>,
    start_y: u32,
    end_y: u32,
) {
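    // Note: the generic `T` is reinterpreted as f32 below, so despite the generic
    // signature this path expects f32 pixel data.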
    let src: &[f32] = unsafe { std::mem::transmute(undef_src) };
    let unsafe_dst: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undef_unsafe_dst) };

    let zeros = unsafe { _mm256_setzero_ps() };

    let total_length = width as usize * CHANNEL_CONFIGURATION;
    for y in start_y..end_y {
        let y_dst_shift = y as usize * dst_stride as usize;

        let mut cx = 0usize;

        let current_filter = unsafe { filter.get_unchecked(y as usize) };
        let filter_start = current_filter.start;
        let filter_weights = &current_filter.filter;

        unsafe {
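            // Widest path: eight AVX accumulators cover 64 f32 values per step of `cx`.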
            while cx + 64 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;
                let mut store2 = zeros;
                let mut store3 = zeros;
                let mut store4 = zeros;
                let mut store5 = zeros;
                let mut store6 = zeros;
                let mut store7 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm256_loadu_ps(s_ptr);
                    let px_1 = _mm256_loadu_ps(s_ptr.add(8));
                    let px_2 = _mm256_loadu_ps(s_ptr.add(16));
                    let px_3 = _mm256_loadu_ps(s_ptr.add(24));
                    let px_4 = _mm256_loadu_ps(s_ptr.add(32));
                    let px_5 = _mm256_loadu_ps(s_ptr.add(40));
                    let px_6 = _mm256_loadu_ps(s_ptr.add(48));
                    let px_7 = _mm256_loadu_ps(s_ptr.add(56));
                    store0 = _mm256_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm256_prefer_fma_ps(store1, px_1, f_weight);
                    store2 = _mm256_prefer_fma_ps(store2, px_2, f_weight);
                    store3 = _mm256_prefer_fma_ps(store3, px_3, f_weight);
                    store4 = _mm256_prefer_fma_ps(store4, px_4, f_weight);
                    store5 = _mm256_prefer_fma_ps(store5, px_5, f_weight);
                    store6 = _mm256_prefer_fma_ps(store6, px_6, f_weight);
                    store7 = _mm256_prefer_fma_ps(store7, px_7, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);
                _mm256_storeu_ps(dst_ptr.add(8), store1);
                _mm256_storeu_ps(dst_ptr.add(16), store2);
                _mm256_storeu_ps(dst_ptr.add(24), store3);
                _mm256_storeu_ps(dst_ptr.add(32), store4);
                _mm256_storeu_ps(dst_ptr.add(40), store5);
                _mm256_storeu_ps(dst_ptr.add(48), store6);
                _mm256_storeu_ps(dst_ptr.add(56), store7);

                cx += 64;
            }

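            // Remainder paths: the same accumulation at halved widths (32, 16, 8,
            // then 4 lanes) until fewer than four values remain.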
            while cx + 32 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;
                let mut store2 = zeros;
                let mut store3 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm256_loadu_ps(s_ptr);
                    let px_1 = _mm256_loadu_ps(s_ptr.add(8));
                    let px_2 = _mm256_loadu_ps(s_ptr.add(16));
                    let px_3 = _mm256_loadu_ps(s_ptr.add(24));
                    store0 = _mm256_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm256_prefer_fma_ps(store1, px_1, f_weight);
                    store2 = _mm256_prefer_fma_ps(store2, px_2, f_weight);
                    store3 = _mm256_prefer_fma_ps(store3, px_3, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);
                _mm256_storeu_ps(dst_ptr.add(8), store1);
                _mm256_storeu_ps(dst_ptr.add(16), store2);
                _mm256_storeu_ps(dst_ptr.add(24), store3);

                cx += 32;
            }

            while cx + 16 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm256_loadu_ps(s_ptr);
                    let px_1 = _mm256_loadu_ps(s_ptr.add(8));
                    store0 = _mm256_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm256_prefer_fma_ps(store1, px_1, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);
                _mm256_storeu_ps(dst_ptr.add(8), store1);

                cx += 16;
            }

            while cx + 8 < total_length {
                let mut store0 = zeros;

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm256_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px = _mm256_loadu_ps(s_ptr);
                    store0 = _mm256_prefer_fma_ps(store0, px, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm256_storeu_ps(dst_ptr, store0);

                cx += 8;
            }

            while cx + 4 < total_length {
                let mut store0 = _mm_setzero_ps();

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let lo_lo = _mm_loadu_ps(s_ptr);
                    store0 = _mm_prefer_fma_ps(store0, lo_lo, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm_storeu_ps(dst_ptr, store0);

                cx += 4;
            }

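            // Scalar tail: one f32 at a time through the low SSE lane.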
            while cx < total_length {
                let mut store0 = _mm_setzero_ps();

                let mut j = 0usize;
                while j < current_filter.size {
                    let weight = *filter_weights.get_unchecked(j);
                    let f_weight = _mm_set1_ps(weight);

                    let py = filter_start + j;
                    let y_src_shift = py * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let f_pixel = _mm_setr_ps(s_ptr.read_unaligned(), 0., 0., 0.);
                    store0 = _mm_prefer_fma_ps(store0, f_pixel, f_weight);

                    j += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                let pixel = _mm_extract_ps::<0>(store0);
                (dst_ptr as *mut i32).write_unaligned(pixel);

                cx += 1;
            }
        }
    }
}
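For orientation, every SIMD path in the new file computes the same per-row vertical convolution. The following scalar sketch (not part of the commit) shows that computation; the `GaussianFilter` fields (`start`, `size`, `filter`) are assumed from how the AVX code reads them.

// Scalar reference for the vertical filter pass (illustrative sketch only).
struct GaussianFilter {
    start: usize,     // first source row covered by this row's edge-clipped kernel
    size: usize,      // number of kernel taps after clipping
    filter: Vec<f32>, // kernel weights
}

fn vertical_pass_scalar(
    src: &[f32],
    src_stride: usize,
    dst: &mut [f32],
    dst_stride: usize,
    total_length: usize, // width * channel count
    filters: &[GaussianFilter],
    start_y: usize,
    end_y: usize,
) {
    for y in start_y..end_y {
        let current = &filters[y];
        for cx in 0..total_length {
            let mut acc = 0f32;
            for j in 0..current.size {
                let py = current.start + j;
                acc += src[py * src_stride + cx] * current.filter[j];
            }
            dst[y * dst_stride + cx] = acc;
        }
    }
}

The AVX version unrolls this inner work across up to eight 256-bit registers and accumulates with `_mm256_prefer_fma_ps` from erydanos, which, as its name suggests, prefers a fused multiply-add when one is available.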
6 changes: 4 additions & 2 deletions src/lib/gaussian/avx/mod.rs
@@ -1,6 +1,8 @@
-mod vertical;
 mod utils;
+mod vertical;
 mod vertical_f32;
+mod filter_vertical_f32;
 
 pub use vertical::gaussian_blur_vertical_pass_impl_avx;
-pub use vertical_f32::gaussian_blur_vertical_pass_impl_f32_avx;
\ No newline at end of file
+pub use vertical_f32::gaussian_blur_vertical_pass_impl_f32_avx;
+pub use filter_vertical_f32::gaussian_blur_vertical_pass_filter_f32_avx;
13 changes: 11 additions & 2 deletions src/lib/gaussian/avx/utils.rs
@@ -64,6 +64,15 @@ pub(crate) unsafe fn load_u8_f32_fast<const CHANNELS_COUNT: usize>(ptr: *const u
             4 => u32::from_le_bytes([ptr.add(3).read_unaligned(), 0, 0, 0]),
             _ => 0,
         };
-        let v_int = _mm256_setr_epi32(u_first as i32, u_second as i32, u_third as i32, u_fourth as i32, 0,0,0,0);
+        let v_int = _mm256_setr_epi32(
+            u_first as i32,
+            u_second as i32,
+            u_third as i32,
+            u_fourth as i32,
+            0,
+            0,
+            0,
+            0,
+        );
         return _mm256_cvtepi32_ps(v_int);
     }
 }
4 changes: 3 additions & 1 deletion src/lib/gaussian/avx/vertical.rs
@@ -32,7 +32,9 @@ use std::arch::x86_64::*;

 use erydanos::_mm256_prefer_fma_ps;
 
-use crate::gaussian::avx::utils::{avx2_pack_u16, avx2_pack_u32, load_u8_f32_fast, load_u8_u32_one};
+use crate::gaussian::avx::utils::{
+    avx2_pack_u16, avx2_pack_u32, load_u8_f32_fast, load_u8_u32_one,
+};
 use crate::unsafe_slice::UnsafeSlice;
 
 pub fn gaussian_blur_vertical_pass_impl_avx<T, const CHANNEL_CONFIGURATION: usize>(
4 changes: 2 additions & 2 deletions src/lib/gaussian/avx/vertical_f32.rs
@@ -25,12 +25,12 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+use crate::unsafe_slice::UnsafeSlice;
+use erydanos::{_mm256_prefer_fma_ps, _mm_prefer_fma_ps};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
-use erydanos::{_mm256_prefer_fma_ps, _mm_prefer_fma_ps};
-use crate::unsafe_slice::UnsafeSlice;
 
 pub fn gaussian_blur_vertical_pass_impl_f32_avx<T, const CHANNEL_CONFIGURATION: usize>(
     undef_src: &[T],