Skip to content

Commit

Permalink
Added SSE vertical f32
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Aug 4, 2024
1 parent 8e48c78 commit 1bffb65
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 8 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/build_push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rust-lang/setup-rust-toolchain@v1
- run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu
- run: cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+neon" cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: cargo build --target powerpc-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- name: Test release pipeline
run: cargo publish --dry-run --manifest-path src/lib/Cargo.toml
9 changes: 6 additions & 3 deletions .github/workflows/publish_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,13 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rust-lang/setup-rust-toolchain@v1
- run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu
- run: cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+neon" cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: cargo build --target powerpc-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
- name: Make a release
env:
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_TOKEN }}
Expand Down
2 changes: 2 additions & 0 deletions src/lib/gaussian/gauss_sse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ mod filter_u8;
mod horiz_four_channel_f32;
mod horiz_one_channel_f32;
mod horiz_one_channel_u8;
mod vertical_f32;

pub use base::gaussian_blur_horizontal_pass_impl_sse;
pub use base::gaussian_blur_vertical_pass_impl_sse;
Expand All @@ -39,3 +40,4 @@ pub use horiz_four_channel_f32::gaussian_horiz_sse_t_f_chan_f32;
pub use horiz_one_channel_f32::gaussian_horiz_one_chan_f32;
pub use horiz_one_channel_f32::gaussian_horiz_one_chan_filter_f32;
pub use horiz_one_channel_u8::*;
pub use vertical_f32::gaussian_blur_vertical_pass_impl_f32_sse;
218 changes: 218 additions & 0 deletions src/lib/gaussian/gauss_sse/vertical_f32.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
// Copyright (c) Radzivon Bartoshyk. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use erydanos::_mm_prefer_fma_ps;
use crate::unsafe_slice::UnsafeSlice;

/// Vertical pass of a separable Gaussian blur over `f32` samples, vectorized with SSE.
///
/// For every output row `y` in `start_y..end_y` and every sample column `cx` in
/// `0..width * CHANNEL_CONFIGURATION`, the weighted sum of the source samples in the
/// same column over rows `y - kernel_size / 2 ..= y + kernel_size / 2` is written to the
/// destination. Source row indices are clamped to `0..=height - 1`, i.e. edge rows are
/// repeated. Columns are processed in blocks of 24, 16, 8 and 4 floats, with a
/// one-float-at-a-time tail.
///
/// # Parameters
/// * `undef_src` - source samples; reinterpreted as `&[f32]`, so `T` must be `f32`.
/// * `src_stride` - distance between consecutive source rows, applied as an `f32`
///   element offset.
/// * `undef_unsafe_dst` - destination; its backing storage is written through a
///   `*mut f32` pointer.
/// * `dst_stride` - distance between consecutive destination rows, applied as an `f32`
///   element offset.
/// * `width`, `height` - image dimensions in pixels.
/// * `kernel_size` - number of taps; `2 * (kernel_size / 2) + 1` weights are read.
/// * `kernel` - tap weights; must hold at least `2 * (kernel_size / 2) + 1` entries
///   because it is indexed without bounds checks.
/// * `start_y`, `end_y` - half-open range of destination rows handled by this call.
///
/// # Safety contract (not enforced by the signature)
/// The function is safe to call only when `T` is `f32`, `kernel` is long enough, every
/// source row addressed as `row * src_stride` holds at least
/// `width * CHANNEL_CONFIGURATION` floats, and every destination row in
/// `start_y..end_y` can receive that many floats. All accesses are unchecked.
pub fn gaussian_blur_vertical_pass_impl_f32_sse<T, const CHANNEL_CONFIGURATION: usize>(
    undef_src: &[T],
    src_stride: u32,
    undef_unsafe_dst: &UnsafeSlice<T>,
    dst_stride: u32,
    width: u32,
    height: u32,
    kernel_size: usize,
    kernel: &[f32],
    start_y: u32,
    end_y: u32,
) {
    // SAFETY: relies on the caller instantiating this function with `T = f32` only;
    // the slices are reinterpreted without any conversion.
    let src: &[f32] = unsafe { std::mem::transmute(undef_src) };
    let unsafe_dst: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undef_unsafe_dst) };
    let half_kernel = (kernel_size / 2) as i32;

    let zeros = unsafe { _mm_setzero_ps() };
    // Number of f32 samples per row; the vertical pass treats channels as plain columns.
    let total_length = width as usize * CHANNEL_CONFIGURATION;
    for y in start_y..end_y {
        let y_dst_shift = y as usize * dst_stride as usize;

        let mut cx = 0usize;

        // SAFETY: see the safety contract in the function documentation; every load and
        // store below stays within `cx..total_length` of the addressed row.
        unsafe {
            // 24 floats (six SSE registers) per iteration.
            while cx + 24 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;
                let mut store2 = zeros;
                let mut store3 = zeros;
                let mut store4 = zeros;
                let mut store5 = zeros;

                let mut r = -half_kernel;
                while r <= half_kernel {
                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
                    let f_weight = _mm_set1_ps(weight);

                    // Clamp the sampled row into the image (edge rows are repeated).
                    let py =
                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
                    let y_src_shift = py as usize * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm_loadu_ps(s_ptr);
                    let px_1 = _mm_loadu_ps(s_ptr.add(4));
                    let px_2 = _mm_loadu_ps(s_ptr.add(8));
                    let px_3 = _mm_loadu_ps(s_ptr.add(12));
                    let px_4 = _mm_loadu_ps(s_ptr.add(16));
                    let px_5 = _mm_loadu_ps(s_ptr.add(20));
                    store0 = _mm_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm_prefer_fma_ps(store1, px_1, f_weight);
                    store2 = _mm_prefer_fma_ps(store2, px_2, f_weight);
                    // Each accumulator must feed itself: accumulating into `store3` from
                    // `store2` would corrupt floats 12..15 of the block.
                    store3 = _mm_prefer_fma_ps(store3, px_3, f_weight);
                    store4 = _mm_prefer_fma_ps(store4, px_4, f_weight);
                    store5 = _mm_prefer_fma_ps(store5, px_5, f_weight);

                    r += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm_storeu_ps(dst_ptr, store0);
                _mm_storeu_ps(dst_ptr.add(4), store1);
                _mm_storeu_ps(dst_ptr.add(8), store2);
                _mm_storeu_ps(dst_ptr.add(12), store3);
                _mm_storeu_ps(dst_ptr.add(16), store4);
                _mm_storeu_ps(dst_ptr.add(20), store5);

                cx += 24;
            }

            // 16 floats (four SSE registers) per iteration.
            while cx + 16 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;
                let mut store2 = zeros;
                let mut store3 = zeros;

                let mut r = -half_kernel;
                while r <= half_kernel {
                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
                    let f_weight = _mm_set1_ps(weight);

                    let py =
                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
                    let y_src_shift = py as usize * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm_loadu_ps(s_ptr);
                    let px_1 = _mm_loadu_ps(s_ptr.add(4));
                    let px_2 = _mm_loadu_ps(s_ptr.add(8));
                    let px_3 = _mm_loadu_ps(s_ptr.add(12));
                    store0 = _mm_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm_prefer_fma_ps(store1, px_1, f_weight);
                    store2 = _mm_prefer_fma_ps(store2, px_2, f_weight);
                    // Same rule as in the 24-wide loop: accumulate `store3` into itself.
                    store3 = _mm_prefer_fma_ps(store3, px_3, f_weight);

                    r += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm_storeu_ps(dst_ptr, store0);
                _mm_storeu_ps(dst_ptr.add(4), store1);
                _mm_storeu_ps(dst_ptr.add(8), store2);
                _mm_storeu_ps(dst_ptr.add(12), store3);

                cx += 16;
            }

            // 8 floats (two SSE registers) per iteration.
            while cx + 8 < total_length {
                let mut store0 = zeros;
                let mut store1 = zeros;

                let mut r = -half_kernel;
                while r <= half_kernel {
                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
                    let f_weight = _mm_set1_ps(weight);

                    let py =
                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
                    let y_src_shift = py as usize * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let px_0 = _mm_loadu_ps(s_ptr);
                    let px_1 = _mm_loadu_ps(s_ptr.add(4));
                    store0 = _mm_prefer_fma_ps(store0, px_0, f_weight);
                    store1 = _mm_prefer_fma_ps(store1, px_1, f_weight);
                    r += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm_storeu_ps(dst_ptr, store0);
                _mm_storeu_ps(dst_ptr.add(4), store1);

                cx += 8;
            }

            // 4 floats (one SSE register) per iteration.
            while cx + 4 < total_length {
                let mut store0 = zeros;

                let mut r = -half_kernel;
                while r <= half_kernel {
                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
                    let f_weight = _mm_set1_ps(weight);

                    let py =
                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
                    let y_src_shift = py as usize * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let lo_lo = _mm_loadu_ps(s_ptr);
                    store0 = _mm_prefer_fma_ps(store0, lo_lo, f_weight);

                    r += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
                _mm_storeu_ps(dst_ptr, store0);

                cx += 4;
            }

            // Tail: one float at a time, carried in lane 0 of an SSE register.
            while cx < total_length {
                let mut store0 = zeros;

                let mut r = -half_kernel;
                while r <= half_kernel {
                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
                    let f_weight = _mm_set1_ps(weight);

                    let py =
                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
                    let y_src_shift = py as usize * src_stride as usize;
                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
                    let f_pixel = _mm_setr_ps(s_ptr.read_unaligned(), 0., 0., 0.);
                    store0 = _mm_prefer_fma_ps(store0, f_pixel, f_weight);

                    r += 1;
                }

                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);

                // `_mm_extract_ps` yields the raw bit pattern of lane 0 as `i32`, so the
                // value is stored through an `i32` pointer to keep the float bits intact.
                let pixel = _mm_extract_ps::<0>(store0);
                (dst_ptr as *mut i32).write_unaligned(pixel);

                cx += 1;
            }
        }
    }
}
12 changes: 11 additions & 1 deletion src/lib/gaussian/gaussian.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ use crate::gaussian::gauss_sse::gaussian_sse_horiz_one_chan_u8;
target_feature = "sse4.1"
))]
use crate::gaussian::gauss_sse::{
gaussian_blur_horizontal_pass_impl_sse, gaussian_blur_vertical_pass_impl_sse,
gaussian_blur_horizontal_pass_impl_sse, gaussian_blur_vertical_pass_impl_sse, gaussian_blur_vertical_pass_impl_f32_sse,
};
use crate::gaussian::gaussian_filter::create_filter;
use crate::gaussian::gaussian_horizontal::gaussian_blur_horizontal_pass_impl;
Expand Down Expand Up @@ -272,6 +272,16 @@ fn gaussian_blur_vertical_pass<
_dispatcher = gaussian_blur_vertical_pass_neon::<T, CHANNEL_CONFIGURATION>;
}
}
if std::any::type_name::<T>() == "f32" && edge_mode == EdgeMode::Clamp {
#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "sse4.1"
))]
{
// Generally, the vertical pass does not depend on any specific channel configuration, so it is allowed to make vectorized calls for any channel count
_dispatcher = gaussian_blur_vertical_pass_impl_f32_sse::<T, CHANNEL_CONFIGURATION>;
}
}
let unsafe_dst = UnsafeSlice::new(dst);
thread_pool.scope(|scope| {
let segment_size = height / thread_count;
Expand Down
3 changes: 2 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ fn main() {
// EdgeMode::Clamp,
// );

libblur::gaussian_blur(
libblur::gaussian_blur_in_linear(
&bytes,
stride as u32,
&mut dst_bytes,
Expand All @@ -212,6 +212,7 @@ fn main() {
FastBlurChannels::Channels4,
EdgeMode::Clamp,
ThreadingPolicy::Single,
TransferFunction::Srgb,
);

// dst_bytes = perform_planar_pass_3(&bytes, dimensions.0 as usize, dimensions.1 as usize);
Expand Down

0 comments on commit 1bffb65

Please sign in to comment.