Added SSE vertical f32

awxkee · Aug 4, 2024 · 1bffb65 · 1bffb65
1 parent 8e48c78
commit 1bffb65
Show file tree

Hide file tree

Showing 6 changed files with 245 additions and 8 deletions.
diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml
@@ -24,9 +24,12 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions-rust-lang/setup-rust-toolchain@v1
       - run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu
-      - run: cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
-      - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
-      - run: cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+neon" cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
       - run: cargo build --target powerpc-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
       - name: Test release pipeline
         run: cargo publish --dry-run --manifest-path src/lib/Cargo.toml
diff --git a/.github/workflows/publish_release.yml b/.github/workflows/publish_release.yml
@@ -18,10 +18,13 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions-rust-lang/setup-rust-toolchain@v1
       - run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu
-      - run: cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
-      - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
-      - run: cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+neon" cargo build --target aarch64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
       - run: cargo build --target powerpc-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
+      - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu --manifest-path ./src/lib/Cargo.toml
       - name: Make a release
         env:
           CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_TOKEN }}

diff --git a/src/lib/gaussian/gauss_sse/mod.rs b/src/lib/gaussian/gauss_sse/mod.rs
@@ -30,6 +30,7 @@ mod filter_u8;
 mod horiz_four_channel_f32;
 mod horiz_one_channel_f32;
 mod horiz_one_channel_u8;
+mod vertical_f32;
 
 pub use base::gaussian_blur_horizontal_pass_impl_sse;
 pub use base::gaussian_blur_vertical_pass_impl_sse;
@@ -39,3 +40,4 @@ pub use horiz_four_channel_f32::gaussian_horiz_sse_t_f_chan_f32;
 pub use horiz_one_channel_f32::gaussian_horiz_one_chan_f32;
 pub use horiz_one_channel_f32::gaussian_horiz_one_chan_filter_f32;
 pub use horiz_one_channel_u8::*;
+pub use vertical_f32::gaussian_blur_vertical_pass_impl_f32_sse;
diff --git a/src/lib/gaussian/gauss_sse/vertical_f32.rs b/src/lib/gaussian/gauss_sse/vertical_f32.rs
@@ -0,0 +1,218 @@
+// Copyright (c) Radzivon Bartoshyk. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// 1.  Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+//
+// 2.  Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// 3.  Neither the name of the copyright holder nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+use erydanos::_mm_prefer_fma_ps;
+use crate::unsafe_slice::UnsafeSlice;
+
+/// Vertical pass of a separable Gaussian blur over `f32` samples, vectorized with SSE.
+///
+/// For every row `y` in `start_y..end_y` and every sample index `cx` in
+/// `0..width * CHANNEL_CONFIGURATION` this writes
+///
+/// `dst[y * dst_stride + cx] = Σ kernel[r + half] * src[clamp(y + r, 0, height - 1) * src_stride + cx]`
+///
+/// for `r` in `-half..=half`, where `half = kernel_size / 2`. Rows that fall outside the image
+/// are clamped to the first / last row.
+///
+/// A row is consumed in chunks of 24, 16, 8 and 4 samples, followed by a one-sample tail loop.
+/// The chunked loops use a strict `<` bound, so a final chunk that would end exactly at the
+/// row end is handed to the narrower loops instead.
+///
+/// # Arguments
+/// * `undef_src` - source samples; reinterpreted as `&[f32]`.
+/// * `src_stride` - distance between consecutive source rows, in `f32` elements.
+/// * `undef_unsafe_dst` - destination buffer; written through a raw `*mut f32` pointer.
+/// * `dst_stride` - distance between consecutive destination rows, in `f32` elements.
+/// * `width` - `width * CHANNEL_CONFIGURATION` samples are processed per row.
+/// * `height` - number of source rows; used only to clamp the sampled row index.
+/// * `kernel_size` - kernel length; taps `0..=2 * (kernel_size / 2)` of `kernel` are read.
+/// * `kernel` - filter weights.
+/// * `start_y`, `end_y` - half-open range of destination rows produced by this call.
+///
+/// # Preconditions (not checked)
+/// * `T` must be `f32`; both buffers are reinterpreted without any runtime verification.
+/// * `kernel` must hold at least `2 * (kernel_size / 2) + 1` weights (read with `get_unchecked`).
+/// * Both buffers must be large enough for every row / sample offset described above.
+pub fn gaussian_blur_vertical_pass_impl_f32_sse<T, const CHANNEL_CONFIGURATION: usize>(
+    undef_src: &[T],
+    src_stride: u32,
+    undef_unsafe_dst: &UnsafeSlice<T>,
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    kernel_size: usize,
+    kernel: &[f32],
+    start_y: u32,
+    end_y: u32,
+) {
+    // SAFETY: only sound when `T` is `f32`; the caller is responsible for guaranteeing that.
+    let src: &[f32] = unsafe { std::mem::transmute(undef_src) };
+    // SAFETY: the destination is only ever accessed through a raw pointer cast to `*mut f32`
+    // below, so the element type of the reinterpreted wrapper is never relied upon.
+    let unsafe_dst: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undef_unsafe_dst) };
+    let half_kernel = (kernel_size / 2) as i32;
+
+    let zeros = unsafe { _mm_setzero_ps() };
+    // Number of f32 samples in one row (all channels interleaved).
+    let total_length = width as usize * CHANNEL_CONFIGURATION;
+    for y in start_y..end_y {
+        let y_dst_shift = y as usize * dst_stride as usize;
+
+        let mut cx = 0usize;
+
+        // SAFETY: all loads, stores and unchecked kernel reads below rely on the preconditions
+        // listed in the function documentation.
+        unsafe {
+
+            // 24 samples (six SSE vectors) per iteration.
+            while cx + 24 < total_length {
+                let mut store0 = zeros;
+                let mut store1 = zeros;
+                let mut store2 = zeros;
+                let mut store3 = zeros;
+                let mut store4 = zeros;
+                let mut store5 = zeros;
+
+                let mut r = -half_kernel;
+                while r <= half_kernel {
+                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
+                    let f_weight = _mm_set1_ps(weight);
+
+                    // Clamp the sampled row to the image (edge rows are repeated).
+                    let py =
+                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
+                    let y_src_shift = py as usize * src_stride as usize;
+                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
+                    let px_0 = _mm_loadu_ps(s_ptr);
+                    let px_1 = _mm_loadu_ps(s_ptr.add(4));
+                    let px_2 = _mm_loadu_ps(s_ptr.add(8));
+                    let px_3 = _mm_loadu_ps(s_ptr.add(12));
+                    let px_4 = _mm_loadu_ps(s_ptr.add(16));
+                    let px_5 = _mm_loadu_ps(s_ptr.add(20));
+                    // NOTE(review): `_mm_prefer_fma_ps(acc, a, b)` is presumed to compute
+                    // `acc + a * b` (fused when FMA is available) — confirm against erydanos.
+                    store0 = _mm_prefer_fma_ps(store0, px_0, f_weight);
+                    store1 = _mm_prefer_fma_ps(store1, px_1, f_weight);
+                    store2 = _mm_prefer_fma_ps(store2, px_2, f_weight);
+                    // Each accumulator must feed itself; accumulating onto `store2` here
+                    // would corrupt samples 12..16 of the chunk.
+                    store3 = _mm_prefer_fma_ps(store3, px_3, f_weight);
+                    store4 = _mm_prefer_fma_ps(store4, px_4, f_weight);
+                    store5 = _mm_prefer_fma_ps(store5, px_5, f_weight);
+
+                    r += 1;
+                }
+
+                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
+                _mm_storeu_ps(dst_ptr, store0);
+                _mm_storeu_ps(dst_ptr.add(4), store1);
+                _mm_storeu_ps(dst_ptr.add(8), store2);
+                _mm_storeu_ps(dst_ptr.add(12), store3);
+                _mm_storeu_ps(dst_ptr.add(16), store4);
+                _mm_storeu_ps(dst_ptr.add(20), store5);
+
+                cx += 24;
+            }
+
+            // 16 samples (four SSE vectors) per iteration.
+            while cx + 16 < total_length {
+                let mut store0 = zeros;
+                let mut store1 = zeros;
+                let mut store2 = zeros;
+                let mut store3 = zeros;
+
+                let mut r = -half_kernel;
+                while r <= half_kernel {
+                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
+                    let f_weight = _mm_set1_ps(weight);
+
+                    let py =
+                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
+                    let y_src_shift = py as usize * src_stride as usize;
+                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
+                    let px_0 = _mm_loadu_ps(s_ptr);
+                    let px_1 = _mm_loadu_ps(s_ptr.add(4));
+                    let px_2 = _mm_loadu_ps(s_ptr.add(8));
+                    let px_3 = _mm_loadu_ps(s_ptr.add(12));
+                    store0 = _mm_prefer_fma_ps(store0, px_0, f_weight);
+                    store1 = _mm_prefer_fma_ps(store1, px_1, f_weight);
+                    store2 = _mm_prefer_fma_ps(store2, px_2, f_weight);
+                    // Accumulate onto `store3` itself (not `store2`), as for every other lane.
+                    store3 = _mm_prefer_fma_ps(store3, px_3, f_weight);
+
+                    r += 1;
+                }
+
+                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
+                _mm_storeu_ps(dst_ptr, store0);
+                _mm_storeu_ps(dst_ptr.add(4), store1);
+                _mm_storeu_ps(dst_ptr.add(8), store2);
+                _mm_storeu_ps(dst_ptr.add(12), store3);
+
+                cx += 16;
+            }
+
+            // 8 samples (two SSE vectors) per iteration.
+            while cx + 8 < total_length {
+                let mut store0 = zeros;
+                let mut store1 = zeros;
+
+                let mut r = -half_kernel;
+                while r <= half_kernel {
+                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
+                    let f_weight = _mm_set1_ps(weight);
+
+                    let py =
+                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
+                    let y_src_shift = py as usize * src_stride as usize;
+                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
+                    let px_0 = _mm_loadu_ps(s_ptr);
+                    let px_1 = _mm_loadu_ps(s_ptr.add(4));
+                    store0 = _mm_prefer_fma_ps(store0, px_0, f_weight);
+                    store1 = _mm_prefer_fma_ps(store1, px_1, f_weight);
+                    r += 1;
+                }
+
+                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
+                _mm_storeu_ps(dst_ptr, store0);
+                _mm_storeu_ps(dst_ptr.add(4), store1);
+
+                cx += 8;
+            }
+
+            // 4 samples (one SSE vector) per iteration.
+            while cx + 4 < total_length {
+                let mut store0 = zeros;
+
+                let mut r = -half_kernel;
+                while r <= half_kernel {
+                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
+                    let f_weight = _mm_set1_ps(weight);
+
+                    let py =
+                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
+                    let y_src_shift = py as usize * src_stride as usize;
+                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
+                    let lo_lo = _mm_loadu_ps(s_ptr);
+                    store0 = _mm_prefer_fma_ps(store0, lo_lo, f_weight);
+
+                    r += 1;
+                }
+
+                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
+                _mm_storeu_ps(dst_ptr, store0);
+
+                cx += 4;
+            }
+
+            // Tail: one sample at a time, carried in lane 0 of an otherwise zero vector.
+            while cx < total_length {
+                let mut store0 = zeros;
+
+                let mut r = -half_kernel;
+                while r <= half_kernel {
+                    let weight = *kernel.get_unchecked((r + half_kernel) as usize);
+                    let f_weight = _mm_set1_ps(weight);
+
+                    let py =
+                        std::cmp::min(std::cmp::max(y as i64 + r as i64, 0), (height - 1) as i64);
+                    let y_src_shift = py as usize * src_stride as usize;
+                    let s_ptr = src.as_ptr().add(y_src_shift + cx);
+                    let f_pixel = _mm_setr_ps(s_ptr.read_unaligned(), 0., 0., 0.);
+                    store0 = _mm_prefer_fma_ps(store0, f_pixel, f_weight);
+
+                    r += 1;
+                }
+
+                let dst_ptr = (unsafe_dst.slice.as_ptr() as *mut f32).add(y_dst_shift + cx);
+
+                // `_mm_extract_ps` yields the raw bits of lane 0 as an `i32`; storing them
+                // through an `i32` pointer writes the same four bytes as the `f32` value.
+                let pixel = _mm_extract_ps::<0>(store0);
+                (dst_ptr as * mut i32).write_unaligned(pixel);
+
+                cx += 1;
+            }
+        }
+    }
+}
diff --git a/src/lib/gaussian/gaussian.rs b/src/lib/gaussian/gaussian.rs
@@ -58,7 +58,7 @@ use crate::gaussian::gauss_sse::gaussian_sse_horiz_one_chan_u8;
     target_feature = "sse4.1"
 ))]
 use crate::gaussian::gauss_sse::{
-    gaussian_blur_horizontal_pass_impl_sse, gaussian_blur_vertical_pass_impl_sse,
+    gaussian_blur_horizontal_pass_impl_sse, gaussian_blur_vertical_pass_impl_sse, gaussian_blur_vertical_pass_impl_f32_sse,
 };
 use crate::gaussian::gaussian_filter::create_filter;
 use crate::gaussian::gaussian_horizontal::gaussian_blur_horizontal_pass_impl;
@@ -272,6 +272,16 @@ fn gaussian_blur_vertical_pass<
             _dispatcher = gaussian_blur_vertical_pass_neon::<T, CHANNEL_CONFIGURATION>;
         }
     }
+    if std::any::type_name::<T>() == "f32" && edge_mode == EdgeMode::Clamp {
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
+        {
+            // The vertical pass does not depend on the channel configuration, so the vectorized implementation can be used for any channel count.
+            _dispatcher = gaussian_blur_vertical_pass_impl_f32_sse::<T, CHANNEL_CONFIGURATION>;
+        }
+    }
     let unsafe_dst = UnsafeSlice::new(dst);
     thread_pool.scope(|scope| {
         let segment_size = height / thread_count;

diff --git a/src/main.rs b/src/main.rs
@@ -200,7 +200,7 @@ fn main() {
     //     EdgeMode::Clamp,
     // );
 
-    libblur::gaussian_blur(
+    libblur::gaussian_blur_in_linear(
         &bytes,
         stride as u32,
         &mut dst_bytes,
@@ -212,6 +212,7 @@ fn main() {
         FastBlurChannels::Channels4,
         EdgeMode::Clamp,
         ThreadingPolicy::Single,
+        TransferFunction::Srgb,
     );
 
     // dst_bytes = perform_planar_pass_3(&bytes, dimensions.0 as usize, dimensions.1 as usize);