Skip to content

Commit

Permalink
Fast gaussian next f16, some improvements
Browse files (browse the repository at this point in the history)
  • Loading branch information
awxkee committed Aug 4, 2024
1 parent 2e35cf1 commit 6fd9b5a
Show file tree
Hide file tree
Showing 14 changed files with 497 additions and 338 deletions.
106 changes: 60 additions & 46 deletions src/lib/fast_gaussian_next.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::neon::{
fast_gaussian_next_horizontal_pass_neon_f32, fast_gaussian_next_horizontal_pass_neon_u8,
fast_gaussian_next_horizontal_pass_neon_f16, fast_gaussian_next_horizontal_pass_neon_f32,
fast_gaussian_next_horizontal_pass_neon_u8, fast_gaussian_next_vertical_pass_neon_f16,
fast_gaussian_next_vertical_pass_neon_f32, fast_gaussian_next_vertical_pass_neon_u8,
};
use crate::reflect_index;
Expand Down Expand Up @@ -597,59 +598,72 @@ fn fast_gaussian_next_impl<
EDGE_MODE,
>;
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
_dispatcher_vertical = fast_gaussian_next_vertical_pass_neon_f16::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_neon_f16::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
}
}
}

if CHANNEL_CONFIGURATION >= 3 {
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
if std::any::type_name::<T>() == "u8" {
_dispatcher_vertical = fast_gaussian_next_vertical_pass_neon_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_neon_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
} else if std::any::type_name::<T>() == "f32" {
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_neon_f32::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_vertical = fast_gaussian_next_vertical_pass_neon_f32::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
if std::any::type_name::<T>() == "u8" {
_dispatcher_vertical = fast_gaussian_next_vertical_pass_neon_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_neon_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
} else if std::any::type_name::<T>() == "f32" {
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_neon_f32::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_vertical = fast_gaussian_next_vertical_pass_neon_f32::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
}
}
#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "sse4.1"
))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
if std::any::type_name::<T>() == "u8" {
_dispatcher_vertical = fast_gaussian_next_vertical_pass_sse_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_sse_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
}
}

#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "sse4.1"
))]
{
if BASE_RADIUS_I64_CUTOFF > radius {
if std::any::type_name::<T>() == "u8" {
_dispatcher_vertical = fast_gaussian_next_vertical_pass_sse_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
_dispatcher_horizontal = fast_gaussian_next_horizontal_pass_sse_u8::<
T,
CHANNEL_CONFIGURATION,
EDGE_MODE,
>;
}
}
}

let thread_count = threading_policy.get_threads_count(width, height) as u32;
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(thread_count as usize)
Expand Down
42 changes: 8 additions & 34 deletions src/lib/neon/fast_gaussian.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use crate::neon::{load_u8_s32_fast, vmulq_s32_f32};
use crate::neon::{load_u8_s32_fast, store_u8x8_m4, vmulq_s32_f32};
use crate::{clamp_edge, reflect_101, reflect_index, EdgeMode};
use std::arch::aarch64::*;

Expand Down Expand Up @@ -68,22 +68,10 @@ pub fn fast_gaussian_horizontal_pass_neon_u8<
let prepared_u16 = unsafe { vqmovn_u32(prepared_px_s32) };
let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };

let casted_u32 = unsafe { vreinterpret_u32_u8(prepared_u8) };
let pixel = unsafe { vget_lane_u32::<0>(casted_u32) };
let offset = current_y + current_px;
if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(offset) as *mut u32;
dst_ptr.write_unaligned(pixel);
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(offset, bits[0]);
bytes.write(offset + 1, bits[1]);
bytes.write(offset + 2, bits[2]);
}
let bytes_offset = current_y + current_px;
unsafe {
let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
}

let arr_index = ((x - radius_64) & 1023) as usize;
Expand Down Expand Up @@ -162,25 +150,11 @@ pub(crate) fn fast_gaussian_vertical_pass_neon_u8<
let prepared_u16 = unsafe { vqmovn_u32(prepared_px_s32) };
let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };

let casted_u32 = unsafe { vreinterpret_u32_u8(prepared_u8) };
let pixel = unsafe { vget_lane_u32::<0>(casted_u32) };

let bytes_offset = current_y + current_px;

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr =
(bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut u32;
dst_ptr.write_unaligned(pixel);
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(bytes_offset, bits[0]);
bytes.write(bytes_offset + 1, bits[1]);
bytes.write(bytes_offset + 2, bits[2]);
}
unsafe {
let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
}

let arr_index = ((y - radius_64) & 1023) as usize;
Expand Down
42 changes: 7 additions & 35 deletions src/lib/neon/fast_gaussian_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

use std::arch::aarch64::*;

use crate::neon::load_f32_fast;
use crate::neon::{load_f32_fast, store_f32};
use crate::unsafe_slice::UnsafeSlice;
use crate::{clamp_edge, reflect_101, reflect_index, EdgeMode};

Expand Down Expand Up @@ -66,23 +66,9 @@ pub fn fast_gaussian_vertical_pass_neon_f32<

let prepared_px = unsafe { vmulq_f32(summs, f_weight) };

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
vst1q_f32(dst_ptr, prepared_px)
}
} else {
let new_r = unsafe { vgetq_lane_f32::<0>(prepared_px) };
let new_g = unsafe { vgetq_lane_f32::<1>(prepared_px) };
let new_b = unsafe { vgetq_lane_f32::<2>(prepared_px) };

let offset = current_y + current_px;

unsafe {
bytes.write(offset, new_r);
bytes.write(offset + 1, new_g);
bytes.write(offset + 2, new_b);
}
unsafe {
let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
store_f32::<CHANNELS_COUNT>(dst_ptr, prepared_px);
}

let arr_index = ((y - radius_64) & 1023) as usize;
Expand Down Expand Up @@ -156,23 +142,9 @@ pub fn fast_gaussian_horizontal_pass_neon_f32<

let prepared_px = unsafe { vmulq_f32(summs, f_weight) };

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
vst1q_f32(dst_ptr, prepared_px)
}
} else {
let new_r = unsafe { vgetq_lane_f32::<0>(prepared_px) };
let new_g = unsafe { vgetq_lane_f32::<1>(prepared_px) };
let new_b = unsafe { vgetq_lane_f32::<2>(prepared_px) };

let offset = current_y + current_px;

unsafe {
bytes.write(offset, new_r);
bytes.write(offset + 1, new_g);
bytes.write(offset + 2, new_b);
}
unsafe {
let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
store_f32::<CHANNELS_COUNT>(dst_ptr, prepared_px);
}

let arr_index = ((x - radius_64) & 1023) as usize;
Expand Down
41 changes: 7 additions & 34 deletions src/lib/neon/fast_gaussian_next.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use crate::neon::load_u8_s32_fast;
use crate::neon::{load_u8_s32_fast, store_u8x8_m4};
use crate::reflect_index;
use crate::{clamp_edge, reflect_101, EdgeMode};
use std::arch::aarch64::*;
Expand Down Expand Up @@ -71,25 +71,11 @@ pub fn fast_gaussian_next_vertical_pass_neon_u8<
let prepared_u16 = unsafe { vqmovun_s32(prepared_px_s32) };
let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };

let casted_u32 = unsafe { vreinterpret_u32_u8(prepared_u8) };
let pixel = unsafe { vget_lane_u32::<0>(casted_u32) };

let bytes_offset = current_y + current_px;

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr =
(bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut u32;
dst_ptr.write_unaligned(pixel);
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(bytes_offset, bits[0]);
bytes.write(bytes_offset + 1, bits[1]);
bytes.write(bytes_offset + 2, bits[2]);
}
unsafe {
let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
}

let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
Expand Down Expand Up @@ -185,25 +171,12 @@ pub(crate) fn fast_gaussian_next_horizontal_pass_neon_u8<
unsafe { vcvtaq_s32_f32(vmulq_f32(vcvtq_f32_s32(summs), f_weight)) };
let prepared_u16 = unsafe { vqmovun_s32(prepared_px_s32) };
let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };
let casted_u32 = unsafe { vreinterpret_u32_u8(prepared_u8) };
let pixel = unsafe { vget_lane_u32::<0>(casted_u32) };

let bytes_offset = current_y + current_px;

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr =
(bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut u32;
dst_ptr.write_unaligned(pixel);
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(bytes_offset, bits[0]);
bytes.write(bytes_offset + 1, bits[1]);
bytes.write(bytes_offset + 2, bits[2]);
}
unsafe {
let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
}

let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
Expand Down
Loading

0 comments on commit 6fd9b5a

Please sign in to comment.