Added vertical u16

awxkee · Jul 28, 2024 · 83c4a97 · 83c4a97
1 parent 68f0f1c
commit 83c4a97
Show file tree

Hide file tree

Showing 10 changed files with 621 additions and 601 deletions.
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ let resized = scaler.resize_rgba(
 let resized_image = resized.as_bytes();
 ```
 
-### Fastest path with SIMD
+### Fastest paths using SIMD
 
 Although all implementations are fast, not all paths are implemented using SIMD, so some paths are slower
 

diff --git a/app/src/main.rs b/app/src/main.rs
@@ -21,7 +21,7 @@ use pic_scale::{
 
 fn main() {
     // test_fast_image();
-    let img = ImageReader::open("./assets/asset_5.png")
+    let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png")
         .unwrap()
         .decode()
         .unwrap();
@@ -31,18 +31,14 @@ fn main() {
     let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
     scaler.set_threading_policy(ThreadingPolicy::Single);
 
-    //
-
-    let mut f32_bytes: Vec<f32> = bytes.iter().map(|&x| x as f32 / 255f32).collect();
-
     let start_time = Instant::now();
-    let store = ImageStore::<f32, 4>::from_slice(
-        &mut f32_bytes,
+    let store = ImageStore::<u8, 4>::from_slice(
+        &mut bytes,
         dimensions.0 as usize,
         dimensions.1 as usize,
     )
     .unwrap();
-    let resized = scaler.resize_rgba_f32(
+    let resized = scaler.resize_rgba(
         ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
         store,
         false,
@@ -97,15 +93,15 @@ fn main() {
     // Print the elapsed time in milliseconds
     println!("Scaler: {:.2?}", elapsed_time);
 
-    let dst: Vec<u8> = resized
-        .as_bytes()
-        .iter()
-        .map(|&x| (x * 255f32) as u8)
-        .collect();
+    // let dst: Vec<u8> = resized
+    //     .as_bytes()
+    //     .iter()
+    //     .map(|&x| (x * 255f32) as u8)
+    //     .collect();
 
     // let dst: Vec<u8> = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect();
     //
-    // let dst = resized.as_bytes();
+    let dst = resized.as_bytes();
 
     if resized.channels == 4 {
         image::save_buffer(

diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs
@@ -27,6 +27,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+use crate::support::PRECISION;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
@@ -332,8 +333,8 @@ pub unsafe fn _mm256_srai_epi64x<const IMM8: i32>(a: __m256i) -> __m256i {
 }
 
 #[inline]
-/// Pack 64bytes integers into 32 bytes
-pub unsafe fn _mm256_packus_epi64(a: __m256i, b: __m256i) -> __m256i {
+/// Pack 64-bit integers into 32-bit integers using truncation
+pub unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i {
     const SHUFFLE_1: i32 = shuffle(2, 0, 2, 0);
     let combined = _mm256_shuffle_ps::<SHUFFLE_1>(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b));
     const SHUFFLE_2: i32 = shuffle(3, 1, 2, 0);
@@ -344,11 +345,11 @@ pub unsafe fn _mm256_packus_epi64(a: __m256i, b: __m256i) -> __m256i {
 #[inline]
 #[allow(dead_code)]
 /// Pack 64-bit integers into 32-bit integers
-pub unsafe fn _mm_cvtepi64_epi32x(v: __m256i) -> __m128i {
+pub unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i {
     let vf = _mm256_castsi256_ps(v);
     let hi = _mm256_extractf128_ps::<1>(vf);
     let lo = _mm256_castps256_ps128(vf);
     const FLAGS: i32 = shuffle(2, 0, 2, 0);
     let packed = _mm_shuffle_ps::<FLAGS>(lo, hi);
     return _mm_castps_si128(packed);
-}
+}
diff --git a/src/avx2/vertical_u16.rs b/src/avx2/vertical_u16.rs
@@ -32,7 +32,7 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-use crate::avx2::utils::{_mm256_packus_epi64, _mm256_srai_epi64x};
+use crate::avx2::utils::{_mm256_packts_epi64, _mm256_srai_epi64x};
 use crate::filter_weights::FilterBounds;
 use crate::support::{PRECISION, ROUNDING_APPROX};
 
@@ -148,16 +148,16 @@ unsafe fn consume_u16_32(
     let n_store_6 = _mm256_srai_epi64x::<PRECISION>(store_6);
     let n_store_7 = _mm256_srai_epi64x::<PRECISION>(store_7);
 
-    let mut new_store_0 = _mm256_packus_epi64(n_store_0, n_store_1);
+    let mut new_store_0 = _mm256_packts_epi64(n_store_0, n_store_1);
     new_store_0 = _mm256_min_epi32(_mm256_max_epi32(new_store_0, zeros), v_max_colors);
 
-    let mut new_store_1 = _mm256_packus_epi64(n_store_2, n_store_3);
+    let mut new_store_1 = _mm256_packts_epi64(n_store_2, n_store_3);
     new_store_1 = _mm256_min_epi32(_mm256_max_epi32(new_store_1, zeros), v_max_colors);
 
-    let mut new_store_2 = _mm256_packus_epi64(n_store_4, n_store_5);
+    let mut new_store_2 = _mm256_packts_epi64(n_store_4, n_store_5);
     new_store_2 = _mm256_min_epi32(_mm256_max_epi32(new_store_2, zeros), v_max_colors);
 
-    let mut new_store_3 = _mm256_packus_epi64(n_store_6, n_store_7);
+    let mut new_store_3 = _mm256_packts_epi64(n_store_6, n_store_7);
     new_store_3 = _mm256_min_epi32(_mm256_max_epi32(new_store_3, zeros), v_max_colors);
 
     let store_0 = _mm256_packus_epi32(new_store_0, new_store_1);
@@ -238,10 +238,10 @@ unsafe fn consume_u16_16(
     let n_store_2 = _mm256_srai_epi64x::<PRECISION>(store_2);
     let n_store_3 = _mm256_srai_epi64x::<PRECISION>(store_3);
 
-    let mut new_store_0 = _mm256_packus_epi64(n_store_0, n_store_1);
+    let mut new_store_0 = _mm256_packts_epi64(n_store_0, n_store_1);
     new_store_0 = _mm256_min_epi32(_mm256_max_epi32(new_store_0, zeros), v_max_colors);
 
-    let mut new_store_1 = _mm256_packus_epi64(n_store_2, n_store_3);
+    let mut new_store_1 = _mm256_packts_epi64(n_store_2, n_store_3);
     new_store_1 = _mm256_min_epi32(_mm256_max_epi32(new_store_1, zeros), v_max_colors);
 
     let store = _mm256_packus_epi32(new_store_0, new_store_1);
@@ -300,7 +300,7 @@ unsafe fn consume_u16_8(
     let n_store_0 = _mm256_srai_epi64x::<PRECISION>(store_0);
     let n_store_1 = _mm256_srai_epi64x::<PRECISION>(store_1);
 
-    let mut new_store_0 = _mm256_packus_epi64(n_store_0, n_store_1);
+    let mut new_store_0 = _mm256_packts_epi64(n_store_0, n_store_1);
     new_store_0 = _mm256_min_epi32(_mm256_max_epi32(new_store_0, zeros), v_max_colors);
 
     let store_u16 = _mm256_castsi256_si128(_mm256_packus_epi32(new_store_0, new_store_0));
@@ -346,7 +346,7 @@ unsafe fn consume_u16_4(
     let v_max_colors = _mm256_set1_epi32(max_colors);
     let n_store_0 = _mm256_srai_epi64x::<PRECISION>(store);
 
-    let mut new_store = _mm256_packus_epi64(n_store_0, n_store_0);
+    let mut new_store = _mm256_packts_epi64(n_store_0, n_store_0);
     new_store = _mm256_min_epi32(_mm256_max_epi32(new_store, zeros), v_max_colors);
 
     let store_u16 = _mm256_castsi256_si128(_mm256_packus_epi32(new_store, new_store));
@@ -390,7 +390,7 @@ unsafe fn consume_u16_1(
     let v_max_colors = _mm256_set1_epi32(max_colors);
 
     let shrinked_64 = _mm256_srai_epi64x::<PRECISION>(store);
-    let shrinked = _mm256_packus_epi64(shrinked_64, shrinked_64);
+    let shrinked = _mm256_packts_epi64(shrinked_64, shrinked_64);
     let shrinked_store = _mm256_min_epi32(_mm256_max_epi32(shrinked, zeros), v_max_colors);
     let dst_ptr = dst.add(px);
     let value = _mm256_extract_epi32::<0>(shrinked_store);

diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs
@@ -48,7 +48,7 @@ use crate::sse::convolve_vertical_sse_row;
     any(target_arch = "x86_64", target_arch = "x86"),
     target_feature = "sse4.1"
 ))]
-use crate::sse::sse_rgb::{
+use crate::sse::{
     convolve_horizontal_rgb_sse_row_one, convolve_horizontal_rgb_sse_rows_4,
 };
 use num_traits::AsPrimitive;

diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs
@@ -48,9 +48,7 @@ use crate::sse::convolve_vertical_sse_row;
     any(target_arch = "x86_64", target_arch = "x86"),
     target_feature = "sse4.1"
 ))]
-use crate::sse::sse_rgb::{
-    convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one,
-};
+use crate::sse::{convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one};
 use crate::ImageStore;
 use rayon::ThreadPool;
 
@@ -76,7 +74,6 @@ impl<'a> HorizontalConvolutionPass<u8, 4> for ImageStore<'a, u8, 4> {
             target_feature = "sse4.1"
         ))]
         {
-            _dispatcher_4_rows = Some(convolve_horizontal_rgba_native_4_row::<u8, i32, 4>);
             if is_x86_feature_detected!("sse4.1") {
                 _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4);
                 _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one;

diff --git a/src/sse/mod.rs b/src/sse/mod.rs
@@ -49,6 +49,7 @@ mod vertical_f16;
 mod vertical_f32;
 mod vertical_u16;
 mod vertical_u8;
+mod rgba_u8;
 
 #[cfg(all(feature = "half", target_feature = "f16c"))]
 pub use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16};
@@ -76,6 +77,7 @@ pub use vertical_f16::convolve_vertical_rgb_sse_row_f16;
 pub use vertical_f32::convolve_vertical_rgb_sse_row_f32;
 pub use vertical_u16::convolve_vertical_rgb_sse_row_u16;
 pub use vertical_u8::convolve_vertical_sse_row;
+pub use rgba_u8::{convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one};
 
 pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
     ((z << 6) | (y << 4) | (x << 2) | w) as i32