Skip to content

Commit

Permalink
Added vertical u16
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jul 28, 2024
1 parent 68f0f1c commit 83c4a97
Show file tree
Hide file tree
Showing 10 changed files with 621 additions and 601 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ let resized = scaler.resize_rgba(
let resized_image = resized.as_bytes();
```

### Fastest path with SIMD
### Fastest paths using SIMD

Although all implementations are fast, not all paths are implemented using SIMD, so some paths are slower

Expand Down
24 changes: 10 additions & 14 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use pic_scale::{

fn main() {
// test_fast_image();
let img = ImageReader::open("./assets/asset_5.png")
let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png")
.unwrap()
.decode()
.unwrap();
Expand All @@ -31,18 +31,14 @@ fn main() {
let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
scaler.set_threading_policy(ThreadingPolicy::Single);

//

let mut f32_bytes: Vec<f32> = bytes.iter().map(|&x| x as f32 / 255f32).collect();

let start_time = Instant::now();
let store = ImageStore::<f32, 4>::from_slice(
&mut f32_bytes,
let store = ImageStore::<u8, 4>::from_slice(
&mut bytes,
dimensions.0 as usize,
dimensions.1 as usize,
)
.unwrap();
let resized = scaler.resize_rgba_f32(
let resized = scaler.resize_rgba(
ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
store,
false,
Expand Down Expand Up @@ -97,15 +93,15 @@ fn main() {
// Print the elapsed time in milliseconds
println!("Scaler: {:.2?}", elapsed_time);

let dst: Vec<u8> = resized
.as_bytes()
.iter()
.map(|&x| (x * 255f32) as u8)
.collect();
// let dst: Vec<u8> = resized
// .as_bytes()
// .iter()
// .map(|&x| (x * 255f32) as u8)
// .collect();

// let dst: Vec<u8> = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect();
//
// let dst = resized.as_bytes();
let dst = resized.as_bytes();

if resized.channels == 4 {
image::save_buffer(
Expand Down
9 changes: 5 additions & 4 deletions src/avx2/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use crate::support::PRECISION;

Check warning on line 30 in src/avx2/utils.rs

View workflow job for this annotation

GitHub Actions / Build

unused import: `crate::support::PRECISION`

Check warning on line 30 in src/avx2/utils.rs

View workflow job for this annotation

GitHub Actions / Build

unused import: `crate::support::PRECISION`
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
Expand Down Expand Up @@ -332,8 +333,8 @@ pub unsafe fn _mm256_srai_epi64x<const IMM8: i32>(a: __m256i) -> __m256i {
}

#[inline]
/// Pack 64bytes integers into 32 bytes
pub unsafe fn _mm256_packus_epi64(a: __m256i, b: __m256i) -> __m256i {
/// Packs 64-bit integers into 32-bit integers using truncation (keeps the low 32 bits of each lane)
pub unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i {
const SHUFFLE_1: i32 = shuffle(2, 0, 2, 0);
let combined = _mm256_shuffle_ps::<SHUFFLE_1>(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b));
const SHUFFLE_2: i32 = shuffle(3, 1, 2, 0);
Expand All @@ -344,11 +345,11 @@ pub unsafe fn _mm256_packus_epi64(a: __m256i, b: __m256i) -> __m256i {
#[inline]
#[allow(dead_code)]
/// Packs four 64-bit integers into four 32-bit integers, keeping the low 32 bits of each lane
pub unsafe fn _mm_cvtepi64_epi32x(v: __m256i) -> __m128i {
pub unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i {
let vf = _mm256_castsi256_ps(v);
let hi = _mm256_extractf128_ps::<1>(vf);
let lo = _mm256_castps256_ps128(vf);
const FLAGS: i32 = shuffle(2, 0, 2, 0);
let packed = _mm_shuffle_ps::<FLAGS>(lo, hi);
return _mm_castps_si128(packed);
}
}
20 changes: 10 additions & 10 deletions src/avx2/vertical_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::avx2::utils::{_mm256_packus_epi64, _mm256_srai_epi64x};
use crate::avx2::utils::{_mm256_packts_epi64, _mm256_srai_epi64x};
use crate::filter_weights::FilterBounds;
use crate::support::{PRECISION, ROUNDING_APPROX};

Expand Down Expand Up @@ -148,16 +148,16 @@ unsafe fn consume_u16_32(
let n_store_6 = _mm256_srai_epi64x::<PRECISION>(store_6);
let n_store_7 = _mm256_srai_epi64x::<PRECISION>(store_7);

let mut new_store_0 = _mm256_packus_epi64(n_store_0, n_store_1);
let mut new_store_0 = _mm256_packts_epi64(n_store_0, n_store_1);
new_store_0 = _mm256_min_epi32(_mm256_max_epi32(new_store_0, zeros), v_max_colors);

let mut new_store_1 = _mm256_packus_epi64(n_store_2, n_store_3);
let mut new_store_1 = _mm256_packts_epi64(n_store_2, n_store_3);
new_store_1 = _mm256_min_epi32(_mm256_max_epi32(new_store_1, zeros), v_max_colors);

let mut new_store_2 = _mm256_packus_epi64(n_store_4, n_store_5);
let mut new_store_2 = _mm256_packts_epi64(n_store_4, n_store_5);
new_store_2 = _mm256_min_epi32(_mm256_max_epi32(new_store_2, zeros), v_max_colors);

let mut new_store_3 = _mm256_packus_epi64(n_store_6, n_store_7);
let mut new_store_3 = _mm256_packts_epi64(n_store_6, n_store_7);
new_store_3 = _mm256_min_epi32(_mm256_max_epi32(new_store_3, zeros), v_max_colors);

let store_0 = _mm256_packus_epi32(new_store_0, new_store_1);
Expand Down Expand Up @@ -238,10 +238,10 @@ unsafe fn consume_u16_16(
let n_store_2 = _mm256_srai_epi64x::<PRECISION>(store_2);
let n_store_3 = _mm256_srai_epi64x::<PRECISION>(store_3);

let mut new_store_0 = _mm256_packus_epi64(n_store_0, n_store_1);
let mut new_store_0 = _mm256_packts_epi64(n_store_0, n_store_1);
new_store_0 = _mm256_min_epi32(_mm256_max_epi32(new_store_0, zeros), v_max_colors);

let mut new_store_1 = _mm256_packus_epi64(n_store_2, n_store_3);
let mut new_store_1 = _mm256_packts_epi64(n_store_2, n_store_3);
new_store_1 = _mm256_min_epi32(_mm256_max_epi32(new_store_1, zeros), v_max_colors);

let store = _mm256_packus_epi32(new_store_0, new_store_1);
Expand Down Expand Up @@ -300,7 +300,7 @@ unsafe fn consume_u16_8(
let n_store_0 = _mm256_srai_epi64x::<PRECISION>(store_0);
let n_store_1 = _mm256_srai_epi64x::<PRECISION>(store_1);

let mut new_store_0 = _mm256_packus_epi64(n_store_0, n_store_1);
let mut new_store_0 = _mm256_packts_epi64(n_store_0, n_store_1);
new_store_0 = _mm256_min_epi32(_mm256_max_epi32(new_store_0, zeros), v_max_colors);

let store_u16 = _mm256_castsi256_si128(_mm256_packus_epi32(new_store_0, new_store_0));
Expand Down Expand Up @@ -346,7 +346,7 @@ unsafe fn consume_u16_4(
let v_max_colors = _mm256_set1_epi32(max_colors);
let n_store_0 = _mm256_srai_epi64x::<PRECISION>(store);

let mut new_store = _mm256_packus_epi64(n_store_0, n_store_0);
let mut new_store = _mm256_packts_epi64(n_store_0, n_store_0);
new_store = _mm256_min_epi32(_mm256_max_epi32(new_store, zeros), v_max_colors);

let store_u16 = _mm256_castsi256_si128(_mm256_packus_epi32(new_store, new_store));
Expand Down Expand Up @@ -390,7 +390,7 @@ unsafe fn consume_u16_1(
let v_max_colors = _mm256_set1_epi32(max_colors);

let shrinked_64 = _mm256_srai_epi64x::<PRECISION>(store);
let shrinked = _mm256_packus_epi64(shrinked_64, shrinked_64);
let shrinked = _mm256_packts_epi64(shrinked_64, shrinked_64);
let shrinked_store = _mm256_min_epi32(_mm256_max_epi32(shrinked, zeros), v_max_colors);
let dst_ptr = dst.add(px);
let value = _mm256_extract_epi32::<0>(shrinked_store);
Expand Down
2 changes: 1 addition & 1 deletion src/rgb_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ use crate::sse::convolve_vertical_sse_row;
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "sse4.1"
))]
use crate::sse::sse_rgb::{
use crate::sse::{
convolve_horizontal_rgb_sse_row_one, convolve_horizontal_rgb_sse_rows_4,
};
use num_traits::AsPrimitive;
Expand Down
5 changes: 1 addition & 4 deletions src/rgba_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ use crate::sse::convolve_vertical_sse_row;
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "sse4.1"
))]
use crate::sse::sse_rgb::{
convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one,
};
use crate::sse::{convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one};
use crate::ImageStore;
use rayon::ThreadPool;

Expand All @@ -76,7 +74,6 @@ impl<'a> HorizontalConvolutionPass<u8, 4> for ImageStore<'a, u8, 4> {
target_feature = "sse4.1"
))]
{
_dispatcher_4_rows = Some(convolve_horizontal_rgba_native_4_row::<u8, i32, 4>);
if is_x86_feature_detected!("sse4.1") {
_dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4);
_dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one;
Expand Down
2 changes: 2 additions & 0 deletions src/sse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ mod vertical_f16;
mod vertical_f32;
mod vertical_u16;
mod vertical_u8;
mod rgba_u8;

#[cfg(all(feature = "half", target_feature = "f16c"))]
pub use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16};
Expand Down Expand Up @@ -76,6 +77,7 @@ pub use vertical_f16::convolve_vertical_rgb_sse_row_f16;
pub use vertical_f32::convolve_vertical_rgb_sse_row_f32;
pub use vertical_u16::convolve_vertical_rgb_sse_row_u16;
pub use vertical_u8::convolve_vertical_sse_row;
pub use rgba_u8::{convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one};

pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
((z << 6) | (y << 4) | (x << 2) | w) as i32
Expand Down
Loading

0 comments on commit 83c4a97

Please sign in to comment.