From 849baf41d8cba5d9e023c32dfd4c4f98cceda095 Mon Sep 17 00:00:00 2001 From: David Palm Date: Thu, 17 May 2018 11:27:50 +0200 Subject: [PATCH] Don't unroll outer loop Not unrolling the outer loop seems to speed up hashing quite significantly: Original (unrolled): ``` running 3 tests test bench_keccak_256_with_empty_input ... bench: 557 ns/iter (+/- 46) test bench_keccak_256_with_large_input ... bench: 17,288 ns/iter (+/- 1,871) = 236 MB/s test bench_keccak_256_with_typical_input ... bench: 577 ns/iter (+/- 28) = 88 MB/s ``` This branch (not unrolled): ``` running 3 tests test bench_keccak_256_with_empty_input ... bench: 487 ns/iter (+/- 25) test bench_keccak_256_with_large_input ... bench: 14,645 ns/iter (+/- 675) = 279 MB/s test bench_keccak_256_with_typical_input ... bench: 495 ns/iter (+/- 32) = 103 MB/s ``` "Inspired" by https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138 Running benchmarks from the `keccak-hash` crate so we can compare to the numbers [here](https://github.com/paritytech/keccak-hash/pull/1). --- .gitignore | 1 + src/lib.rs | 98 ++++++++++++++++++++++++++---------------------------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/.gitignore b/.gitignore index d4f917d..9b38af7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ target Cargo.lock *.swp +.idea diff --git a/src/lib.rs b/src/lib.rs index 877a527..c33c18b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,72 +67,70 @@ const RC: [u64; 24] = [ /// keccak-f[1600] pub fn keccakf(a: &mut [u64; PLEN]) { let mut arrays: [[u64; 5]; 24] = [[0; 5]; 24]; - - unroll! { - for i in 0..24 { - // Theta - unroll! { - for x in 0..5 { - // This looks useless but it gets way slower without it. I tried using - // `mem::uninitialized` for the initialisation of `arrays` but that also makes - // it slower, although not by as much as removing this assignment. Optimisers - // are weird.
Maybe a different version of LLVM will react differently, so if - // you see this comment in the future try deleting this assignment and using - // uninit above and see how it affects the benchmarks. - arrays[i][x] = 0; - - unroll! { - for y_count in 0..5 { - let y = y_count * 5; - arrays[i][x] ^= a[x + y]; - } + // Not unrolling this is faster, see https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138 + for i in 0..24 { + // Theta + unroll! { + for x in 0..5 { + // This looks useless but it gets way slower without it. I tried using + // `mem::uninitialized` for the initialisation of `arrays` but that also makes + // it slower, although not by as much as removing this assignment. Optimisers + // are weird. Maybe a different version of LLVM will react differently, so if + // you see this comment in the future try deleting this assignment and using + // uninit above and see how it affects the benchmarks. + arrays[i][x] = 0; + + unroll! { + for y_count in 0..5 { + let y = y_count * 5; + arrays[i][x] ^= a[x + y]; } } } + } - unroll! { - for x in 0..5 { - unroll! { - for y_count in 0..5 { - let y = y_count * 5; - a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1); - } + unroll! { + for x in 0..5 { + unroll! { + for y_count in 0..5 { + let y = y_count * 5; + a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1); } } } + } - // Rho and pi - let mut last = a[1]; - unroll! { - for x in 0..24 { - arrays[i][0] = a[PI[x]]; - a[PI[x]] = last.rotate_left(RHO[x]); - last = arrays[i][0]; - } + // Rho and pi + let mut last = a[1]; + unroll! { + for x in 0..24 { + arrays[i][0] = a[PI[x]]; + a[PI[x]] = last.rotate_left(RHO[x]); + last = arrays[i][0]; } + } - // Chi - unroll! { - for y_step in 0..5 { - let y = y_step * 5; + // Chi + unroll! { + for y_step in 0..5 { + let y = y_step * 5; - unroll! { - for x in 0..5 { - arrays[i][x] = a[y + x]; - } + unroll! { + for x in 0..5 { + arrays[i][x] = a[y + x]; } + } - unroll! 
{ - for x in 0..5 { - a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5])); - } + unroll! { + for x in 0..5 { + a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5])); } } - }; + } + }; - // Iota - a[0] ^= RC[i]; - } + // Iota + a[0] ^= RC[i]; } }