Don't unroll outer loop

Not unrolling the outer loop seems to speed up hashing quite significantly.

Original (unrolled):

```
running 3 tests
test bench_keccak_256_with_empty_input ... bench: 557 ns/iter (+/- 46)
test bench_keccak_256_with_large_input ... bench: 17,288 ns/iter (+/- 1,871) = 236 MB/s
test bench_keccak_256_with_typical_input ... bench: 577 ns/iter (+/- 28) = 88 MB/s
```

This branch (not unrolled):

```
running 3 tests
test bench_keccak_256_with_empty_input ... bench: 487 ns/iter (+/- 25)
test bench_keccak_256_with_large_input ... bench: 14,645 ns/iter (+/- 675) = 279 MB/s
test bench_keccak_256_with_typical_input ... bench: 495 ns/iter (+/- 32) = 103 MB/s
```

"Inspired" by https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138

The benchmarks are run from the `keccak-hash` crate so that the numbers can be compared with those [here](paritytech/keccak-hash#1).
dvdplm · May 17, 2018 · 849baf4 · 849baf4
1 parent eda020a
commit 849baf4
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 50 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 target
 Cargo.lock
 *.swp
+.idea
diff --git a/src/lib.rs b/src/lib.rs
@@ -67,72 +67,70 @@ const RC: [u64; 24] = [
 /// keccak-f[1600]
 pub fn keccakf(a: &mut [u64; PLEN]) {
     let mut arrays: [[u64; 5]; 24] = [[0; 5]; 24];
-
-    unroll! {
-        for i in 0..24 {
-            // Theta
-            unroll! {
-                for x in 0..5 {
-                    // This looks useless but it gets way slower without it. I tried using
-                    // `mem::uninitialized` for the initialisation of `arrays` but that also makes
-                    // it slower, although not by as much as removing this assignment. Optimisers
-                    // are weird. Maybe a different version of LLVM will react differently, so if
-                    // you see this comment in the future try deleting this assignment and using
-                    // uninit above and see how it affects the benchmarks.
-                    arrays[i][x] = 0;
-
-                    unroll! {
-                        for y_count in 0..5 {
-                            let y = y_count * 5;
-                            arrays[i][x] ^= a[x + y];
-                        }
+    // Not unrolling this is faster, see https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138
+    for i in 0..24 {
+        // Theta
+        unroll! {
+            for x in 0..5 {
+                // This looks useless but it gets way slower without it. I tried using
+                // `mem::uninitialized` for the initialisation of `arrays` but that also makes
+                // it slower, although not by as much as removing this assignment. Optimisers
+                // are weird. Maybe a different version of LLVM will react differently, so if
+                // you see this comment in the future try deleting this assignment and using
+                // uninit above and see how it affects the benchmarks.
+                arrays[i][x] = 0;
+
+                unroll! {
+                    for y_count in 0..5 {
+                        let y = y_count * 5;
+                        arrays[i][x] ^= a[x + y];
                     }
                 }
             }
+        }
 
-            unroll! {
-                for x in 0..5 {
-                    unroll! {
-                        for y_count in 0..5 {
-                            let y = y_count * 5;
-                            a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1);
-                        }
+        unroll! {
+            for x in 0..5 {
+                unroll! {
+                    for y_count in 0..5 {
+                        let y = y_count * 5;
+                        a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1);
                     }
                 }
             }
+        }
 
-            // Rho and pi
-            let mut last = a[1];
-            unroll! {
-                for x in 0..24 {
-                    arrays[i][0] = a[PI[x]];
-                    a[PI[x]] = last.rotate_left(RHO[x]);
-                    last = arrays[i][0];
-                }
+        // Rho and pi
+        let mut last = a[1];
+        unroll! {
+            for x in 0..24 {
+                arrays[i][0] = a[PI[x]];
+                a[PI[x]] = last.rotate_left(RHO[x]);
+                last = arrays[i][0];
             }
+        }
 
-            // Chi
-            unroll! {
-                for y_step in 0..5 {
-                    let y = y_step * 5;
+        // Chi
+        unroll! {
+            for y_step in 0..5 {
+                let y = y_step * 5;
 
-                    unroll! {
-                        for x in 0..5 {
-                            arrays[i][x] = a[y + x];
-                        }
+                unroll! {
+                    for x in 0..5 {
+                        arrays[i][x] = a[y + x];
                     }
+                }
 
-                    unroll! {
-                        for x in 0..5 {
-                            a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5]));
-                        }
+                unroll! {
+                    for x in 0..5 {
+                        a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5]));
                     }
                 }
-            };
+            }
+        };
 
-            // Iota
-            a[0] ^= RC[i];
-        }
+        // Iota
+        a[0] ^= RC[i];
     }
 }