From 849baf41d8cba5d9e023c32dfd4c4f98cceda095 Mon Sep 17 00:00:00 2001
From: David Palm <dvdplm@gmail.com>
Date: Thu, 17 May 2018 11:27:50 +0200
Subject: [PATCH] Don't unroll outer loop

Not unrolling the outer loop seems to speed up hashing quite significantly:

Original (unrolled):
```
running 3 tests
test bench_keccak_256_with_empty_input   ... bench:         557 ns/iter (+/- 46)
test bench_keccak_256_with_large_input   ... bench:      17,288 ns/iter (+/- 1,871) = 236 MB/s
test bench_keccak_256_with_typical_input ... bench:         577 ns/iter (+/- 28) = 88 MB/s
```

This branch (not unrolled):
```
running 3 tests
test bench_keccak_256_with_empty_input   ... bench:         487 ns/iter (+/- 25)
test bench_keccak_256_with_large_input   ... bench:      14,645 ns/iter (+/- 675) = 279 MB/s
test bench_keccak_256_with_typical_input ... bench:         495 ns/iter (+/- 32) = 103 MB/s
```

"Inspired" by https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138

Running benchmarks from the `keccak-hash` crate so we can compare to the numbers [here](https://github.com/paritytech/keccak-hash/pull/1).
---
 .gitignore |  1 +
 src/lib.rs | 98 ++++++++++++++++++++++++++----------------------------
 2 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/.gitignore b/.gitignore
index d4f917d..9b38af7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 target
 Cargo.lock
 *.swp
+.idea
diff --git a/src/lib.rs b/src/lib.rs
index 877a527..c33c18b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -67,72 +67,70 @@ const RC: [u64; 24] = [
 /// keccak-f[1600]
 pub fn keccakf(a: &mut [u64; PLEN]) {
     let mut arrays: [[u64; 5]; 24] = [[0; 5]; 24];
-
-    unroll! {
-        for i in 0..24 {
-            // Theta
-            unroll! {
-                for x in 0..5 {
-                    // This looks useless but it gets way slower without it. I tried using
-                    // `mem::uninitialized` for the initialisation of `arrays` but that also makes
-                    // it slower, although not by as much as removing this assignment. Optimisers
-                    // are weird. Maybe a different version of LLVM will react differently, so if
-                    // you see this comment in the future try deleting this assignment and using
-                    // uninit above and see how it affects the benchmarks.
-                    arrays[i][x] = 0;
-
-                    unroll! {
-                        for y_count in 0..5 {
-                            let y = y_count * 5;
-                            arrays[i][x] ^= a[x + y];
-                        }
+    // Not unrolling this is faster, see https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138
+    for i in 0..24 {
+        // Theta
+        unroll! {
+            for x in 0..5 {
+                // This looks useless but it gets way slower without it. I tried using
+                // `mem::uninitialized` for the initialisation of `arrays` but that also makes
+                // it slower, although not by as much as removing this assignment. Optimisers
+                // are weird. Maybe a different version of LLVM will react differently, so if
+                // you see this comment in the future try deleting this assignment and using
+                // uninit above and see how it affects the benchmarks.
+                arrays[i][x] = 0;
+
+                unroll! {
+                    for y_count in 0..5 {
+                        let y = y_count * 5;
+                        arrays[i][x] ^= a[x + y];
                     }
                 }
             }
+        }
 
-            unroll! {
-                for x in 0..5 {
-                    unroll! {
-                        for y_count in 0..5 {
-                            let y = y_count * 5;
-                            a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1);
-                        }
+        unroll! {
+            for x in 0..5 {
+                unroll! {
+                    for y_count in 0..5 {
+                        let y = y_count * 5;
+                        a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1);
                     }
                 }
             }
+        }
 
-            // Rho and pi
-            let mut last = a[1];
-            unroll! {
-                for x in 0..24 {
-                    arrays[i][0] = a[PI[x]];
-                    a[PI[x]] = last.rotate_left(RHO[x]);
-                    last = arrays[i][0];
-                }
+        // Rho and pi
+        let mut last = a[1];
+        unroll! {
+            for x in 0..24 {
+                arrays[i][0] = a[PI[x]];
+                a[PI[x]] = last.rotate_left(RHO[x]);
+                last = arrays[i][0];
             }
+        }
 
-            // Chi
-            unroll! {
-                for y_step in 0..5 {
-                    let y = y_step * 5;
+        // Chi
+        unroll! {
+            for y_step in 0..5 {
+                let y = y_step * 5;
 
-                    unroll! {
-                        for x in 0..5 {
-                            arrays[i][x] = a[y + x];
-                        }
+                unroll! {
+                    for x in 0..5 {
+                        arrays[i][x] = a[y + x];
                     }
+                }
 
-                    unroll! {
-                        for x in 0..5 {
-                            a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5]));
-                        }
+                unroll! {
+                    for x in 0..5 {
+                        a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5]));
                     }
                 }
-            };
+            }
+        };
 
-            // Iota
-            a[0] ^= RC[i];
-        }
+        // Iota
+        a[0] ^= RC[i];
     }
 }