From 0fcc94c69224b204f9d821d0b8f5e8b8a1be794b Mon Sep 17 00:00:00 2001 From: Yu Wei Wu Date: Thu, 7 Mar 2019 16:18:29 +0800 Subject: [PATCH] fix: rearrange bitwise operations while transform While the optimized transform function produces the same results, the number of operations in each assignment differs from the original. So we reduce and rearrange some operators back to their original positions. The performance improvement is preserved. Fix #120 --- src/pow_avx.c | 28 ++++++++++++---------------- src/pow_c.c | 12 ++++++------ src/pow_kernel.cl | 6 +++--- src/pow_sse.c | 12 ++++++------ 4 files changed, 27 insertions(+), 31 deletions(-) diff --git a/src/pow_avx.c b/src/pow_avx.c index 83aba86..faa5cec 100644 --- a/src/pow_avx.c +++ b/src/pow_avx.c @@ -31,10 +31,9 @@ static void transform256(__m256i *lmid, __m256i *hmid) alpha = lfrom[t1]; beta = hfrom[t1]; - delta = _mm256_xor_si256(lfrom[t2], beta); /* lfrom[t2] ^ beta */ - - lto[j] = _mm256_andnot_si256(_mm256_and_si256(delta, alpha), - one); /* ~(delta & alpha) */ + delta = _mm256_and_si256( alpha, _mm256_xor_si256(lfrom[t2], beta)); + /* alpha & (lfrom[t2] ^ beta) */ + lto[j] = _mm256_andnot_si256(delta, one); /* ~delta */ hto[j] = _mm256_or_si256(_mm256_xor_si256(alpha, hfrom[t2]), delta); /* (alpha ^ hfrom[t2]) | delta */ } @@ -49,10 +48,9 @@ static void transform256(__m256i *lmid, __m256i *hmid) t2 = indices[j + 1]; alpha = lfrom[t1]; beta = hfrom[t1]; - delta = _mm256_xor_si256(lfrom[t2], beta); /* lfrom[t2] ^ beta */ - - lto[j] = _mm256_andnot_si256(_mm256_and_si256(delta, alpha), - one); /* ~(delta & alpha) */ + delta = _mm256_and_si256( alpha, _mm256_xor_si256(lfrom[t2], beta)); + /* alpha & (lfrom[t2] ^ beta) */ + lto[j] = _mm256_andnot_si256(delta, one); /* ~delta */ hto[j] = _mm256_or_si256(_mm256_xor_si256(alpha, hfrom[t2]), delta); /* (alpha ^ hfrom[t2]) | delta */ } @@ -223,10 +221,9 @@ static void transform256(__m256d *lmid, __m256d *hmid) alpha = lfrom[t1]; beta = hfrom[t1]; - delta = _mm256_xor_pd(lfrom[t2], beta); /* lfrom[t2] 
^ beta */ - - lto[j] = _mm256_andnot_pd(_mm256_and_pd(delta, alpha), - one); /* ~(delta & alpha) */ + delta = _mm256_and_pd(alpha, _mm256_xor_pd(lfrom[t2], beta)); + /* alpha & (lfrom[t2] ^ beta) */ + lto[j] = _mm256_andnot_pd(delta, one); /* ~delta */ hto[j] = _mm256_or_pd(_mm256_xor_pd(alpha, hfrom[t2]), delta); /* (alpha ^ hfrom[t2]) | delta */ } @@ -242,10 +239,9 @@ static void transform256(__m256d *lmid, __m256d *hmid) alpha = lfrom[t1]; beta = hfrom[t1]; - delta = _mm256_xor_pd(lfrom[t2], beta); /* lfrom[t2] ^ beta */ - - lto[j] = _mm256_andnot_pd(_mm256_and_pd(delta, alpha), - one); /* ~(delta & alpha) */ + delta = _mm256_and_pd(alpha, _mm256_xor_pd(lfrom[t2], beta)); + /* alpha & (lfrom[t2] ^ beta) */ + lto[j] = _mm256_andnot_pd(delta, one); /* ~delta */ hto[j] = _mm256_or_pd(_mm256_xor_pd(alpha, hfrom[t2]), delta); /* (alpha ^ hfrom[t2]) | delta */ } diff --git a/src/pow_c.c b/src/pow_c.c index cbf2d81..9f11fd2 100644 --- a/src/pow_c.c +++ b/src/pow_c.c @@ -27,9 +27,9 @@ static void transform64(uint64_t *lmid, uint64_t *hmid) int t2 = indices[j + 1]; alpha = lfrom[t1]; beta = hfrom[t1]; - delta = beta ^ lfrom[t2]; - lto[j] = ~(delta & alpha); - hto[j] = delta | (alpha ^ hfrom[t2]); + delta = alpha & (lfrom[t2] ^ beta); + lto[j] = ~delta; + hto[j] = (alpha ^ hfrom[t2]) | delta; } uint64_t *lswap = lfrom, *hswap = hfrom; lfrom = lto; @@ -43,9 +43,9 @@ static void transform64(uint64_t *lmid, uint64_t *hmid) int t2 = indices[j + 1]; alpha = lfrom[t1]; beta = hfrom[t1]; - delta = beta ^ lfrom[t2]; - lto[j] = ~(delta & alpha); - hto[j] = delta | (alpha ^ hfrom[t2]); + delta = alpha & (lfrom[t2] ^ beta); + lto[j] = ~delta; + hto[j] = (alpha ^ hfrom[t2]) | delta; } } diff --git a/src/pow_kernel.cl b/src/pow_kernel.cl index 10d5aa8..32b7946 100644 --- a/src/pow_kernel.cl +++ b/src/pow_kernel.cl @@ -134,9 +134,9 @@ void transform(__global bc_trit_t* state_low, __global bc_trit_t* state_high, k = j+1; alpha = state_low[INDEX[j]]; beta = state_high[INDEX[j]]; - delta 
= beta ^ state_low[INDEX[k]]; - sp_low[i] = ~(delta & alpha); - sp_high[i] = delta | (alpha ^ state_high[INDEX[k]]); + delta = alpha & (beta ^ state_low[INDEX[k]]); + sp_low[i] = ~delta; + sp_high[i] = (alpha ^ state_high[INDEX[k]]) | delta; } barrier(CLK_LOCAL_MEM_FENCE); for (i = 0; i < n_trits; i++) { diff --git a/src/pow_sse.c b/src/pow_sse.c index eaea2f1..da0dc1e 100644 --- a/src/pow_sse.c +++ b/src/pow_sse.c @@ -29,9 +29,9 @@ static void transform128(__m128i *lmid, __m128i *hmid) t2 = indices[j + 1]; alpha = lfrom[t1]; beta = hfrom[t1]; - delta = beta ^ lfrom[t2]; - lto[j] = ~(delta & alpha); - hto[j] = delta | (alpha ^ hfrom[t2]); + delta = alpha & (beta ^ lfrom[t2]); + lto[j] = ~delta; + hto[j] = (alpha ^ hfrom[t2]) | delta; } __m128i *lswap = lfrom, *hswap = hfrom; lfrom = lto; @@ -44,9 +44,9 @@ static void transform128(__m128i *lmid, __m128i *hmid) t2 = indices[j + 1]; alpha = lfrom[t1]; beta = hfrom[t1]; - delta = beta ^ lfrom[t2]; - lto[j] = ~(delta & alpha); - hto[j] = delta | (alpha ^ hfrom[t2]); + delta = alpha & (beta ^ lfrom[t2]); + lto[j] = ~delta; + hto[j] = (alpha ^ hfrom[t2]) | delta; } }