Skip to content

Commit

Permalink
Merge pull request #101 from wusyong/optimized
Browse files (browse the repository at this point in the history)
Optimize curl transform formulas
  • Loading branch information
jserv committed Feb 13, 2019
2 parents b2c91da + 1e725f1 commit 3d6fa34
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 69 deletions.
70 changes: 25 additions & 45 deletions src/pow_avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ static void transform256(__m256i *lmid, __m256i *hmid)
{
__m256i one = _mm256_set_epi64x(HBITS, HBITS, HBITS, HBITS);
int t1, t2;
__m256i alpha, beta, gamma, delta, ngamma;
__m256i alpha, beta, delta;
__m256i *lto = lmid + STATE_TRITS_LENGTH, *hto = hmid + STATE_TRITS_LENGTH;
__m256i *lfrom = lmid, *hfrom = hmid;

Expand All @@ -31,18 +31,12 @@ static void transform256(__m256i *lmid, __m256i *hmid)

alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
ngamma = _mm256_andnot_si256(gamma, one);
delta = _mm256_and_si256(
_mm256_or_si256(alpha, ngamma),
_mm256_xor_si256(
lfrom[t2],
beta)); /* (alpha | (~gamma)) & (lfrom[t2] ^ beta) */


lto[j] = _mm256_andnot_si256(delta, one); /* ~delta */
hto[j] = _mm256_or_si256(_mm256_xor_si256(alpha, gamma),
delta); /* (alpha ^ gamma) | delta */
delta = _mm256_xor_si256(lfrom[t2], beta); /* lfrom[t2] ^ beta */

lto[j] = _mm256_andnot_si256(
_mm256_and_si256(delta, alpha), one); /* ~(delta & alpha) */
hto[j] = _mm256_or_si256(_mm256_xor_si256(alpha, hfrom[t2]),
delta); /* (alpha ^ hfrom[t2]) | delta */
}
__m256i *lswap = lfrom, *hswap = hfrom;
lfrom = lto;
Expand All @@ -55,15 +49,12 @@ static void transform256(__m256i *lmid, __m256i *hmid)
t2 = indices[j + 1];
alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
ngamma = _mm256_andnot_si256(gamma, one);
delta = _mm256_and_si256(
_mm256_or_si256(alpha, ngamma),
_mm256_xor_si256(
lfrom[t2], beta)); /* (alpha | (~gamma)) & (lfrom[t2] ^ beta) */
lto[j] = _mm256_andnot_si256(delta, one); /* ~delta */
hto[j] = _mm256_or_si256(_mm256_xor_si256(alpha, gamma),
delta); /* (alpha ^ gamma) | delta */
delta = _mm256_xor_si256(lfrom[t2], beta); /* lfrom[t2] ^ beta */

lto[j] = _mm256_andnot_si256(
_mm256_and_si256(delta, alpha), one); /* ~(delta & alpha) */
hto[j] = _mm256_or_si256(_mm256_xor_si256(alpha, hfrom[t2]),
delta); /* (alpha ^ hfrom[t2]) | delta */
}
}

Expand Down Expand Up @@ -215,7 +206,7 @@ void transform256(__m256d *lmid, __m256d *hmid)
{
__m256d one = _mm256_set_pd(HBITS, HBITS, HBITS, HBITS);
int j, r, t1, t2;
__m256d alpha, beta, gamma, delta, ngamma;
__m256d alpha, beta, delta;
__m256d *lto = lmid + STATE_TRITS_LENGTH, *hto = hmid + STATE_TRITS_LENGTH;
__m256d *lfrom = lmid, *hfrom = hmid;
for (r = 0; r < 80; r++) {
Expand All @@ -225,18 +216,12 @@ void transform256(__m256d *lmid, __m256d *hmid)

alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
ngamma = _mm256_andnot_pd(gamma, one);
delta = _mm256_and_pd(
_mm256_or_pd(alpha, ngamma),
_mm256_xor_pd(
lfrom[t2],
beta)); //(alpha | (~gamma)) & (lfrom[t2] ^ beta);


lto[j] = _mm256_andnot_pd(delta, one); //~delta;
hto[j] = _mm256_or_pd(_mm256_xor_pd(alpha, gamma),
delta); //(alpha ^ gamma) | delta;
delta = _mm256_xor_pd(lfrom[t2], beta); /* lfrom[t2] ^ beta */

lto[j] = _mm256_andnot_pd(
_mm256_and_pd(delta, alpha), one); /* ~(delta & alpha) */
hto[j] = _mm256_or_pd(_mm256_xor_pd(alpha, hfrom[t2]),
delta); /* (alpha ^ hfrom[t2]) | delta */
}
__m256d *lswap = lfrom, *hswap = hfrom;
lfrom = lto;
Expand All @@ -250,17 +235,12 @@ void transform256(__m256d *lmid, __m256d *hmid)

alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
ngamma = _mm256_andnot_pd(gamma, one);
delta = _mm256_and_pd(
_mm256_or_pd(alpha, ngamma),
_mm256_xor_pd(lfrom[t2],
beta)); //(alpha | (~gamma)) & (lfrom[t2] ^ beta);

delta = _mm256_xor_pd(lfrom[t2], beta); /* lfrom[t2] ^ beta */

lto[j] = _mm256_andnot_pd(delta, one); //~delta;
hto[j] = _mm256_or_pd(_mm256_xor_pd(alpha, gamma),
delta); //(alpha ^ gamma) | delta;
lto[j] = _mm256_andnot_pd(
_mm256_and_pd(delta, alpha), one); /* ~(delta & alpha) */
hto[j] = _mm256_or_pd(_mm256_xor_pd(alpha, hfrom[t2]),
delta); /* (alpha ^ hfrom[t2]) | delta */
}
}

Expand Down
16 changes: 7 additions & 9 deletions src/pow_c.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

void transform64(uint64_t *lmid, uint64_t *hmid)
{
uint64_t alpha, beta, gamma, delta;
uint64_t alpha, beta, delta;
uint64_t *lfrom = lmid, *hfrom = hmid;
uint64_t *lto = lmid + STATE_TRITS_LENGTH, *hto = hmid + STATE_TRITS_LENGTH;

Expand All @@ -27,10 +27,9 @@ void transform64(uint64_t *lmid, uint64_t *hmid)
int t2 = indices[j + 1];
alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
delta = (alpha | (~gamma)) & (lfrom[t2] ^ beta);
lto[j] = ~delta;
hto[j] = (alpha ^ gamma) | delta;
delta = beta ^ lfrom[t2];
lto[j] = ~(delta & alpha);
hto[j] = delta | (alpha ^ hfrom[t2]);
}
uint64_t *lswap = lfrom, *hswap = hfrom;
lfrom = lto;
Expand All @@ -44,10 +43,9 @@ void transform64(uint64_t *lmid, uint64_t *hmid)
int t2 = indices[j + 1];
alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
delta = (alpha | (~gamma)) & (lfrom[t2] ^ beta);
lto[j] = ~delta; // 6
hto[j] = (alpha ^ gamma) | delta;
delta = beta ^ lfrom[t2];
lto[j] = ~(delta & alpha);
hto[j] = delta | (alpha ^ hfrom[t2]);
}
}

Expand Down
10 changes: 4 additions & 6 deletions src/pow_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -127,18 +127,16 @@ void transform(__global bc_trit_t* state_low, __global bc_trit_t* state_high,
__private size_t id, __private size_t l_size,
__private size_t n_trits) {
__private size_t round, i, j, k;
__private bc_trit_t alpha, beta, gamma, delta, sp_low[3], sp_high[3];
__private bc_trit_t alpha, beta, delta, sp_low[3], sp_high[3];
for (round = 0; round < NUMBER_OF_ROUNDS; round++) {
for (i = 0; i < n_trits; i++) {
j = id + i * l_size;
k = j+1;
alpha = state_low[INDEX[j]];
beta = state_high[INDEX[j]];
gamma = state_high[INDEX[k]];
delta = (alpha | (~gamma)) & (state_low[INDEX[k]] ^ beta);

sp_low[i] = ~delta;
sp_high[i] = (alpha ^ gamma) | delta;
delta = beta ^ state_low[INDEX[k]];
sp_low[i] = ~(delta & alpha);
sp_high[i] = delta | (alpha ^ state_high[INDEX[k]]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for (i = 0; i < n_trits; i++) {
Expand Down
16 changes: 7 additions & 9 deletions src/pow_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
static void transform128(__m128i *lmid, __m128i *hmid)
{
int t1, t2;
__m128i alpha, beta, gamma, delta;
__m128i alpha, beta, delta;
__m128i *lto = lmid + STATE_TRITS_LENGTH, *hto = hmid + STATE_TRITS_LENGTH;
__m128i *lfrom = lmid, *hfrom = hmid;

Expand All @@ -29,10 +29,9 @@ static void transform128(__m128i *lmid, __m128i *hmid)
t2 = indices[j + 1];
alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
delta = (alpha | (~gamma)) & (lfrom[t2] ^ beta);
lto[j] = ~delta;
hto[j] = (alpha ^ gamma) | delta;
delta = beta ^ lfrom[t2];
lto[j] = ~(delta & alpha);
hto[j] = delta | (alpha ^ hfrom[t2]);
}
__m128i *lswap = lfrom, *hswap = hfrom;
lfrom = lto;
Expand All @@ -45,10 +44,9 @@ static void transform128(__m128i *lmid, __m128i *hmid)
t2 = indices[j + 1];
alpha = lfrom[t1];
beta = hfrom[t1];
gamma = hfrom[t2];
delta = (alpha | (~gamma)) & (lfrom[t2] ^ beta);
lto[j] = ~delta;
hto[j] = (alpha ^ gamma) | delta;
delta = beta ^ lfrom[t2];
lto[j] = ~(delta & alpha);
hto[j] = delta | (alpha ^ hfrom[t2]);
}
}

Expand Down

0 comments on commit 3d6fa34

Please sign in to comment.