Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Different AVX versions supported #34

Merged
merged 1 commit into from
Mar 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 204 additions & 0 deletions src/pow_avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ static const int indices[] = {
12, 376, 11, 375, 10, 374, 9, 373, 8, 372, 7, 371, 6, 370, 5,
369, 4, 368, 3, 367, 2, 366, 1, 365, 0};

#ifdef __AVX2__
static void transform256(__m256i *lmid, __m256i *hmid)
{
__m256i one = _mm256_set_epi64x(HBITS, HBITS, HBITS, HBITS);
Expand Down Expand Up @@ -258,6 +259,209 @@ static long long int pwork256(char mid[], int mwm, char nonce[], int n, int id)
incrN256(n, lmid, hmid);
return loop256(lmid, hmid, mwm, nonce, id);
}
#else
/*
 * Apply one substitution pass, producing `count` output trits.
 *
 * For every output position j the two source positions indices___[j] and
 * indices___[j + 1] are combined bitwise, so each of the 256 bit positions
 * of a vector is processed independently.
 * The source and destination buffers must not overlap.
 */
static void transform256_round(const __m256d *lfrom, const __m256d *hfrom,
                               __m256d *lto, __m256d *hto, int count)
{
    /* All-ones mask: andnot(x, one) == (~x & one) == ~x. */
    const __m256d one = _mm256_set_pd(HBITS, HBITS, HBITS, HBITS);
    int j;

    for (j = 0; j < count; j++) {
        const int t1 = indices___[j];
        const int t2 = indices___[j + 1];

        const __m256d alpha = lfrom[t1];
        const __m256d beta = hfrom[t1];
        const __m256d gamma = hfrom[t2];
        const __m256d ngamma = _mm256_andnot_pd(gamma, one); /* ~gamma */

        /* (alpha | ~gamma) & (lfrom[t2] ^ beta) */
        const __m256d delta = _mm256_and_pd(_mm256_or_pd(alpha, ngamma),
                                            _mm256_xor_pd(lfrom[t2], beta));

        lto[j] = _mm256_andnot_pd(delta, one);                      /* ~delta */
        hto[j] = _mm256_or_pd(_mm256_xor_pd(alpha, gamma), delta);  /* (alpha ^ gamma) | delta */
    }
}

/*
 * Run the 256-way bit-sliced transform on a state (AVX, non-AVX2 variant).
 * NOTE(review): presumably the Curl hash permutation - confirm.
 *
 * lmid / hmid: low and high bit planes of the state. Each buffer must hold
 *              2 * STATE_LENGTH vectors: the first STATE_LENGTH are the
 *              input, the second STATE_LENGTH are used as scratch space.
 *
 * 80 full passes ping-pong between the two halves of each buffer. Because 80
 * is even, the last full pass leaves its result in the first half. A final
 * partial pass then writes only HASH_LENGTH trits into the second half, i.e.
 * to lmid[STATE_LENGTH ..] and hmid[STATE_LENGTH ..], which is where callers
 * read the result.
 *
 * Declared static to match the AVX2 variant of the same function.
 */
static void transform256(__m256d *lmid, __m256d *hmid)
{
    __m256d *lfrom = lmid, *hfrom = hmid;
    __m256d *lto = lmid + STATE_LENGTH, *hto = hmid + STATE_LENGTH;
    int r;

    for (r = 0; r < 80; r++) {
        __m256d *lswap = lfrom, *hswap = hfrom;

        transform256_round(lfrom, hfrom, lto, hto, STATE_LENGTH);

        /* The freshly written half becomes the source of the next pass. */
        lfrom = lto;
        hfrom = hto;
        lto = lswap;
        hto = hswap;
    }

    /* Last pass: only the first HASH_LENGTH trits are needed. */
    transform256_round(lfrom, hfrom, lto, hto, HASH_LENGTH);
}

int incr256(__m256d *mid_low, __m256d *mid_high)
{
int i;
__m256d carry;
carry = _mm256_set_pd(LBITS, LBITS, LBITS, LBITS);
for (i = INCR_START; i < HASH_LENGTH && (i == INCR_START || carry[0]);
i++) {
__m256d low = mid_low[i], high = mid_high[i];
mid_low[i] = _mm256_xor_pd(high, low);
mid_high[i] = low;
carry = _mm256_andnot_pd(low, high); // high & (~low);
}
return i == HASH_LENGTH;
}

void seri256(__m256d *low, __m256d *high, int n, char *r)
{
int i = 0, index = 0;
if (n > 63 && n < 128) {
n -= 64;
index = 1;
}
if (n >= 128 && n < 192) {
n -= 128;
index = 2;
}
if (n >= 192 && n < 256) {
n -= 192;
index = 3;
}
for (i = HASH_LENGTH - NONCE_LENGTH; i < HASH_LENGTH; i++) {
long long l = ((dl) low[i][index]).l;
long long h = ((dl) high[i][index]).l;
long ll = (l >> n) & 1;
long hh = (h >> n) & 1;
if (hh == 0 && ll == 1) {
r[i + NONCE_LENGTH - HASH_LENGTH] = -1;
}
if (hh == 1 && ll == 1) {
r[i + NONCE_LENGTH - HASH_LENGTH] = 0;
}
if (hh == 1 && ll == 0) {
r[i + NONCE_LENGTH - HASH_LENGTH] = 1;
}
}
}

int check256(__m256d *l, __m256d *h, int m)
{
int i, j; // omit init for speed

__m256d nonce_probe = _mm256_set_pd(HBITS, HBITS, HBITS, HBITS);
for (i = HASH_LENGTH - m; i < HASH_LENGTH; i++) {
nonce_probe = _mm256_andnot_pd(_mm256_xor_pd(l[i], h[i]),
nonce_probe); //&= ~(l[i] ^ h[i]);
if (nonce_probe[0] == LBITS && nonce_probe[1] == LBITS &&
nonce_probe[2] == LBITS && nonce_probe[3] == LBITS) {
return -1;
}
}
for (j = 0; j < 4; j++) {
for (i = 0; i < 64; i++) {
long long np = ((dl) nonce_probe[j]).l;
if ((np >> i) & 1) {
return i + j * 64;
}
}
}
return -2;
}

/*
 * Broadcast a trit array into bit-sliced form: every one of the 256 bit
 * positions of l[i] / h[i] receives the encoding of in[i].
 *
 *   in[i] ==  0  ->  l = all ones,  h = all ones
 *   in[i] ==  1  ->  l = all zeros, h = all ones
 *   in[i] == -1  ->  l = all ones,  h = all zeros
 *
 * Any other input value leaves l[i] and h[i] unmodified.
 * Exactly STATE_LENGTH entries are processed.
 */
void para256(char in[], __m256d l[], __m256d h[])
{
    const __m256d ones = _mm256_set_pd(HBITS, HBITS, HBITS, HBITS);
    const __m256d zeros = _mm256_set_pd(LBITS, LBITS, LBITS, LBITS);
    int k;

    for (k = 0; k < STATE_LENGTH; k++) {
        const char trit = in[k];

        if (trit == 0) {
            l[k] = ones;
            h[k] = ones;
        } else if (trit == 1) {
            l[k] = zeros;
            h[k] = ones;
        } else if (trit == -1) {
            l[k] = ones;
            h[k] = zeros;
        }
    }
}

void incrN256(int n, __m256d *mid_low, __m256d *mid_high)
{
int i, j;
for (j = 0; j < n; j++) {
__m256d carry;
carry = _mm256_set_pd(HBITS, HBITS, HBITS, HBITS);
for (i = HASH_LENGTH * 2 / 3 + 4;
i < HASH_LENGTH * 2 / 3 + 4 + 27 && carry[0]; i++) {
__m256d low = mid_low[i], high = mid_high[i];
mid_low[i] = _mm256_xor_pd(high, low);
mid_high[i] = low;
carry = _mm256_andnot_pd(low, high); // high & (~low);
}
}
}

/*
 * Search loop: repeatedly increment the state, transform a copy of it, and
 * test the result, until a candidate passes check256(), incr256() reports
 * that the counter wrapped, or stopAVX[id] becomes non-zero.
 *
 * lmid / hmid: low and high bit planes of the state (advanced in place).
 * m:           number of trailing trits passed to check256().
 * nonce:       receives the nonce trits of the winning candidate.
 * id:          index into stopAVX[] of this worker's stop flag.
 *
 * Returns rounds * 256 on success, where `rounds` is the number of completed
 * unsuccessful iterations, or -(rounds * 256) - 1 when the loop ends without
 * a result.
 */
int loop256(__m256d *lmid, __m256d *hmid, int m, char *nonce, int id)
{
    __m256d lcpy[STATE_LENGTH * 2], hcpy[STATE_LENGTH * 2];
    int rounds = 0;

    while (!incr256(lmid, hmid) && !stopAVX[id]) {
        int k;
        int found;

        /* transform256() works in place, so run it on a scratch copy. */
        for (k = 0; k < STATE_LENGTH; k++) {
            lcpy[k] = lmid[k];
            hcpy[k] = hmid[k];
        }
        transform256(lcpy, hcpy);

        /* The transform leaves its result in the second half of the copy. */
        found = check256(lcpy + STATE_LENGTH, hcpy + STATE_LENGTH, m);
        if (found >= 0) {
            seri256(lmid, hmid, found, nonce);
            return rounds * 256;
        }
        rounds++;
    }
    return -rounds * 256 - 1;
}

/*
 * Entry point of one 256-way search (AVX, non-AVX2 variant).
 *
 * mid:   STATE_LENGTH input trits, expanded into bit-sliced form by
 *        para256().
 * mwm:   number of trailing trits forwarded to loop256() / check256().
 * nonce: output buffer filled by loop256() on success.
 * n:     number of initial increments applied through incrN256().
 * id:    index of this worker's stop flag, forwarded to loop256().
 *
 * The six trits starting at HASH_LENGTH - NONCE_LENGTH are overwritten with
 * the LOWxy / HIGHxy constants, which differ between the four lanes.
 *
 * Returns the value of loop256().
 *
 * Declared static to match the AVX2 variant of the same function.
 */
static long long int pwork256(char mid[], int mwm, char nonce[], int n, int id)
{
    __m256d lmid[STATE_LENGTH], hmid[STATE_LENGTH];
    const int offset = HASH_LENGTH - NONCE_LENGTH;

    para256(mid, lmid, hmid);

    lmid[offset] = _mm256_set_pd(LOW00, LOW01, LOW02, LOW03);
    hmid[offset] = _mm256_set_pd(HIGH00, HIGH01, HIGH02, HIGH03);
    lmid[offset + 1] = _mm256_set_pd(LOW10, LOW11, LOW12, LOW13);
    hmid[offset + 1] = _mm256_set_pd(HIGH10, HIGH11, HIGH12, HIGH13);
    lmid[offset + 2] = _mm256_set_pd(LOW20, LOW21, LOW22, LOW23);
    hmid[offset + 2] = _mm256_set_pd(HIGH20, HIGH21, HIGH22, HIGH23);
    lmid[offset + 3] = _mm256_set_pd(LOW30, LOW31, LOW32, LOW33);
    hmid[offset + 3] = _mm256_set_pd(HIGH30, HIGH31, HIGH32, HIGH33);
    lmid[offset + 4] = _mm256_set_pd(LOW40, LOW41, LOW42, LOW43);
    hmid[offset + 4] = _mm256_set_pd(HIGH40, HIGH41, HIGH42, HIGH43);
    lmid[offset + 5] = _mm256_set_pd(LOW50, LOW51, LOW52, LOW53);
    hmid[offset + 5] = _mm256_set_pd(HIGH50, HIGH51, HIGH52, HIGH53);

    incrN256(n, lmid, hmid);
    return loop256(lmid, hmid, mwm, nonce, id);
}
#endif

static void *pworkThread(void *pitem)
{
Expand Down
65 changes: 56 additions & 9 deletions src/pow_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@ void pow_avx_destroy();
#include <x86intrin.h>
#endif

#define HBITS 0xFFFFFFFFFFFFFFFFuLL
#define LBITS 0x0000000000000000uLL
/*
 * Size constants shared by the search kernels.
 * Compound expressions are parenthesized so that they expand correctly next
 * to any operator (e.g. `x % STATE_LENGTH`, `-INCR_START`).
 */
#define HASH_LENGTH 243 //trits
#define STATE_LENGTH (3 * HASH_LENGTH) //trits
#define NONCE_LENGTH 81
#define TX_LENGTH 2673 //trytes
/* First trit index incremented by incr256(): 4 + 27 trits past the nonce start. */
#define INCR_START (HASH_LENGTH - NONCE_LENGTH + 4 + 27)


#ifdef __AVX2__
#define HBITS 0xFFFFFFFFFFFFFFFFuLL
#define LBITS 0x0000000000000000uLL
#define LOW00 0xDB6DB6DB6DB6DB6DuLL //0b1101101101101101101101101101101101101101101101101101101101101101
#define HIGH00 0xB6DB6DB6DB6DB6DBuLL //0b1011011011011011011011011011011011011011011011011011011011011011
#define LOW10 0xF1F8FC7E3F1F8FC7uLL //0b1111000111111000111111000111111000111111000111111000111111000111
Expand All @@ -46,7 +46,6 @@ void pow_avx_destroy();
#define HIGH40 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111
#define LOW50 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH50 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111

#define LOW01 0x6DB6DB6DB6DB6DB6uLL //0b0110110110110110110110110110110110110110110110110110110110110110
#define HIGH01 0xDB6DB6DB6DB6DB6DuLL //0b1101101101101101101101101101101101101101101101101101101101101101
#define LOW11 0xF8FC7E3F1F8FC7E3uLL //0b1111100011111100011111100011111100011111100011111100011111100011
Expand All @@ -59,7 +58,6 @@ void pow_avx_destroy();
#define HIGH41 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111
#define LOW51 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH51 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111

#define LOW02 0xB6DB6DB6DB6DB6DBuLL //0b1011011011011011011011011011011011011011011011011011011011011011
#define HIGH02 0x6DB6DB6DB6DB6DB6uLL //0b0110110110110110110110110110110110110110110110110110110110110110
#define LOW12 0xFC7E3F1F8FC7E3F1uLL //0b1111110001111110001111110001111110001111110001111110001111110001
Expand All @@ -72,7 +70,6 @@ void pow_avx_destroy();
#define HIGH42 0x00000003FFFFFFFFuLL //0b0000000000000000000000000000001111111111111111111111111111111111
#define LOW52 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH52 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111

#define LOW03 0xDB6DB6DB6DB6DB6DuLL //0b1101101101101101101101101101101101101101101101101101101101101101
#define HIGH03 0xB6DB6DB6DB6DB6DBuLL //0b1011011011011011011011011011011011011011011011011011011011011011
#define LOW13 0x7E3F1F8FC7E3F1F8uLL //0b0111111000111111000111111000111111000111111000111111000111111000
Expand All @@ -85,7 +82,57 @@ void pow_avx_destroy();
#define HIGH43 0xFFF8000000000000uLL //0b1111111111111000000000000000000000000000000000000000000000000000
#define LOW53 0x0007FFFFFFFFFFFFuLL //0b0000000000000111111111111111111111111111111111111111111111111111
#define HIGH53 0xFFFFFFFFFFFFFFFFuLL //0b1111111111111111111111111111111111111111111111111111111111111111



#else
/*
 * AVX (non-AVX2) variant of the constants.
 *
 * The __m256d code paths carry 64-bit masks in double-typed lanes. `dl`
 * overlays a double with an unsigned long long so that a mask can be written
 * as an integer literal and read back as a double (member d), or the other
 * way round (member l).
 * `(dl) <integer>` is a cast to a union type, which is a GNU C extension.
 */
typedef union {double d; unsigned long long l;} dl;
/* All-ones and all-zeros 64-bit masks, viewed as doubles. */
#define HBITS ( ( (dl) 0xFFFFFFFFFFFFFFFFuLL ).d )
#define LBITS ( ( (dl) 0x0000000000000000uLL ).d )
/*
 * LOWxy / HIGHxy: low and high bit planes used to seed the state.
 * x (0..5) is the trit position relative to HASH_LENGTH - NONCE_LENGTH and
 * y (0..3) is the argument position in the _mm256_set_pd() call that builds
 * the vector. The trailing comment on each line is the same value in binary.
 */
#define LOW00 ( ( (dl)0xDB6DB6DB6DB6DB6DuLL ).d ) //0b1101101101101101101101101101101101101101101101101101101101101101
#define HIGH00 ( ( (dl)0xB6DB6DB6DB6DB6DBuLL ).d ) //0b1011011011011011011011011011011011011011011011011011011011011011
#define LOW10 ( ( (dl)0xF1F8FC7E3F1F8FC7uLL ).d ) //0b1111000111111000111111000111111000111111000111111000111111000111
#define HIGH10 ( ( (dl)0x8FC7E3F1F8FC7E3FuLL ).d ) //0b1000111111000111111000111111000111111000111111000111111000111111
#define LOW20 ( ( (dl)0x7FFFE00FFFFC01FFuLL ).d ) //0b0111111111111111111000000000111111111111111111000000000111111111
#define HIGH20 ( ( (dl)0xFFC01FFFF803FFFFuLL ).d ) //0b1111111111000000000111111111111111111000000000111111111111111111
#define LOW30 ( ( (dl)0xFFC0000007FFFFFFuLL ).d ) //0b1111111111000000000000000000000000000111111111111111111111111111
#define HIGH30 ( ( (dl)0x003FFFFFFFFFFFFFuLL ).d ) //0b0000000000111111111111111111111111111111111111111111111111111111
#define LOW40 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH40 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define LOW50 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH50 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define LOW01 ( ( (dl)0x6DB6DB6DB6DB6DB6uLL ).d ) //0b0110110110110110110110110110110110110110110110110110110110110110
#define HIGH01 ( ( (dl)0xDB6DB6DB6DB6DB6DuLL ).d ) //0b1101101101101101101101101101101101101101101101101101101101101101
#define LOW11 ( ( (dl)0xF8FC7E3F1F8FC7E3uLL ).d ) //0b1111100011111100011111100011111100011111100011111100011111100011
#define HIGH11 ( ( (dl)0xC7E3F1F8FC7E3F1FuLL ).d ) //0b1100011111100011111100011111100011111100011111100011111100011111
#define LOW21 ( ( (dl)0xC01FFFF803FFFF00uLL ).d ) //0b1100000000011111111111111111100000000011111111111111111100000000
#define HIGH21 ( ( (dl)0x3FFFF007FFFE00FFuLL ).d ) //0b0011111111111111111100000000011111111111111111100000000011111111
#define LOW31 ( ( (dl)0x00000FFFFFFFFFFFuLL ).d ) //0b0000000000000000000011111111111111111111111111111111111111111111
#define HIGH31 ( ( (dl)0xFFFFFFFFFFFE0000uLL ).d ) //0b1111111111111111111111111111111111111111111111100000000000000000
#define LOW41 ( ( (dl)0x000000000001FFFFuLL ).d ) //0b0000000000000000000000000000000000000000000000011111111111111111
#define HIGH41 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define LOW51 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH51 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define LOW02 ( ( (dl)0xB6DB6DB6DB6DB6DBuLL ).d ) //0b1011011011011011011011011011011011011011011011011011011011011011
#define HIGH02 ( ( (dl)0x6DB6DB6DB6DB6DB6uLL ).d ) //0b0110110110110110110110110110110110110110110110110110110110110110
#define LOW12 ( ( (dl)0xFC7E3F1F8FC7E3F1uLL ).d ) //0b1111110001111110001111110001111110001111110001111110001111110001
#define HIGH12 ( ( (dl)0xE3F1F8FC7E3F1F8FuLL ).d ) //0b1110001111110001111110001111110001111110001111110001111110001111
#define LOW22 ( ( (dl)0xFFF007FFFE00FFFFuLL ).d ) //0b1111111111110000000001111111111111111110000000001111111111111111
#define HIGH22 ( ( (dl)0xE00FFFFC01FFFF80uLL ).d ) //0b1110000000001111111111111111110000000001111111111111111110000000
#define LOW32 ( ( (dl)0x1FFFFFFFFFFFFF80uLL ).d ) //0b0001111111111111111111111111111111111111111111111111111110000000
#define HIGH32 ( ( (dl)0xFFFFFFFC0000007FuLL ).d ) //0b1111111111111111111111111111110000000000000000000000000001111111
#define LOW42 ( ( (dl)0xFFFFFFFC00000000uLL ).d ) //0b1111111111111111111111111111110000000000000000000000000000000000
#define HIGH42 ( ( (dl)0x00000003FFFFFFFFuLL ).d ) //0b0000000000000000000000000000001111111111111111111111111111111111
#define LOW52 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH52 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define LOW03 ( ( (dl)0xDB6DB6DB6DB6DB6DuLL ).d ) //0b1101101101101101101101101101101101101101101101101101101101101101
#define HIGH03 ( ( (dl)0xB6DB6DB6DB6DB6DBuLL ).d ) //0b1011011011011011011011011011011011011011011011011011011011011011
#define LOW13 ( ( (dl)0x7E3F1F8FC7E3F1F8uLL ).d ) //0b0111111000111111000111111000111111000111111000111111000111111000
#define HIGH13 ( ( (dl)0xF1F8FC7E3F1F8FC7uLL ).d ) //0b1111000111111000111111000111111000111111000111111000111111000111
#define LOW23 ( ( (dl)0x0FFFFC01FFFF803FuLL ).d ) //0b0000111111111111111111000000000111111111111111111000000000111111
#define HIGH23 ( ( (dl)0xFFF803FFFF007FFFuLL ).d ) //0b1111111111111000000000111111111111111111000000000111111111111111
#define LOW33 ( ( (dl)0xFFFFFFFFFF000000uLL ).d ) //0b1111111111111111111111111111111111111111000000000000000000000000
#define HIGH33 ( ( (dl)0xFFF8000000FFFFFFuLL ).d ) //0b1111111111111000000000000000000000000000111111111111111111111111
#define LOW43 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#define HIGH43 ( ( (dl)0xFFF8000000000000uLL ).d ) //0b1111111111111000000000000000000000000000000000000000000000000000
#define LOW53 ( ( (dl)0x0007FFFFFFFFFFFFuLL ).d ) //0b0000000000000111111111111111111111111111111111111111111111111111
#define HIGH53 ( ( (dl)0xFFFFFFFFFFFFFFFFuLL ).d ) //0b1111111111111111111111111111111111111111111111111111111111111111
#endif
#endif