diff --git a/erts/emulator/beam/big.c b/erts/emulator/beam/big.c index 3c5a2bed4be0..06dc9b02200c 100644 --- a/erts/emulator/beam/big.c +++ b/erts/emulator/beam/big.c @@ -52,18 +52,6 @@ } \ } while(0) -/* add a and b with carry in + out */ -#define DSUMc(a,b,c,s) do { \ - ErtsDigit ___cr = (c); \ - ErtsDigit ___xr = (a)+(___cr); \ - ErtsDigit ___yr = (b); \ - ___cr = (___xr < ___cr); \ - ___xr = ___yr + ___xr; \ - ___cr += (___xr < ___yr); \ - s = ___xr; \ - c = ___cr; \ - } while(0) - /* add a and b with carry out */ #define DSUM(a,b,c,s) do { \ ErtsDigit ___xr = (a); \ @@ -136,6 +124,13 @@ r = _t % (b); \ } while(0) +/* add a and b with carry in + out */ +#define DSUMc(a,b,c,s) do { \ + ErtsDoubleDigit _t = (ErtsDoubleDigit)(a) + (b) + (c); \ + s = DLOW(_t); \ + c = DHIGH(_t); \ + } while(0) + #else /* If we do not have double digit then we have some more work to do */ @@ -422,6 +417,18 @@ D2DIVREM(a1,a0,b1,b0,q,_tmp_r1,_tmp_r0); \ } while(0) +/* add a and b with carry in + out */ +#define DSUMc(a,b,c,s) do { \ + ErtsDigit ___cr = (c); \ + ErtsDigit ___xr = (a)+(___cr); \ + ErtsDigit ___yr = (b); \ + ___cr = (___xr < ___cr); \ + ___xr = ___yr + ___xr; \ + ___cr += (___xr < ___yr); \ + s = ___xr; \ + c = ___cr; \ + } while(0) + #endif /* Forward declaration of lookup tables (See below in this file) used in list to @@ -487,12 +494,10 @@ static dsize_t I_add(ErtsDigit* x, dsize_t xl, ErtsDigit* y, dsize_t yl, ErtsDig xl -= yl; do { - xr = *x++ + c; - yr = *y++; - c = (xr < c); - xr = yr + xr; - c += (xr < yr); - *r++ = xr; + xr = *x++; + yr = *y++; + DSUMc(xr, yr, c, xr); + *r++ = xr; } while(--yl); while(xl--) { @@ -687,44 +692,53 @@ static dsize_t I_sqr(ErtsDigit* x, dsize_t xl, ErtsDigit* r) *x = 0; while(xl--) { - ErtsDigit* y; - ErtsDigit y_0 = 0, y_1 = 0, y_2 = 0, y_3 = 0; - ErtsDigit b0, b1; - ErtsDigit z0, z1, z2; - ErtsDigit t; dsize_t y_l = xl; - d = *x; - x++; - y = x; - s = r; - - DMUL(d, d, b1, b0); - DSUMc(*s, b0, y_3, t); - *s++ = t; - z1 = b1; - 
while(y_l--) { - DMUL(d, *y, b1, b0); - y++; - DSUMc(b0, b0, y_0, z0); - DSUMc(z0, z1, y_2, z2); - DSUMc(*s, z2, y_3, t); - *s++ = t; - DSUMc(b1, b1, y_1, z1); - } - z0 = y_0; - DSUMc(z0, z1, y_2, z2); - DSUMc(*s, z2, y_3, t); - *s = t; - if (xl != 0) { - s++; - t = (y_1+y_2+y_3); - *s = t; - r += 2; - } - else { - ASSERT((y_1+y_2+y_3) == 0); - } + d = *x++; + s = r; + + if (d == 0) { + s += y_l + 1; + if (xl != 0) { + *++s = 0; + r += 2; + } + } else { + ErtsDigit* y; + ErtsDigit y_0 = 0, y_1 = 0, y_2 = 0, y_3 = 0; + ErtsDigit b0, b1; + ErtsDigit z0, z1, z2; + ErtsDigit t; + + y = x; + + DMUL(d, d, b1, b0); + DSUMc(*s, b0, y_3, t); + *s++ = t; + z1 = b1; + while(y_l--) { + DMUL(d, *y, b1, b0); + y++; + DSUMc(b0, b0, y_0, z0); + DSUMc(z0, z1, y_2, z2); + DSUMc(*s, z2, y_3, t); + *s++ = t; + DSUMc(b1, b1, y_1, z1); + } + z0 = y_0; + DSUMc(z0, z1, y_2, z2); + DSUMc(*s, z2, y_3, t); + *s = t; + if (xl != 0) { + s++; + t = (y_1+y_2+y_3); + *s = t; + r += 2; + } + else { + ASSERT((y_1+y_2+y_3) == 0); + } + } } if (*s == 0) return (s - r0); @@ -744,7 +758,7 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y, if (yl < 16) { /* Use the basic algorithm. */ - if (x == y) { + if (x == y && xl > 1) { ASSERT(xl == yl); return I_sqr(x, xl, r); } else { @@ -754,22 +768,27 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y, /* Use the Karatsuba algorithm. 
*/ Eterm *heap; Uint temp_heap_size; - Uint z0_len, z1_len, z2_len, sum0_len, sum1_len, res_len; + Uint z0_len, z1_len, z2_len, tmp_len, diff0_len, diff1_len, res_len; Uint low_x_len, low_y_len, high_x_len, high_y_len; - Eterm *z0_buf, *z1_buf, *z2_buf, *z_res_buf; - Eterm *sum0_buf, *sum1_buf; + Eterm *z0_buf, *z1_buf, *z2_buf, *tmp_buf; + Eterm *diff0_buf, *diff1_buf; #ifdef DEBUG - Eterm *sum_buf_end, *z_buf_end; + Eterm *alloc_end; #endif Eterm *low_x, *low_y, *high_x, *high_y; ErtsDigit zero = 0; Uint m = (xl+1) / 2; + int tmp_prod_negative = 0; + int i; /* Set up pointers and sizes. */ low_x = x; low_x_len = m; high_x = x + m; high_x_len = xl - m; + while (low_x_len > 1 && low_x[low_x_len-1] == 0) { + low_x_len--; + } low_y = y; if (yl <= m) { @@ -782,45 +801,49 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y, high_y = y + m; high_y_len = yl - m; } + while (low_y_len > 1 && low_y[low_y_len-1] == 0) { + low_y_len--; + } ASSERT(low_x_len <= m); ASSERT(high_x_len <= m); ASSERT(low_y_len <= m); ASSERT(high_y_len <= m); - /* Set up buffers for the sums for z1 in the result area. */ - sum0_buf = r; - sum1_buf = r + m + 1; - + /* + * Set up temporary buffers in allocated memory. + * + * z1_buf is not used at the same time as diff0_buf + * and diff1_buf, so they can share memory. + */ + temp_heap_size = (4*m + 1) * sizeof(Eterm); #ifdef DEBUG - sum_buf_end = sum1_buf + m + 1; - ASSERT(sum_buf_end - sum0_buf + 1 <= xl + yl); - sum1_buf[0] = ERTS_HOLE_MARKER; - sum_buf_end[0] = ERTS_HOLE_MARKER; + temp_heap_size += sizeof(Eterm); #endif - - /* Set up temporary buffers in the allocated memory. 
*/ - temp_heap_size = (3*(2*m+2) + (xl+yl+1) + 1) * sizeof(Eterm); heap = (Eterm *) erts_alloc(ERTS_ALC_T_TMP, temp_heap_size); - z0_buf = heap; - z1_buf = z0_buf + 2*m + 2; - z2_buf = z1_buf + 2*m + 2; - z_res_buf = z2_buf + 2*m + 2; -#ifdef DEBUG - z_buf_end = z_res_buf + xl+yl+1; -#endif + z1_buf = heap; + diff0_buf = z1_buf + 1; + diff1_buf = diff0_buf + m; + tmp_buf = diff1_buf + m; #ifdef DEBUG z1_buf[0] = ERTS_HOLE_MARKER; - z2_buf[0] = ERTS_HOLE_MARKER; - z_res_buf[0] = ERTS_HOLE_MARKER; - z_buf_end[0] = ERTS_HOLE_MARKER; + diff0_buf[0] = ERTS_HOLE_MARKER; + diff1_buf[0] = ERTS_HOLE_MARKER; + tmp_buf[0] = ERTS_HOLE_MARKER; + + alloc_end = tmp_buf + 2*m; + alloc_end[0] = ERTS_HOLE_MARKER; + ASSERT(alloc_end - heap + 1 == temp_heap_size / sizeof(Eterm)); #endif - /* z0 = low_x * low_y */ - z0_len = I_mul_karatsuba(low_x, low_x_len, low_y, low_y_len, z0_buf); + /* Set up pointers for the result. */ + z0_buf = r; + z2_buf = r + 2*m; - ASSERT(z1_buf[0] == ERTS_HOLE_MARKER); +#ifdef DEBUG + z2_buf[0] = ERTS_HOLE_MARKER; +#endif #define I_OPERATION(_result, _op, _p1, _sz1, _p2, _sz2, _buf) \ do { \ @@ -832,73 +855,154 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y, } while (0) /* - * z1 = (low1 + high1) * (low2 + high2) + * The Karatsuba algorithm is a divide and conquer algorithm + * for multi-word integer multiplication. 
The numbers to be + * multiplied are split up like this: + * + * high low + * +--------+--------+ + * | high_x | low_x | + * +--------+--------+ + * + * +--------+--------+ + * | high_y | low_y | + * +--------+--------+ + * + * Then the following values are calculated: + * + * z0 = low_x * low_y + * z2 = high_x * high_y + * z1 = (low_x - high_x) * (high_y - low_y) + z2 + z0 + * + * Note that this expression for z1 produces the same result + * as: + * + * low_x * high_y + high_x * low_y + * + * Finally, the z2, z1, z0 values are combined to form the + * product of x and y: + * + * high low + * +--+--+ +--+--+ + * | z2 | | z0 | + * +--+--+ +--+--+ + * +--+--+ + * add | z1 | + * +--+--+ + * + * There is an alternate way to calculate z1 (commonly found + * in descriptions of the Karatsuba algorithm); + * + * z1 = (high_x + low_x) * (high_y + low_y) - z2 - z0 + * + * But this way can lead to more additions and carry handling. */ - I_OPERATION(sum0_len, I_add, low_x, low_x_len, high_x, high_x_len, sum0_buf); - ASSERT(sum1_buf[0] == ERTS_HOLE_MARKER); - I_OPERATION(sum1_len, I_add, low_y, low_y_len, high_y, high_y_len, sum1_buf); - ASSERT(sum_buf_end[0] == ERTS_HOLE_MARKER); - - I_OPERATION(z1_len, I_mul_karatsuba, sum0_buf, sum0_len, sum1_buf, sum1_len, z1_buf); + /* + * z0 = low_x * low_y + * + * Store this product in its final location in the result buffer. + */ + I_OPERATION(z0_len, I_mul_karatsuba, low_x, low_x_len, low_y, low_y_len, z0_buf); ASSERT(z2_buf[0] == ERTS_HOLE_MARKER); + for (i = z0_len; i < 2*m; i++) { + z0_buf[i] = 0; + } + while (z0_len > 1 && z0_buf[z0_len - 1] == 0) { + z0_len--; + } + ASSERT(z0_len == 1 || z0_buf[z0_len-1] != 0); + ASSERT(z0_len <= low_x_len + low_y_len); /* * z2 = high_x * high_y + * + * Store this product in its final location in the result buffer. 
*/ - if (high_y != &zero) { - I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len, high_y, high_y_len, z2_buf); + I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len, + high_y, high_y_len, z2_buf); + while (z2_len > 1 && z2_buf[z2_len - 1] == 0) { + z2_len--; + } + ASSERT(z2_len == 1 || z2_buf[z2_len-1] != 0); } else { z2_buf[0] = 0; z2_len = 1; } - ASSERT(z_res_buf[0] == ERTS_HOLE_MARKER); + ASSERT(z2_len <= high_x_len + high_y_len); /* - * z0 + (z1 × base ^ m) + (z2 × base ^ (m × 2)) - ((z0 + z2) × base ^ m) + * tmp = abs(low_x - high_x) * abs(high_y - low_y) + * + * The absolute value of each difference will fit in m words. * - * Note that the result of expression before normalization is - * not guaranteed to fit in the result buffer provided by the - * caller (r). Therefore, we must use a temporary buffer when - * calculating it. + * Save the sign of the product so that we later can choose to + * subtract or add this value. */ - - /* Copy z0 to temporary result buffer. */ - res_len = I_add(z0_buf, z0_len, &zero, 1, z_res_buf); - - while (res_len <= m) { - z_res_buf[res_len++] = 0; + if (I_comp(low_x, low_x_len, high_x, high_x_len) >= 0) { + diff0_len = I_sub(low_x, low_x_len, high_x, high_x_len, diff0_buf); + } else { + tmp_prod_negative = !tmp_prod_negative; + diff0_len = I_sub(high_x, high_x_len, low_x, low_x_len, diff0_buf); } + ASSERT(diff1_buf[0] == ERTS_HOLE_MARKER); + ASSERT(diff0_len == 1 || diff0_buf[diff0_len-1] != 0); + ASSERT(diff0_len <= m); - /* Add z1 × base ^ m */ - I_OPERATION(res_len, I_add, z_res_buf+m, res_len-m, z1_buf, z1_len, z_res_buf+m); - - while (res_len <= m) { - z_res_buf[m+res_len++] = 0; + if (x == y) { + ASSERT(xl == yl); + tmp_prod_negative = 1; + diff1_buf = diff0_buf; + diff1_len = diff0_len; + } else if (I_comp(high_y, high_y_len, low_y, low_y_len) >= 0) { + diff1_len = I_sub(high_y, high_y_len, low_y, low_y_len, diff1_buf); + } else { + tmp_prod_negative = !tmp_prod_negative; + if (high_y != &zero) { + diff1_len = 
I_sub(low_y, low_y_len, high_y, high_y_len, diff1_buf); + } else { + diff1_buf = low_y; + diff1_len = low_y_len; + } } + ASSERT(tmp_buf[0] == ERTS_HOLE_MARKER); + ASSERT(diff1_len == 1 || diff1_buf[diff1_len-1] != 0); + ASSERT(diff1_len <= m); + + I_OPERATION(tmp_len, I_mul_karatsuba, diff0_buf, diff0_len, diff1_buf, diff1_len, tmp_buf); + ASSERT(alloc_end[0] == ERTS_HOLE_MARKER); + while (tmp_len > 1 && tmp_buf[tmp_len - 1] == 0) { + tmp_len--; + } + ASSERT(tmp_len == 1 || tmp_buf[tmp_len-1] != 0); + ASSERT(tmp_len <= diff0_len + diff1_len); - /* Add z2 × base ^ (m × 2) */ - I_OPERATION(res_len, I_add, z_res_buf+2*m, res_len-m, z2_buf, z2_len, z_res_buf+2*m); - - /* Calculate z0 + z2 */ - I_OPERATION(z0_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z0_buf); + /* + * z1 = z0 + z2 + */ + I_OPERATION(z1_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z1_buf); + ASSERT(z1_len == 1 || z1_buf[z1_len-1] != 0); - /* Subtract (z0 + z2) × base ^ m */ - res_len = I_sub(z_res_buf+m, res_len+m, z0_buf, z0_len, z_res_buf+m); + if (tmp_prod_negative) { + /* z1 = z1 - tmp */ + z1_len = I_sub(z1_buf, z1_len, tmp_buf, tmp_len, z1_buf); + } else { + /* z1 = z1 + tmp */ + I_OPERATION(z1_len, I_add, z1_buf, z1_len, tmp_buf, tmp_len, z1_buf); + } - ASSERT(z_buf_end[0] == ERTS_HOLE_MARKER); + /* Add z1 shifted into the result */ + I_OPERATION(res_len, I_add, z0_buf+m, z2_len+m, z1_buf, z1_len, z0_buf+m); /* Normalize */ - while (z_res_buf[m + res_len - 1] == 0 && res_len > 0) { + res_len += m; + while (res_len > 1 && r[res_len - 1] == 0) { res_len--; } - res_len += m; + ASSERT(res_len == 1 || r[res_len-1] != 0); ASSERT(res_len <= xl + yl); - /* Copy result to the the final result buffer. 
*/ - (void) I_add(z_res_buf, res_len, &zero, 1, r); - erts_free(ERTS_ALC_T_TMP, (void *) heap); return res_len; } @@ -2560,6 +2664,36 @@ Eterm big_times(Eterm x, Eterm y, Eterm *r) return big_norm(r, rsz, sign); } +/* +** Fused multiplication and addition of bignums +*/ + +Eterm big_mul_add(Eterm x, Eterm y, Eterm z, Eterm *r) +{ + Eterm* xp = big_val(x); + Eterm* yp = big_val(y); + Eterm* zp = big_val(z); + + short sign = BIG_SIGN(xp) != BIG_SIGN(yp); + dsize_t xsz = BIG_SIZE(xp); + dsize_t ysz = BIG_SIZE(yp); + dsize_t rsz; + + if (ysz == 1) + rsz = D_mul(BIG_V(xp), xsz, BIG_DIGIT(yp, 0), BIG_V(r)); + else if (xsz == 1) + rsz = D_mul(BIG_V(yp), ysz, BIG_DIGIT(xp, 0), BIG_V(r)); + else if (xsz >= ysz) { + rsz = I_mul_karatsuba(BIG_V(xp), xsz, BIG_V(yp), ysz, BIG_V(r)); + } + else { + rsz = I_mul_karatsuba(BIG_V(yp), ysz, BIG_V(xp), xsz, BIG_V(r)); + } + return B_plus_minus(BIG_V(r), rsz, sign, + BIG_V(zp), BIG_SIZE(zp), (short) BIG_SIGN(zp), + r); +} + /* ** Fused div_rem for bignums */ diff --git a/erts/emulator/beam/big.h b/erts/emulator/beam/big.h index ceb35a84b83f..b705421ca907 100644 --- a/erts/emulator/beam/big.h +++ b/erts/emulator/beam/big.h @@ -135,6 +135,7 @@ Eterm small_times(Sint, Sint, Eterm*); Eterm big_plus(Wterm, Wterm, Eterm*); Eterm big_minus(Eterm, Eterm, Eterm*); Eterm big_times(Eterm, Eterm, Eterm*); +Eterm big_mul_add(Eterm x, Eterm y, Eterm z, Eterm *r); int big_div_rem(Eterm lhs, Eterm rhs, Eterm *q_hp, Eterm *q, diff --git a/erts/emulator/beam/erl_arith.c b/erts/emulator/beam/erl_arith.c index 3e7f023e5ada..88223778daf0 100644 --- a/erts/emulator/beam/erl_arith.c +++ b/erts/emulator/beam/erl_arith.c @@ -867,6 +867,98 @@ erts_mixed_times(Process* p, Eterm arg1, Eterm arg2) } } +Eterm +erts_mul_add(Process* p, Eterm arg1, Eterm arg2, Eterm arg3, Eterm* pp) +{ + Eterm tmp_big1[2]; + Eterm tmp_big2[2]; + Eterm tmp_big3[2]; + Eterm hdr; + Eterm res; + Eterm big_arg1, big_arg2, big_arg3; + dsize_t sz1, sz2, sz3, sz; + int need_heap; + Eterm* 
hp; + Eterm product; + + big_arg1 = arg1; + big_arg2 = arg2; + big_arg3 = arg3; + switch (big_arg1 & _TAG_PRIMARY_MASK) { + case TAG_PRIMARY_IMMED1: + if (is_not_small(big_arg1)) { + break; + } + big_arg1 = small_to_big(signed_val(big_arg1), tmp_big1); + /* Fall through */ + case TAG_PRIMARY_BOXED: + hdr = *boxed_val(big_arg1); + switch ((hdr & _TAG_HEADER_MASK) >> _TAG_PRIMARY_SIZE) { + case (_TAG_HEADER_POS_BIG >> _TAG_PRIMARY_SIZE): + case (_TAG_HEADER_NEG_BIG >> _TAG_PRIMARY_SIZE): + switch (big_arg2 & _TAG_PRIMARY_MASK) { + case TAG_PRIMARY_IMMED1: + if (is_not_small(big_arg2)) { + break; + } + big_arg2 = small_to_big(signed_val(big_arg2), tmp_big2); + /* Fall through */ + case TAG_PRIMARY_BOXED: + hdr = *boxed_val(big_arg2); + switch ((hdr & _TAG_HEADER_MASK) >> _TAG_PRIMARY_SIZE) { + case (_TAG_HEADER_POS_BIG >> _TAG_PRIMARY_SIZE): + case (_TAG_HEADER_NEG_BIG >> _TAG_PRIMARY_SIZE): + switch (big_arg3 & _TAG_PRIMARY_MASK) { + case TAG_PRIMARY_IMMED1: + if (is_not_small(big_arg3)) { + break; + } + big_arg3 = small_to_big(signed_val(big_arg3), tmp_big3); + /* Fall through */ + case TAG_PRIMARY_BOXED: + hdr = *boxed_val(big_arg3); + switch ((hdr & _TAG_HEADER_MASK) >> _TAG_PRIMARY_SIZE) { + case (_TAG_HEADER_POS_BIG >> _TAG_PRIMARY_SIZE): + case (_TAG_HEADER_NEG_BIG >> _TAG_PRIMARY_SIZE): + sz1 = big_size(big_arg1); + sz2 = big_size(big_arg2); + sz3 = big_size(big_arg3); + sz = sz1 + sz2; + sz = MAX(sz, sz3) + 1; + need_heap = BIG_NEED_SIZE(sz); +#ifdef DEBUG + need_heap++; +#endif + hp = HeapFragOnlyAlloc(p, need_heap); + +#ifdef DEBUG + hp[need_heap-1] = ERTS_HOLE_MARKER; +#endif + res = big_mul_add(big_arg1, big_arg2, big_arg3, hp); + ASSERT(hp[need_heap-1] == ERTS_HOLE_MARKER); + maybe_shrink(p, hp, res, need_heap); + if (is_nil(res)) { + p->freason = SYSTEM_LIMIT; + return THE_NON_VALUE; + } + return res; + } + } + } + } + } + } + + /* At least one of the arguments is a float or invalid. 
*/ + product = erts_mixed_times(p, arg1, arg2); + *pp = product; + if (is_non_value(product)) { + return product; + } else { + return erts_mixed_plus(p, product, arg3); + } +} + Eterm erts_mixed_div(Process* p, Eterm arg1, Eterm arg2) { diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 75db8fe79217..7f8e54b949b1 100644 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1598,6 +1598,7 @@ Eterm erts_unary_minus(Process* p, Eterm arg1); Eterm erts_mixed_plus(Process* p, Eterm arg1, Eterm arg2); Eterm erts_mixed_minus(Process* p, Eterm arg1, Eterm arg2); Eterm erts_mixed_times(Process* p, Eterm arg1, Eterm arg2); +Eterm erts_mul_add(Process* p, Eterm arg1, Eterm arg2, Eterm arg3, Eterm* pp); Eterm erts_mixed_div(Process* p, Eterm arg1, Eterm arg2); int erts_int_div_rem(Process* p, Eterm arg1, Eterm arg2, Eterm *q, Eterm *r); diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index ab162f951a06..38f1c6875f07 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -1128,6 +1128,15 @@ class BeamModuleAssembler : public BeamAssembler, const a64::Gp rhs_reg, const Label next); + void emit_div_rem_literal(Sint divisor, + const ArgSource &Dividend, + arm::Gp dividend, + arm::Gp quotient, + arm::Gp remainder, + const Label &generic, + bool need_div, + bool need_rem); + void emit_div_rem(const ArgLabel &Fail, const ArgSource &LHS, const ArgSource &RHS, diff --git a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl index 59524b32c73b..93b239ddbdb7 100644 --- a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl +++ b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl @@ -92,11 +92,16 @@ i_loop_rec_shared i_test_yield_shared i_bxor_body_shared + int128_to_big_shared int_div_rem_body_shared int_div_rem_guard_shared is_in_range_shared is_ge_lt_shared minus_body_shared + mul_add_body_shared + 
mul_add_guard_shared + mul_body_shared + mul_guard_shared new_map_shared update_map_assoc_shared unloaded_fun @@ -106,8 +111,6 @@ raise_exception raise_exception_shared store_unaligned - times_body_shared - times_guard_shared unary_minus_body_shared update_map_exact_guard_shared update_map_exact_body_shared diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp index 485f93956d1e..dbdb05b86abe 100644 --- a/erts/emulator/beam/jit/arm/instr_arith.cpp +++ b/erts/emulator/beam/jit/arm/instr_arith.cpp @@ -82,9 +82,15 @@ void BeamModuleAssembler::emit_are_both_small(const ArgSource &LHS, a.and_(TMP1, lhs_reg, rhs_reg); emit_is_boxed(next, TMP1); } else { - ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); - a.and_(TMP1, lhs_reg, rhs_reg); - a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK)); + if (always_small(RHS)) { + a.and_(TMP1, lhs_reg, imm(_TAG_IMMED1_MASK)); + } else if (always_small(LHS)) { + a.and_(TMP1, rhs_reg, imm(_TAG_IMMED1_MASK)); + } else { + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.and_(TMP1, lhs_reg, rhs_reg); + a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK)); + } a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); a.b_eq(next); } @@ -376,45 +382,35 @@ void BeamModuleAssembler::emit_i_minus(const ArgLabel &Fail, mov_arg(Dst, ARG1); } -/* ARG2 = LHS - * ARG3 = RHS +/* + * Create a bignum from the 128-bit product of two smalls shifted + * left _TAG_IMMED1_SIZE bits. * - * The result is returned in ARG1 (set to THE_NON_VALUE if - * the call failed). + * ARG1 = low 64 bits + * TMP2 = high 64 bits + * + * The result is returned in ARG1. */ -void BeamGlobalAssembler::emit_times_guard_shared() { - Label generic = a.newLabel(); +void BeamGlobalAssembler::emit_int128_to_big_shared() { + Label positive = a.newLabel(); - /* Speculatively untag and multiply. 
*/ - a.and_(TMP1, ARG2, imm(~_TAG_IMMED1_MASK)); - a.asr(TMP2, ARG3, imm(_TAG_IMMED1_SIZE)); - a.mul(TMP3, TMP1, TMP2); - a.smulh(TMP4, TMP1, TMP2); - - /* Check that both operands are small integers. */ - ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); - a.and_(TMP1, ARG2, ARG3); - a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK)); - a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); - a.b_ne(generic); + a.extr(ARG3, TMP2, ARG1, imm(_TAG_IMMED1_SIZE)); + a.asr(ARG4, TMP2, imm(_TAG_IMMED1_SIZE)); - /* The high 65 bits of result will all be the same if no overflow - * occurred. Another way to say that is that the sign bit of the - * low 64 bits repeated 64 times must be equal to the high 64 bits - * of the product. */ - a.cmp(TMP4, TMP3, arm::asr(63)); - a.b_ne(generic); + a.mov(ARG1, c_p); - a.orr(ARG1, TMP3, imm(_TAG_IMMED1_SMALL)); - a.ret(a64::x30); + a.cmp(ARG4, imm(0)); + a.cset(ARG2, arm::CondCode::kMI); - a.bind(generic); + a.b_pl(positive); + a.negs(ARG3, ARG3); + a.ngc(ARG4, ARG4); + a.bind(positive); emit_enter_runtime_frame(); emit_enter_runtime(); - a.mov(ARG1, c_p); - runtime_call<3>(erts_mixed_times); + runtime_call<4>(beam_jit_int128_to_big); emit_leave_runtime(); emit_leave_runtime_frame(); @@ -422,111 +418,295 @@ void BeamGlobalAssembler::emit_times_guard_shared() { a.ret(a64::x30); } -/* ARG2 = LHS - * ARG3 = RHS +/* ARG2 = Src1 + * ARG3 = Src2 + * ARG4 = Src4 * * The result is returned in ARG1. */ -void BeamGlobalAssembler::emit_times_body_shared() { - Label generic = a.newLabel(), error = a.newLabel(); +void BeamGlobalAssembler::emit_mul_add_body_shared() { + Label mul_only = a.newLabel(), error = a.newLabel(), + mul_error = a.newLabel(), do_error = a.newLabel(); - /* Speculatively untag and multiply. */ - a.and_(TMP1, ARG2, imm(~_TAG_IMMED1_MASK)); - a.asr(TMP2, ARG3, imm(_TAG_IMMED1_SIZE)); - a.mul(TMP3, TMP1, TMP2); - a.smulh(TMP4, TMP1, TMP2); + emit_enter_runtime_frame(); + emit_enter_runtime(); - /* Check that both operands are integers. 
*/ - ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); - a.and_(TMP1, ARG2, ARG3); - a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK)); - a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); - a.b_ne(generic); + /* Save original arguments. */ + a.stp(ARG2, ARG3, TMP_MEM1q); + a.mov(ARG1, c_p); + a.cmp(ARG4, imm(make_small(0))); + a.b_eq(mul_only); + a.str(ARG4, TMP_MEM4q); - /* The high 65 bits of result will all be the same if no overflow - * occurred. Another way to say that is that the sign bit of the - * low 64 bits repeated 64 times must be equal to the high 64 bits - * of the product. */ - a.cmp(TMP4, TMP3, arm::asr(63)); - a.b_ne(generic); + lea(ARG5, TMP_MEM3q); + runtime_call<5>(erts_mul_add); - a.orr(ARG1, TMP3, imm(_TAG_IMMED1_SMALL)); + emit_leave_runtime(); + emit_leave_runtime_frame(); + + emit_branch_if_not_value(ARG1, error); a.ret(a64::x30); - a.bind(generic); + a.bind(mul_only); + { + runtime_call<3>(erts_mixed_times); - /* Save original arguments for the error path. */ - a.stp(ARG2, ARG3, TMP_MEM1q); + emit_leave_runtime(); + emit_leave_runtime_frame(); + + emit_branch_if_not_value(ARG1, mul_error); + a.ret(a64::x30); + } + + a.bind(error); + { + static const ErtsCodeMFA mul_mfa = {am_erlang, am_Times, 2}; + static const ErtsCodeMFA add_mfa = {am_erlang, am_Plus, 2}; + + a.ldp(XREG0, XREG1, TMP_MEM3q); + mov_imm(ARG4, &add_mfa); + emit_branch_if_value(XREG0, do_error); + + a.bind(mul_error); + a.ldp(XREG0, XREG1, TMP_MEM1q); + mov_imm(ARG4, &mul_mfa); + + a.bind(do_error); + a.b(labels[raise_exception]); + } +} + +/* ARG2 = Src1 + * ARG3 = Src2 + * ARG4 = Src4 + * + * The result is returned in ARG1 (set to THE_NON_VALUE if + * the call failed). 
+ */ +void BeamGlobalAssembler::emit_mul_add_guard_shared() { + Label mul_failed = a.newLabel(); + + a.str(ARG4, TMP_MEM1q); emit_enter_runtime_frame(); emit_enter_runtime(); a.mov(ARG1, c_p); runtime_call<3>(erts_mixed_times); + emit_branch_if_not_value(ARG1, mul_failed); + a.ldr(ARG3, TMP_MEM1q); + a.mov(ARG2, ARG1); + a.mov(ARG1, c_p); + runtime_call<3>(erts_mixed_plus); + + a.bind(mul_failed); emit_leave_runtime(); emit_leave_runtime_frame(); - emit_branch_if_not_value(ARG1, error); - a.ret(a64::x30); +} - a.bind(error); - { - static const ErtsCodeMFA bif_mfa = {am_erlang, am_Times, 2}; +/* ARG2 = Src1 + * ARG3 = Src2 + * + * The result is returned in ARG1. + */ +void BeamGlobalAssembler::emit_mul_body_shared() { + mov_imm(ARG4, make_small(0)); + a.b(labels[mul_add_body_shared]); +} - /* Place the original arguments in x-registers. */ - a.ldp(XREG0, XREG1, TMP_MEM1q); - mov_imm(ARG4, &bif_mfa); - a.b(labels[raise_exception]); - } +/* ARG2 = Src1 + * ARG3 = Src2 + * + * The result is returned in ARG1 (set to THE_NON_VALUE if + * the call failed). 
+ */ +void BeamGlobalAssembler::emit_mul_guard_shared() { + mov_imm(ARG4, make_small(0)); + a.b(labels[mul_add_guard_shared]); } -void BeamModuleAssembler::emit_i_times(const ArgLabel &Fail, - const ArgWord &Live, - const ArgSource &LHS, - const ArgSource &RHS, - const ArgRegister &Dst) { - bool is_small_result = is_product_small_if_args_are_small(LHS, RHS); +void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, + const ArgSource &Src1, + const ArgSource &Src2, + const ArgSource &Src3, + const ArgSource &Src4, + const ArgRegister &Dst) { + bool is_product_small = is_product_small_if_args_are_small(Src1, Src2); + bool is_sum_small = is_sum_small_if_args_are_small(Src3, Src4); + bool is_increment_zero = + Src4.isSmall() && Src4.as().getSigned() == 0; + Sint factor = 0; + int left_shift = -1; + + if (is_increment_zero) { + comment("(adding zero)"); + } - if (always_small(LHS) && always_small(RHS) && is_small_result) { + if (Src2.isSmall()) { + factor = Src2.as().getSigned(); + if (Support::isPowerOf2(factor)) { + left_shift = Support::ctz(factor); + } + } + + if (always_small(Src1) && Src2.isSmall() && always_small(Src4) && + is_product_small && is_sum_small) { auto dst = init_destination(Dst, ARG1); - comment("multiplication without overflow check"); - if (RHS.isSmall()) { - auto lhs = load_source(LHS, ARG2); - Sint factor = RHS.as().getSigned(); + auto [src1, src4] = load_sources(Src1, ARG2, Src4, ARG3); + + comment("multiplication and addition without overflow check"); + a.and_(TMP1, src1.reg, imm(~_TAG_IMMED1_MASK)); + if (left_shift > 0) { + comment("optimized multiplication by replacing with left " + "shift"); + a.add(dst.reg, src4.reg, TMP1, arm::lsl(left_shift)); + } else { + mov_imm(TMP2, factor); + a.madd(dst.reg, TMP1, TMP2, src4.reg); + } + flush_var(dst); + } else { + Label small = a.newLabel(); + Label store_result = a.newLabel(); + auto [src1, src2] = load_sources(Src1, ARG2, Src2, ARG3); + auto src4 = load_source(ArgXRegister(0), XREG0); - 
a.and_(TMP1, lhs.reg, imm(~_TAG_IMMED1_MASK)); - if (Support::isPowerOf2(factor)) { - int trailing_bits = Support::ctz(factor); - comment("optimized multiplication by replacing with left " - "shift"); - a.lsl(TMP1, TMP1, imm(trailing_bits)); + if (!is_increment_zero) { + src4 = load_source(Src4, ARG4); + } + + if (always_small(Src1) && always_small(Src2) && always_small(Src4)) { + comment("skipped test for small operands since they are always " + "small"); + } else { + if (always_small(Src4)) { + emit_are_both_small(Src1, src1.reg, Src2, src2.reg, small); + } else if (always_small(Src2)) { + emit_are_both_small(Src1, src1.reg, Src4, src4.reg, small); } else { - mov_imm(TMP2, factor); - a.mul(TMP1, TMP1, TMP2); + ASSERT(!is_increment_zero); + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.and_(TMP1, src1.reg, src2.reg); + a.and_(TMP1, TMP1, src4.reg); + if (always_one_of( + Src1) && + always_one_of( + Src2) && + always_one_of( + Src4)) { + emit_is_boxed(small, TMP1); + } else { + a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK)); + a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); + a.b_eq(small); + } } + + mov_var(ARG2, src1); + mov_var(ARG3, src2); + + if (Fail.get() != 0) { + if (is_increment_zero) { + fragment_call(ga->get_mul_guard_shared()); + } else { + mov_var(ARG4, src4); + fragment_call(ga->get_mul_add_guard_shared()); + } + emit_branch_if_not_value(ARG1, + resolve_beam_label(Fail, dispUnknown)); + } else { + if (is_increment_zero) { + fragment_call(ga->get_mul_body_shared()); + } else { + mov_var(ARG4, src4); + fragment_call(ga->get_mul_add_body_shared()); + } + } + + a.b(store_result); + } + + a.bind(small); + if (is_increment_zero) { + comment("multiply smalls"); } else { - auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3); - a.and_(TMP1, lhs.reg, imm(~_TAG_IMMED1_MASK)); - a.asr(TMP2, rhs.reg, imm(_TAG_IMMED1_SIZE)); - a.mul(TMP1, TMP1, TMP2); + comment("multiply and add smalls"); } - a.orr(dst.reg, TMP1, imm(_TAG_IMMED1_SMALL)); - flush_var(dst); - } else { 
- auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3); - mov_var(ARG2, lhs); - mov_var(ARG3, rhs); - if (Fail.get() != 0) { - fragment_call(ga->get_times_guard_shared()); - emit_branch_if_not_value(ARG1, - resolve_beam_label(Fail, dispUnknown)); + if (is_product_small && is_sum_small) { + arm::Gp increment_reg; + + a.and_(TMP3, src1.reg, imm(~_TAG_IMMED1_MASK)); + + if (is_increment_zero) { + mov_imm(TMP1, make_small(0)); + increment_reg = TMP1; + } else { + increment_reg = src4.reg; + } + + if (left_shift > 0) { + comment("optimized multiplication by replacing with left " + "shift"); + a.add(ARG1, increment_reg, TMP3, arm::lsl(left_shift)); + } else { + a.asr(TMP4, src2.reg, imm(_TAG_IMMED1_SIZE)); + a.madd(ARG1, TMP3, TMP4, increment_reg); + } + + comment("skipped test for small result"); } else { - fragment_call(ga->get_times_body_shared()); + auto min_increment = std::get<0>(getClampedRange(Src4)); + + a.and_(TMP3, src1.reg, imm(~_TAG_IMMED1_MASK)); + if (left_shift == 0) { + comment("optimized multiplication by one"); + a.mov(ARG1, TMP3); + a.asr(TMP2, TMP3, imm(63)); + } else if (left_shift > 0) { + comment("optimized multiplication by replacing with left " + "shift"); + a.lsl(ARG1, TMP3, imm(left_shift)); + a.asr(TMP2, TMP3, imm(64 - left_shift)); + } else { + ASSERT(left_shift == -1); + a.asr(TMP4, src2.reg, imm(_TAG_IMMED1_SIZE)); + a.mul(ARG1, TMP3, TMP4); + a.smulh(TMP2, TMP3, TMP4); + } + + if (is_increment_zero) { + a.add(ARG1, ARG1, imm(_TAG_IMMED1_SMALL)); + } else { + arm::Gp sign_reg; + + if (min_increment > 0) { + sign_reg = ZERO; + } else { + sign_reg = TMP3; + a.asr(sign_reg, src4.reg, imm(63)); + } + + a.adds(ARG1, ARG1, src4.reg); + a.adc(TMP2, TMP2, sign_reg); + } + + comment("test whether the result fits in a small"); + /* The high 65 bits of result will all be the same if no + * overflow occurred. Another way to say that is that the + * sign bit of the low 64 bits repeated 64 times must be + * equal to the high 64 bits of the result. 
*/ + a.asr(TMP3, ARG1, imm(SMALL_BITS + _TAG_IMMED1_SIZE - 1)); + a.cmp(TMP2, TMP3); + a.b_eq(store_result); + + fragment_call(ga->get_int128_to_big_shared()); } + a.bind(store_result); mov_arg(Dst, ARG1); } } @@ -673,6 +853,97 @@ void BeamGlobalAssembler::emit_int_div_rem_body_shared() { } } +void BeamModuleAssembler::emit_div_rem_literal(Sint divisor, + const ArgSource &Dividend, + arm::Gp dividend, + arm::Gp quotient, + arm::Gp remainder, + const Label &generic, + bool need_div, + bool need_rem) { + arm::Gp small_tag = TMP6; + bool small_dividend = !generic.isValid(); + + ASSERT(divisor != (Sint)0); + + if (!small_dividend) { + a.and_(small_tag, dividend, imm(_TAG_IMMED1_MASK)); + a.cmp(small_tag, imm(_TAG_IMMED1_SMALL)); + a.b_ne(generic); + } + + if (Support::isPowerOf2(divisor)) { + arm::Gp original_dividend = dividend; + int shift = Support::ctz(divisor); + + if (need_div && small_dividend) { + mov_imm(small_tag, _TAG_IMMED1_SMALL); + } + + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + if (std::get<0>(getClampedRange(Dividend)) >= 0) { + /* Positive dividend. */ + if (need_div) { + comment("optimized div by replacing with right shift"); + if (need_rem && quotient == dividend) { + original_dividend = TMP5; + a.mov(original_dividend, dividend); + } + a.orr(quotient, small_tag, dividend, arm::lsr(shift)); + } + if (need_rem) { + auto mask = Support::lsbMask(shift + _TAG_IMMED1_SIZE); + comment("optimized rem by replacing with masking"); + a.and_(remainder, original_dividend, imm(mask)); + } + } else { + /* Negative dividend. 
*/ + if (need_div) { + comment("optimized div by replacing with right shift"); + } + if (divisor == 2) { + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.add(TMP3, dividend, dividend, arm::lsr(63)); + } else { + add(TMP1, dividend, (divisor - 1) << _TAG_IMMED1_SIZE); + a.cmp(dividend, imm(0)); + a.csel(TMP3, TMP1, dividend, imm(arm::CondCode::kLT)); + } + if (need_div) { + if (need_rem && quotient == dividend) { + original_dividend = TMP5; + a.mov(original_dividend, dividend); + } + a.orr(quotient, small_tag, TMP3, arm::asr(shift)); + } + if (need_rem) { + Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE); + comment("optimized rem by replacing with subtraction"); + a.and_(TMP1, TMP3, imm(mask)); + a.sub(remainder, original_dividend, TMP1); + } + } + } else { + a.asr(TMP1, dividend, imm(_TAG_IMMED1_SIZE)); + mov_imm(TMP2, divisor); + a.sdiv(quotient, TMP1, TMP2); + if (need_rem) { + a.msub(remainder, quotient, TMP2, TMP1); + } + + if (small_dividend) { + mov_imm(small_tag, _TAG_IMMED1_SMALL); + } + const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE); + if (need_div) { + a.orr(quotient, small_tag, quotient, tagShift); + } + if (need_rem) { + a.orr(remainder, small_tag, remainder, tagShift); + } + } +} + void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, const ArgSource &LHS, const ArgSource &RHS, @@ -685,52 +956,26 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, if (RHS.isSmall()) { divisor = RHS.as().getSigned(); + if (divisor == -1) { + divisor = 0; + } } - if (always_small(LHS) && divisor != (Sint)0 && divisor != (Sint)-1) { + if (always_small(LHS) && divisor != 0) { auto lhs = load_source(LHS, ARG3); auto quotient = init_destination(Quotient, ARG1); auto remainder = init_destination(Remainder, ARG2); + Label invalidLabel; /* Intentionally not initialized */ comment("skipped test for smalls operands and overflow"); - if (Support::isPowerOf2(divisor) && - std::get<0>(getClampedRange(LHS)) >= 0) { - int trailing_bits = 
Support::ctz(divisor); - arm::Gp LHS_reg = lhs.reg; - if (need_div) { - comment("optimized div by replacing with right shift"); - ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); - if (need_rem && quotient.reg == lhs.reg) { - LHS_reg = TMP1; - a.mov(LHS_reg, lhs.reg); - } - a.lsr(quotient.reg, lhs.reg, imm(trailing_bits)); - a.orr(quotient.reg, quotient.reg, imm(_TAG_IMMED1_SMALL)); - } - if (need_rem) { - comment("optimized rem by replacing with masking"); - auto mask = Support::lsbMask(trailing_bits + - _TAG_IMMED1_SIZE); - a.and_(remainder.reg, LHS_reg, imm(mask)); - } - } else { - a.asr(TMP1, lhs.reg, imm(_TAG_IMMED1_SIZE)); - mov_imm(TMP2, divisor); - a.sdiv(quotient.reg, TMP1, TMP2); - if (need_rem) { - a.msub(remainder.reg, quotient.reg, TMP2, TMP1); - } - - mov_imm(TMP3, _TAG_IMMED1_SMALL); - const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE); - if (need_div) { - a.orr(quotient.reg, TMP3, quotient.reg, tagShift); - } - if (need_rem) { - a.orr(remainder.reg, TMP3, remainder.reg, tagShift); - } - } - + emit_div_rem_literal(divisor, + LHS, + lhs.reg, + quotient.reg, + remainder.reg, + invalidLabel, + need_div, + need_rem); if (need_div) { flush_var(quotient); } @@ -738,11 +983,24 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, flush_var(remainder); } } else { + Label generic = a.newLabel(), done = a.newLabel(); auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3); + if (divisor != (Sint)0) { + emit_div_rem_literal(divisor, + LHS, + lhs.reg, + ARG1, + ARG2, + generic, + need_div, + need_rem); + a.b(done); + } + + a.bind(generic); mov_var(ARG2, lhs); mov_var(ARG3, rhs); - if (Fail.get() != 0) { fragment_call(ga->get_int_div_rem_guard_shared()); a.b_eq(resolve_beam_label(Fail, disp1MB)); @@ -751,6 +1009,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, fragment_call(ga->get_int_div_rem_body_shared()); } + a.bind(done); if (need_div) { mov_arg(Quotient, ARG1); } @@ -1226,34 +1485,62 @@ void 
BeamModuleAssembler::emit_i_bsr(const ArgLabel &Fail, if (RHS.isSmall()) { Sint shift = RHS.as().getSigned(); - if (shift >= 0 && shift < SMALL_BITS - 1) { + if (shift >= 0) { + arm::Gp small_tag = TMP1; if (always_small(LHS)) { comment("skipped test for small left operand because it is " "always small"); need_generic = false; + mov_imm(small_tag, _TAG_IMMED1_SMALL); } else if (always_one_of(LHS)) { comment("simplified test for small operand since it is a " "number"); emit_is_not_boxed(generic, lhs.reg); + mov_imm(small_tag, _TAG_IMMED1_SMALL); } else { - a.and_(TMP1, lhs.reg, imm(_TAG_IMMED1_MASK)); - a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); + a.and_(small_tag, lhs.reg, imm(_TAG_IMMED1_MASK)); + a.cmp(small_tag, imm(_TAG_IMMED1_SMALL)); a.b_ne(generic); } /* We don't need to clear the mask after shifting because * _TAG_IMMED1_SMALL will set all the bits anyway. */ ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL); - a.asr(TMP1, lhs.reg, imm(shift)); - a.orr(dst.reg, TMP1, imm(_TAG_IMMED1_SMALL)); + shift = std::min(shift, 63); + a.orr(dst.reg, small_tag, lhs.reg, arm::asr(shift)); if (need_generic) { a.b(next); } } else { - /* Constant shift is negative or too big to fit the `asr` - * instruction; fall back to the generic path. */ + /* Constant shift is negative; fall back to the generic + * path. */ } + } else { + auto rhs = load_source(RHS, ARG3); + + /* Ensure that both operands are small and that the shift + * count is positive. */ + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.ands(TMP1, rhs.reg, imm((1ull << 63) | _TAG_IMMED1_MASK)); + a.and_(TMP1, lhs.reg, TMP1); + a.ccmp(TMP1, + imm(_TAG_IMMED1_SMALL), + imm(NZCV::kNone), + arm::CondCode::kPL); + a.b_ne(generic); + + /* Calculate shift count. */ + a.asr(TMP1, rhs.reg, imm(_TAG_IMMED1_SIZE)); + mov_imm(TMP2, 63); + a.cmp(TMP1, TMP2); + a.csel(TMP1, TMP1, TMP2, imm(arm::CondCode::kLE)); + + /* Shift right. 
*/ + ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL); + a.asr(dst.reg, lhs.reg, TMP1); + a.orr(dst.reg, dst.reg, imm(_TAG_IMMED1_SMALL)); + a.b(next); } a.bind(generic); diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab index ed8c51ae3ac5..b0c79d3e2c25 100644 --- a/erts/emulator/beam/jit/arm/ops.tab +++ b/erts/emulator/beam/jit/arm/ops.tab @@ -1256,6 +1256,23 @@ i_get_map_element f S S S # Arithmetic instructions. # +gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 | + gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 | + equal(Dst1, S3) | + equal(Dst1, Dst2) | + equal(Fail1, Fail2) => + i_mul_add Fail1 S1 S2 S3 S4 Dst1 + +gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 | + gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 | + equal(Dst1, S4) | + equal(Dst1, Dst2) | + equal(Fail1, Fail2) => + i_mul_add Fail1 S1 S2 S4 S3 Dst1 + +gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst => + i_mul_add Fail S1 S2 Dst i Dst + gc_bif2 Fail Live u$bif:erlang:splus/2 Src1 Src2 Dst => i_plus Fail Live Src1 Src2 Dst @@ -1265,9 +1282,6 @@ gc_bif1 Fail Live u$bif:erlang:sminus/1 Src Dst => gc_bif2 Fail Live u$bif:erlang:sminus/2 Src1 Src2 Dst => i_minus Fail Live Src1 Src2 Dst -gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst => - i_times Fail Live S1 S2 Dst - gc_bif2 Fail Live u$bif:erlang:div/2 S1 S2 Dst => i_m_div Fail Live S1 S2 Dst @@ -1332,10 +1346,11 @@ gc_bif2 Fail Live u$bif:erlang:bsr/2 S1 S2 Dst => gc_bif2 Fail Live u$bif:erlang:bsl/2 S1 S2 Dst => i_bsl Fail Live S1 S2 Dst +i_mul_add j s s s s d + i_plus j I s s d i_unary_minus j I s d i_minus j I s s d -i_times j I s s d i_m_div j I s s d diff --git a/erts/emulator/beam/jit/beam_jit_common.cpp b/erts/emulator/beam/jit/beam_jit_common.cpp index 1465c3842f78..8e78e2cf1656 100644 --- a/erts/emulator/beam/jit/beam_jit_common.cpp +++ b/erts/emulator/beam/jit/beam_jit_common.cpp @@ -1087,6 +1087,24 @@ Sint beam_jit_bs_bit_size(Eterm term) { return (Sint)-1; } +Eterm 
beam_jit_int128_to_big(Process *p, Uint sign, Uint low, Uint high) { + Eterm *hp; + Uint arity; + + arity = high ? 2 : 1; + hp = HeapFragOnlyAlloc(p, BIG_NEED_SIZE(arity)); + if (sign) { + hp[0] = make_neg_bignum_header(arity); + } else { + hp[0] = make_pos_bignum_header(arity); + } + BIG_DIGIT(hp, 0) = low; + if (arity == 2) { + BIG_DIGIT(hp, 1) = high; + } + return make_big(hp); +} + ErtsMessage *beam_jit_decode_dist(Process *c_p, ErtsMessage *msgp) { if (!erts_proc_sig_decode_dist(c_p, ERTS_PROC_LOCK_MAIN, msgp, 0)) { /* diff --git a/erts/emulator/beam/jit/beam_jit_common.hpp b/erts/emulator/beam/jit/beam_jit_common.hpp index b6f7239faecb..ddd9c245dc4e 100644 --- a/erts/emulator/beam/jit/beam_jit_common.hpp +++ b/erts/emulator/beam/jit/beam_jit_common.hpp @@ -628,6 +628,8 @@ void beam_jit_bs_construct_fail_info(Process *c_p, Eterm arg1); Sint beam_jit_bs_bit_size(Eterm term); +Eterm beam_jit_int128_to_big(Process *p, Uint sign, Uint low, Uint high); + void beam_jit_take_receive_lock(Process *c_p); void beam_jit_wait_locked(Process *c_p, ErtsCodePtr cp); void beam_jit_wait_unlocked(Process *c_p, ErtsCodePtr cp); diff --git a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl index 3c620462b35a..9782bbb2269e 100755 --- a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl +++ b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl @@ -90,6 +90,10 @@ is_ge_lt_shared minus_body_shared minus_guard_shared + mul_add_body_shared + mul_add_guard_shared + mul_body_shared + mul_guard_shared new_map_shared plus_body_shared plus_guard_shared @@ -98,8 +102,6 @@ raise_exception raise_exception_shared store_unaligned - times_body_shared - times_guard_shared unary_minus_body_shared unary_minus_guard_shared unloaded_fun diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp index 888f3109f165..35976cc048d7 100644 --- a/erts/emulator/beam/jit/x86/instr_arith.cpp +++ 
b/erts/emulator/beam/jit/x86/instr_arith.cpp @@ -652,10 +652,10 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, divisor = RHS.as().getSigned(); } - if (divisor != (Sint)0 && divisor != (Sint)-1) { + mov_arg(x86::rax, LHS); + + if (divisor != 0 && divisor != -1) { /* There is no possibility of overflow. */ - a.mov(ARG6, imm(divisor)); - mov_arg(x86::rax, LHS); if (always_small(LHS)) { comment("skipped test for small dividend since it is always small"); need_generic = false; @@ -672,10 +672,9 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, a.short_().jne(generic_div); } - /* Sign-extend and divide. The result is implicitly placed in - * RAX and the remainder in RDX (ARG3). */ if (Support::isPowerOf2(divisor) && std::get<0>(getClampedRange(LHS)) >= 0) { + /* Unsigned integer division. */ int trailing_bits = Support::ctz(divisor); if (need_rem) { @@ -692,8 +691,52 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, a.shr(x86::rax, imm(trailing_bits)); a.or_(x86::rax, imm(_TAG_IMMED1_SMALL)); } + } else if (Support::isPowerOf2(divisor)) { + /* Signed integer division. 
*/ + int shift = Support::ctz(divisor); + Sint offset = (divisor - 1) << _TAG_IMMED1_SIZE; + + if (need_rem) { + a.mov(x86::rdx, x86::rax); + ASSERT(x86::rdx != ARG1); + } + + if (need_div) { + comment("optimized div by replacing with right shift"); + } + + if (divisor == 2) { + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.mov(ARG1, x86::rax); + a.shr(ARG1, imm(63)); + a.add(x86::rax, ARG1); + } else { + if (Support::isInt32(offset)) { + a.lea(ARG1, x86::qword_ptr(x86::rax, offset)); + } else { + a.mov(ARG1, offset); + a.add(ARG1, x86::rax); + } + a.test(x86::rax, x86::rax); + a.cmovs(x86::rax, ARG1); + } + + if (need_rem) { + Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE); + comment("optimized rem by replacing with subtraction"); + mov_imm(ARG1, mask); + a.and_(ARG1, x86::rax); + a.sub(x86::rdx, ARG1); + } + + if (need_div) { + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.sar(x86::rax, imm(shift)); + a.or_(x86::rax, imm(_TAG_IMMED1_SMALL)); + } } else { comment("divide with inlined code"); + a.mov(ARG6, imm(divisor)); a.sar(x86::rax, imm(_TAG_IMMED1_SIZE)); a.cqo(); a.idiv(ARG6); @@ -723,7 +766,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail, a.bind(generic_div); if (need_generic) { mov_arg(ARG4, RHS); /* Done first as mov_arg may clobber ARG1 */ - mov_arg(ARG1, LHS); + a.mov(ARG1, x86::rax); if (Fail.get() != 0) { safe_fragment_call(ga->get_int_div_rem_guard_shared()); @@ -823,16 +866,32 @@ void BeamModuleAssembler::emit_i_m_div(const ArgLabel &Fail, mov_arg(Dst, RET); } -/* ARG2 = LHS, ARG3 (!) = RHS +/* ARG2 = Src1 + * ARG3 = Src2 + * ARG4 = Increment * * Result is returned in RET, error is indicated by ZF. 
*/ -void BeamGlobalAssembler::emit_times_guard_shared() { +void BeamGlobalAssembler::emit_mul_add_guard_shared() { + Label done = a.newLabel(); + emit_enter_frame(); emit_enter_runtime(); + a.mov(TMP_MEM1q, ARG4); + a.mov(ARG1, c_p); runtime_call<3>(erts_mixed_times); + emit_test_the_non_value(RET); + a.short_().je(done); + + a.mov(ARG3, TMP_MEM1q); + a.mov(ARG2, RET); + a.mov(ARG1, c_p); + a.cmp(ARG3, imm(make_small(0))); + a.short_().je(done); + runtime_call<3>(erts_mixed_plus); + a.bind(done); emit_leave_runtime(); emit_leave_frame(); @@ -841,13 +900,14 @@ void BeamGlobalAssembler::emit_times_guard_shared() { a.ret(); } -/* ARG2 = LHS, ARG3 (!) = RHS +/* ARG2 = Src1 + * ARG3 = Src2 + * ARG4 = Increment * * Result is returned in RET. */ -void BeamGlobalAssembler::emit_times_body_shared() { - static const ErtsCodeMFA bif_mfa = {am_erlang, am_Times, 2}; - - Label error = a.newLabel(); +void BeamGlobalAssembler::emit_mul_add_body_shared() { + Label mul_only = a.newLabel(), error = a.newLabel(), + mul_error = a.newLabel(), do_error = a.newLabel(); emit_enter_frame(); emit_enter_runtime(); @@ -855,61 +915,166 @@ void BeamGlobalAssembler::emit_times_body_shared() { /* Save original arguments for the error path. */ a.mov(TMP_MEM1q, ARG2); a.mov(TMP_MEM2q, ARG3); - a.mov(ARG1, c_p); - runtime_call<3>(erts_mixed_times); + a.cmp(ARG4, imm(make_small(0))); + a.short_().je(mul_only); + a.mov(TMP_MEM4q, ARG4); + + a.lea(ARG5, TMP_MEM3q); + runtime_call<5>(erts_mul_add); emit_leave_runtime(); emit_leave_frame(); emit_test_the_non_value(RET); a.short_().je(error); + a.ret(); + a.bind(mul_only); + { + runtime_call<3>(erts_mixed_times); + + emit_leave_runtime(); + emit_leave_frame(); + + emit_test_the_non_value(RET); + a.short_().je(mul_error); + + a.ret(); + } + a.bind(error); { - /* Place the original arguments in x-registers. 
*/ + static const ErtsCodeMFA mul_mfa = {am_erlang, am_Times, 2}; + static const ErtsCodeMFA add_mfa = {am_erlang, am_Plus, 2}; + + a.mov(ARG1, TMP_MEM3q); + a.mov(ARG2, TMP_MEM4q); + mov_imm(ARG4, &add_mfa); + emit_test_the_non_value(ARG1); + a.short_().jne(do_error); + + a.bind(mul_error); a.mov(ARG1, TMP_MEM1q); a.mov(ARG2, TMP_MEM2q); + mov_imm(ARG4, &mul_mfa); + + a.bind(do_error); a.mov(getXRef(0), ARG1); a.mov(getXRef(1), ARG2); - - a.mov(ARG4, imm(&bif_mfa)); a.jmp(labels[raise_exception]); } } -void BeamModuleAssembler::emit_i_times(const ArgLabel &Fail, - const ArgSource &LHS, - const ArgSource &RHS, - const ArgRegister &Dst) { - bool small_result = is_product_small_if_args_are_small(LHS, RHS); +/* ARG2 = Src1 + * ARG3 = Src2 + * + * The result is returned in RET. + */ +void BeamGlobalAssembler::emit_mul_body_shared() { + mov_imm(ARG4, make_small(0)); + a.jmp(labels[mul_add_body_shared]); +} - if (always_small(LHS) && always_small(RHS) && small_result) { - comment("multiplication without overflow check"); - if (RHS.isSmall()) { - Sint factor = RHS.as().getSigned(); +/* ARG2 = Src1 + * ARG3 = Src2 + * + * Result is returned in RET, error is indicated by ZF. 
+ */ +void BeamGlobalAssembler::emit_mul_guard_shared() { + mov_imm(ARG4, make_small(0)); + a.jmp(labels[mul_add_guard_shared]); +} + +void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail, + const ArgSource &Src1, + const ArgSource &Src2, + const ArgSource &Src3, + const ArgSource &Src4, + const ArgRegister &Dst) { + bool is_product_small = is_product_small_if_args_are_small(Src1, Src2); + bool is_sum_small = is_sum_small_if_args_are_small(Src3, Src4); + bool is_increment_zero = + Src4.isSmall() && Src4.as().getSigned() == 0; + Sint factor = 0; + int left_shift = -1; + + if (is_increment_zero) { + comment("(adding zero)"); + } + + if (Src2.isSmall()) { + factor = Src2.as().getSigned(); + if (Support::isPowerOf2(factor)) { + left_shift = Support::ctz(factor); + } + } + + if (always_small(Src1) && Src2.isSmall() && Src4.isSmall() && + is_product_small && is_sum_small) { + x86::Mem p; + Sint increment = Src4.as().get(); + increment -= factor * _TAG_IMMED1_SMALL; + + switch (factor) { + case 2: + p = ptr(RET, RET, 0, increment); + break; + case 3: + p = ptr(RET, RET, 1, increment); + break; + case 4: + p = ptr(x86::Gp(), RET, 2, increment); + break; + case 5: + p = ptr(RET, RET, 2, increment); + break; + case 8: + p = ptr(x86::Gp(), RET, 3, increment); + break; + case 9: + p = ptr(RET, RET, 3, increment); + break; + } + + if (Support::isInt32(increment) && p.hasIndex()) { + comment("optimizing multiplication and addition using LEA"); + mov_arg(RET, Src1); + a.lea(RET, p); + mov_arg(Dst, RET); + return; + } + } - mov_arg(RET, LHS); + if (always_small(Src1) && Src2.isSmall() && always_small(Src4) && + is_product_small && is_sum_small) { + comment("multiplication and addition without overflow check"); + if (Src2.isSmall()) { + mov_arg(RET, Src1); a.and_(RET, imm(~_TAG_IMMED1_MASK)); if (Support::isPowerOf2(factor)) { - int trailing_bits = Support::ctz(factor); comment("optimized multiplication by replacing with left " "shift"); - a.shl(RET, imm(trailing_bits)); + 
a.shl(RET, imm(left_shift)); } else { mov_imm(ARG2, factor); a.imul(RET, ARG2); } } else { - mov_arg(RET, LHS); - mov_arg(ARG2, RHS); + mov_arg(RET, Src1); + mov_arg(ARG2, Src2); a.and_(RET, imm(~_TAG_IMMED1_MASK)); a.sar(ARG2, imm(_TAG_IMMED1_SIZE)); a.imul(RET, ARG2); } - a.or_(RET, imm(_TAG_IMMED1_SMALL)); + if (is_increment_zero) { + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + } else { + mov_arg(ARG2, Src4); + a.add(RET, ARG2); + } mov_arg(Dst, RET); return; @@ -917,39 +1082,81 @@ void BeamModuleAssembler::emit_i_times(const ArgLabel &Fail, Label next = a.newLabel(), mixed = a.newLabel(); - mov_arg(ARG2, LHS); /* Used by erts_mixed_times in this slot */ - mov_arg(ARG3, RHS); /* Used by erts_mixed_times in this slot */ + mov_arg(ARG2, Src1); + mov_arg(ARG3, Src2); + if (!is_increment_zero) { + mov_arg(ARG4, Src4); + } - if (RHS.isSmall()) { - Sint val = RHS.as().getSigned(); - emit_is_small(mixed, LHS, ARG2); + if (Src2.isSmall()) { + Sint val = Src2.as().getSigned(); + emit_are_both_small(mixed, Src1, ARG2, Src4, ARG4); a.mov(RET, ARG2); - a.mov(ARG4, imm(val)); + mov_imm(ARG5, val); } else { - emit_are_both_small(mixed, LHS, ARG2, RHS, ARG3); + if (is_increment_zero) { + emit_are_both_small(mixed, Src1, ARG2, Src2, ARG3); + } else if (always_small(Src1)) { + emit_are_both_small(mixed, Src2, ARG3, Src4, ARG4); + } else { + a.mov(RETd, ARG2.r32()); + a.and_(RETd, ARG3.r32()); + a.and_(RETd, ARG4.r32()); + if (always_one_of( + Src1) && + always_one_of( + Src2) && + always_one_of( + Src4)) { + emit_is_not_boxed(mixed, RET); + } else { + a.and_(RETb, imm(_TAG_IMMED1_MASK)); + a.cmp(RETb, imm(_TAG_IMMED1_SMALL)); + a.short_().jne(mixed); + } + } a.mov(RET, ARG2); - a.mov(ARG4, ARG3); - a.sar(ARG4, imm(_TAG_IMMED1_SIZE)); + a.mov(ARG5, ARG3); + a.sar(ARG5, imm(_TAG_IMMED1_SIZE)); } a.and_(RET, imm(~_TAG_IMMED1_MASK)); - a.imul(RET, ARG4); - if (small_result) { - comment("skipped overflow check because the result is always small"); + a.imul(RET, ARG5); + if 
(is_product_small) { + comment("skipped overflow check because product is always small"); } else { a.short_().jo(mixed); } - a.or_(RET, imm(_TAG_IMMED1_SMALL)); + + if (is_increment_zero) { + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + } else { + a.add(RET, ARG4); + if (is_sum_small) { + comment("skipped overflow check because sum is always small"); + } else { + a.short_().jo(mixed); + } + } + a.short_().jmp(next); /* Call mixed multiplication. */ a.bind(mixed); { if (Fail.get() != 0) { - safe_fragment_call(ga->get_times_guard_shared()); + if (is_increment_zero) { + safe_fragment_call(ga->get_mul_guard_shared()); + } else { + safe_fragment_call(ga->get_mul_add_guard_shared()); + } a.je(resolve_beam_label(Fail)); } else { - safe_fragment_call(ga->get_times_body_shared()); + if (is_increment_zero) { + safe_fragment_call(ga->get_mul_body_shared()); + } else { + safe_fragment_call(ga->get_mul_add_body_shared()); + } } } @@ -1305,13 +1512,14 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS, const ArgRegister &Dst) { Label generic = a.newLabel(), next = a.newLabel(); bool need_generic = true; + bool need_register_load = true; mov_arg(ARG2, LHS); if (RHS.isSmall()) { Sint shift = RHS.as().getSigned(); - if (shift >= 0 && shift < SMALL_BITS - 1) { + if (shift >= 0) { if (always_small(LHS)) { comment("skipped test for small left operand because it is " "always small"); @@ -1325,6 +1533,7 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS, /* We don't need to clear the mask after shifting because * _TAG_IMMED1_SMALL will set all the bits anyway. */ ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL); + shift = std::min(shift, 63); a.sar(RET, imm(shift)); a.or_(RET, imm(_TAG_IMMED1_SMALL)); @@ -1332,14 +1541,33 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS, a.short_().jmp(next); } } else { - /* Constant shift is negative or too big to fit the `sar` - * instruction, fall back to the generic path. 
*/ + /* Constant shift is negative; fall back to the generic + * path. */ } + } else if (hasCpuFeature(CpuFeatures::X86::kBMI2)) { + mov_arg(RET, RHS); + need_register_load = false; + + emit_are_both_small(generic, LHS, ARG2, RHS, RET); + + a.mov(ARG1, RET); + a.sar(ARG1, imm(_TAG_IMMED1_SIZE)); + a.js(generic); + + mov_imm(RET, 63); + a.cmp(ARG1, RET); + a.cmova(ARG1, RET); + + a.sarx(RET, ARG2, ARG1); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + a.short_().jmp(next); } a.bind(generic); if (need_generic) { - mov_arg(RET, RHS); + if (need_register_load) { + mov_arg(RET, RHS); + } if (Fail.get() != 0) { safe_fragment_call(ga->get_i_bsr_guard_shared()); diff --git a/erts/emulator/beam/jit/x86/ops.tab b/erts/emulator/beam/jit/x86/ops.tab index e96590b5344c..bbc231311801 100644 --- a/erts/emulator/beam/jit/x86/ops.tab +++ b/erts/emulator/beam/jit/x86/ops.tab @@ -1229,13 +1229,27 @@ gc_bif2 Fail Live u$bif:erlang:sminus/2 S1 S2 Dst => # Arithmetic instructions. # +gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 | + gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 | + equal(Dst1, S3) | + equal(Dst1, Dst2) | + equal(Fail1, Fail2) => + i_mul_add Fail1 S1 S2 S3 S4 Dst1 + +gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 | + gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 | + equal(Dst1, S4) | + equal(Dst1, Dst2) | + equal(Fail1, Fail2) => + i_mul_add Fail1 S1 S2 S4 S3 Dst1 + +gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst => + i_mul_add Fail S1 S2 Dst i Dst + gen_plus Fail Live S1 S2 Dst => i_plus S1 S2 Fail Dst gen_minus Fail Live S1 S2 Dst => i_minus S1 S2 Fail Dst -gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst => - i_times Fail S1 S2 Dst - gc_bif2 Fail Live u$bif:erlang:div/2 S1 S2 Dst => i_m_div Fail S1 S2 Dst @@ -1304,7 +1318,7 @@ i_minus s s j d i_unary_minus s j d -i_times j s s d +i_mul_add j s s s s d i_m_div j s s d diff --git a/erts/emulator/test/big_SUITE.erl b/erts/emulator/test/big_SUITE.erl index 2839a970616e..635abc880087 100644 --- 
a/erts/emulator/test/big_SUITE.erl +++ b/erts/emulator/test/big_SUITE.erl @@ -177,6 +177,7 @@ eval({op,_,Op,A0}, LFH) -> eval({op,_,Op,A0,B0}, LFH) -> [A,B] = eval_list([A0,B0], LFH), Res = eval_op(Op, A, B), + ok = eval_op_guard(Op, A, B, Res), erlang:garbage_collect(), Res; eval({integer,_,I}, _) -> @@ -207,6 +208,18 @@ eval_op('bxor', A, B) -> A bxor B; eval_op('bsl', A, B) -> A bsl B; eval_op('bsr', A, B) -> A bsr B. +eval_op_guard('-', A, B, Res) when Res =:= A - B -> ok; +eval_op_guard('+', A, B, Res) when Res =:= A + B -> ok; +eval_op_guard('*', A, B, Res) when Res =:= A * B -> ok; +eval_op_guard('div', A, B, Res) when Res =:= A div B -> ok; +eval_op_guard('rem', A, B, Res) when Res =:= A rem B -> ok; +eval_op_guard('band', A, B, Res) when Res =:= A band B -> ok; +eval_op_guard('bor', A, B, Res) when Res =:= A bor B -> ok; +eval_op_guard('bxor', A, B, Res) when Res =:= A bxor B -> ok; +eval_op_guard('bsl', A, B, Res) when Res =:= A bsl B -> ok; +eval_op_guard('bsr', A, B, Res) when Res =:= A bsr B -> ok; +eval_op_guard(Op, A, B, Res) -> {error,{Op,A,B,Res}}. + test_squaring(I) -> %% Multiplying an integer by itself is specially optimized, so we %% should take special care to test squaring. The optimization @@ -520,12 +533,13 @@ properties(_Config) -> _ = [begin A = id(rand_int()), B = id(rand_int()), - io:format("~.36#\n~.36#\n", [A,B]), - test_properties(A, B) + C = id(rand_int()), + io:format("~.36#\n~.36#\n~.36#\n", [A,B,C]), + test_properties(A, B, C) end || _ <- lists:seq(1, 1000)], ok. -test_properties(A, B) -> +test_properties(A, B, C) -> SquaredA = id(A * A), SquaredB = id(B * B), @@ -543,6 +557,11 @@ test_properties(A, B) -> A = id(Sum - B), B = id(Sum - A), 0 = Sum - A - B, + C = id(A + B + C) - Sum, + + PS = id(A * B + C), + PS = P + C, + ok = test_mul_add_guard(A, B, C, PS), NegA = id(-A), A = -NegA, @@ -563,6 +582,7 @@ test_properties(A, B) -> ok. +test_mul_add_guard(A, B, C, Res) when Res =:= A * B + C -> ok. 
rand_int() -> Sz = max(floor(rand:normal() * 512 + 256), 7), diff --git a/erts/emulator/test/big_SUITE_data/karatsuba.dat b/erts/emulator/test/big_SUITE_data/karatsuba.dat index c0a0a7264775..d3eeb1edda63 100644 --- a/erts/emulator/test/big_SUITE_data/karatsuba.dat +++ b/erts/emulator/test/big_SUITE_data/karatsuba.dat @@ -2,3 +2,5 @@ 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041. 
2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260. 11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111106344934976756040626123514239234164764902537272376564603941349685474981086156132849482095178014314025662562690917695930568045878655338855362510711600004691583025781672658461972272454828150778967090646994300840310874097188401629053601125962482452943210361571508070141480943762931833179542500260529237362162707639170603734454467495293065550837316788842616363992711760780053105521674578963323881257195120982444937309873872047432554602983891153664585401665873828781053824853516794304840151808827588436099452244708128397625565205678426608592771591300300233807355033993072958601978555008581292008497195278449404719555054273527349353200223841373814996901436942681002612217911510867387112527016192677771739643
2145405190153796887093089464334888549712739386389592196847353891969079332035195409407770223110583450176399233173852721971468580226906388907065682291829282704839192753805483705821353760692262887904886094845663298931749956912234282496355841821173993089465864518791095006027639953496042767023689710197973028632817727184498866638467324567019639370889678900544190692386873018896672246736368046857567131910710307733567633067262431422838734848 = 227244752509208666244300049023218501664936565624639514576438496566054518678090377206901554539288856996266272083253480866328891556584179795376043393179397439111001075307836347541226766323267029349186809052518632576630669501495499405707466858562586370759397778341785328471459982484138277798132149400741413820728000336730772425983653136275550466388207243285757977918 * 488949073121539043178895838851406898867685260673611653386208527301320256750945098442460084709113182789107256214464892052884956575049058585335894004824225556978848052700870382486432811126512032810019093103514678580988403100655092773497194921652262878660895819095201202957727846410635903499346091564032100080922029110666198856874372068909543752725127446446275130141704384403153173213642973260347547120927518593494548154693042687813126757638259919480596660015785042800818110628659177941014457037586017066097793890372685757581701160719984224943404424801688527463891923678834379546779645545606419569427574824563111913480237499286872496822079247464512671595600027881545516790726794520875456846025695904828856824122680691695622222780388482248652558345646407544371724549307907546148267185543556530668197490478774267191002528938567372851525085232721844598755176527243119768654561142351728852269053413174476938571816914419435443895662361267309023154705765158795604804766486473731465844362909575530822196003981963565054860681271890620405191718478159479477685388913902388171544282368568608479247952664643776459165494556000797676507398085908822491136. 
+16#aeb17ba36a5a62ac6f0aad2b264d0787363825b9f0edf1ddd6e3d06eb970b70c90d5a43da0e234d85a2bd692ac118318965a1fa855019b8c65f32487755dc5677e27863aa4e4a6a82a76884c4d5d78f5b7807151b0179ee3b387b2118211610d832d1e7367a0e3cd50cce3ce2810e3567fc3fddf180c5ccd0572dc0f8662ef54e864e6182c3f951deff6d4a6cead4322e9bf3d55276f9dbdab649fa18fbdeaa89c002e037bb9090b1a5907ab6d18de09f8f376efdc0341ae360aa732405bf83cfe8342d644443208cfb8ef0568cd597de1ce7389878e48863bf0ebf1538ce2c317d8ac9f81976ae51617d7f6939582a8c28375caab30052d8ddf1b2995fb3891ea4541ef3d92bff37b6726052e8d7530b1f64a3cdfbba9cc320b55b2504417ff21986ceaaab8d4f73fafca6076e04fda786562571c5482b1f06b9b2762f51f3c1734284916153b377f147feb9ab398cf9ee46ba272c0ec8685f5a3832ff4e32aca370591f68bf38523839bd7367ebe02170150e87c69c3ff0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 = 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d3734c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 * 16#aeb17ba36a5a62ace6406c497e8f672a2868e5136afa7da54eeb841bd62cc3c37a19b21ca3421a2bcb9adc043a78dc70d23fcb0bf761ee620df5f22122beadfc580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c8cb370962d3b988382e6491831cb48fa4ddf32deb2fdbca9e64763a862beef086fb51183fb9e4af5b71ca36ee3159551bcbfdf55a685e238faef19254d9350dcd811af2dbf8859e4ce82bdb8632091e0100000000000000000000000000000000000000000000000000000000000000000000000000000000. 
+16#34c8f69d2c4677c6d19b6e7ce34b705bd0be4db83a7e980e81ca31c352a076a32d17ccd3b115ce49dd214d2da4d36ea7ae1bbcc23ae3f69c1ca949af6143cea35124d82ffedc501525ca169af0b58ffb580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c9 = 16#34c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 - 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d37. diff --git a/erts/emulator/test/small_SUITE.erl b/erts/emulator/test/small_SUITE.erl index 7d49522f0592..aa732c1e24d9 100644 --- a/erts/emulator/test/small_SUITE.erl +++ b/erts/emulator/test/small_SUITE.erl @@ -23,10 +23,12 @@ -export([all/0, suite/0, groups/0]). -export([edge_cases/1, - addition/1, subtraction/1, negation/1, multiplication/1, division/1, - test_bitwise/1, test_bsl/1, + addition/1, subtraction/1, negation/1, + multiplication/1, mul_add/1, division/1, + test_bitwise/1, test_bsl/1, test_bsr/1, element/1, range_optimization/1]). +-export([mul_add/0, division/0]). -include_lib("common_test/include/ct.hrl"). @@ -40,8 +42,8 @@ all() -> groups() -> [{p, [parallel], [edge_cases, - addition, subtraction, negation, multiplication, division, - test_bitwise, test_bsl, + addition, subtraction, negation, multiplication, mul_add, division, + test_bitwise, test_bsl, test_bsr, element, range_optimization]}]. @@ -139,6 +141,7 @@ addition(_Config) -> %% merl:print(Tree), {ok,_Bin} = merl:compile_and_load(Tree, []), test_addition(Fs0, Mod), + unload(Mod), ok. add_gen_pairs() -> @@ -247,6 +250,7 @@ subtraction(_Config) -> %% merl:print(Tree), {ok,_Bin} = merl:compile_and_load(Tree, []), test_subtraction(Fs0, Mod), + unload(Mod), ok. 
 sub_gen_pairs() ->
@@ -340,6 +344,7 @@ negation(_Config) ->
     merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_negation(Fs0, Mod),
+    unload(Mod),                                % delete and purge Mod (see unload/1)
     ok.
 
 neg_gen_integers() ->
@@ -405,6 +410,7 @@ multiplication(_Config) ->
     %% merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_multiplication(Fs0, Mod),
+    unload(Mod),                                % delete and purge Mod (see unload/1)
     ok.
 
 mul_gen_pairs() ->
@@ -416,7 +422,9 @@ mul_gen_pairs() ->
              _ <- lists:seq(1, 75)],
 
     %% Generate pairs of numbers whose product is small.
-    Pairs1 = [{N, MaxSmall div N} || N <- [1,2,3,5,17,63,64,1111,22222]] ++ Pairs0,
+    SmallPairs = [{N, MaxSmall div N} ||
+                     N <- [1,2,3,4,5,8,16,17,32,63,64,1111,22222]],
+    Pairs1 = [{N,M-1} || {N,M} <- SmallPairs] ++ SmallPairs ++ Pairs0, % each N is also paired with its quotient minus one
 
     %% Add prime factors of 2^59 - 1 (MAX_SMALL for 64-bit architecture
     %% at the time of writing).
@@ -456,7 +464,11 @@ gen_mul_function({Name,{A,B}}) ->
            Res = Y * X;
         '@Name@'(X, fixed, number) when -_@APlusOne@ < X, X < _@APlusOne@ ->
            X * _@B@;
+        '@Name@'(X, fixed, any) ->
+           X * _@B@;
         '@Name@'(fixed, Y, number) when -_@BPlusOne@ < Y, Y < _@BPlusOne@ ->
+           _@A@ * Y;
+        '@Name@'(fixed, Y, any) ->
            _@A@ * Y. ").
 
 test_multiplication([{Name,{A,B}}|T], Mod) ->
@@ -470,7 +482,9 @@ test_multiplication([{Name,{A,B}}|T], Mod) ->
         Res0 = F(-A, -B, false),
         Res0 = F(A, B, number),
         Res0 = F(fixed, B, number),
+        Res0 = F(fixed, B, any),                % 'any' clauses of the generated function have no guard
         Res0 = F(A, fixed, number),
+        Res0 = F(A, fixed, any),
         Res0 = F(-A, -B, number),
 
         Res1 = -(A * B),
@@ -479,7 +493,9 @@ test_multiplication([{Name,{A,B}}|T], Mod) ->
         Res1 = F(-A, B, number),
         Res1 = F(A, -B, number),
         Res1 = F(-A, fixed, number),
-        Res1 = F(fixed, -B, number)
+        Res1 = F(-A, fixed, any),
+        Res1 = F(fixed, -B, number),
+        Res1 = F(fixed, -B, any)
     catch
         C:R:Stk ->
             io:format("~p failed. numbers: ~p ~p\n", [Name,A,B]),
@@ -490,7 +506,215 @@ test_multiplication([{Name,{A,B}}|T], Mod) ->
 test_multiplication([], _) ->
     ok.
 
+mul_add() ->                                    % test case info function for mul_add/1 (Common Test)
+    [{timetrap, {minutes, 5}}].                 % time limit for the test case: 5 minutes
+mul_add(_Config) ->                             % Test case: generate, load and run X*Y+Z / X*Y-Z functions.
+    _ = rand:uniform(),                         %Seed generator
+    io:format("Seed: ~p", [rand:export_seed()]), % log the seed (presumably so that a failing run can be replayed)
+    Mod = list_to_atom(lists:concat([?MODULE,"_",?FUNCTION_NAME])), % scratch module name: <this module>_mul_add
+    Triples = mul_add_triples(),
+    Fs0 = gen_func_names(Triples, 0),           % entries are consumed below as {Name,{A,B,C}}
+    Fs = [gen_mul_add_function(F) || F <- Fs0],
+    Tree = ?Q(["-module('@Mod@').",
+               "-compile([export_all,nowarn_export_all]).",
+               "id(I) -> I."]) ++ Fs,           % module header and id/1, followed by the generated functions
+    %% merl:print(Tree),
+    {ok,_Bin} = merl:compile_and_load(Tree, []),
+    test_mul_add(Fs0, Mod),
+    unload(Mod),                                % delete and purge the generated module
+
+    test_mul_add_float(),                       % the remaining checks call madd/3,4 instead of Mod
+    test_mul_add_exceptions(),
+
+    ok.
+
+mul_add_triples() ->                            % Returns the list of {A,B,C} triples used by mul_add/1.
+    {_, MaxSmall} = determine_small_limits(0),  % presumably the largest integer that is still a "small"
+    SqrtMaxSmall = floor(math:sqrt(MaxSmall)),
+
+    Numbers0 = [1,2,3,4,5,8,9,
+                (MaxSmall div 2) band -2,       % 'band -2' clears the lowest bit
+                MaxSmall band -2,
+                MaxSmall * 2],                  % greater than MaxSmall, so 'MaxSmall div M' below is 0
+    Numbers = [rand:uniform(SqrtMaxSmall) || _ <- lists:seq(1, 5)] ++ Numbers0, % five random values in 1..SqrtMaxSmall first
+
+    %% Generate pairs of numbers whose product is small.
+    SmallPairs = [{MaxSmall div M,M} || M <- Numbers], % (MaxSmall div M) * M =< MaxSmall
+    Pairs = [{N+M,M} || {N,M} <- SmallPairs] ++ SmallPairs, % (N+M) * M exceeds MaxSmall
+
+    Triples0 = [{A,B,rand:uniform(MaxSmall)} || {A,B} <- Pairs], % random C in 1..MaxSmall
+    Triples1a = [{A,B,abs(MaxSmall - A * B)} || {A,B} <- Pairs], % C is the distance from A*B to MaxSmall
+    Triples1 = [{A,B,C+Offset} ||
+                   {A,B,C} <- Triples1a,
+                   Offset <- [-2,-1,0,1,2],     % also the values within 2 of that distance...
+                   C + Offset >= 0],            % ...as long as C stays non-negative
+    Triples2 = [{A,B,MaxSmall+1} || {A,B} <- Pairs], % C is one above MaxSmall
+    [{3,4,5},
+     {MaxSmall div 2,2,42},                     %Result is not small.
+     {MaxSmall,MaxSmall,MaxSmall}|Triples0 ++ Triples1 ++ Triples2].
+
+gen_mul_add_function({Name,{A,B,C}}) ->         % Builds one generated function Name/4; its first argument selects the variant.
+    APlusOne = A + 1,                           % guards use -APlusOne < X < APlusOne, i.e. -A =< X =< A for integers
+    BPlusOne = B + 1,                           % same for Y and B
+    CPlusOne = C + 1,                           % same for Z and C
+    ?Q("'@Name@'(int_vvv_plus_z, X, Y, Z)
+          when is_integer(X), is_integer(Y), is_integer(Z),
+               -_@APlusOne@ < X, X < _@APlusOne@,
+               -_@BPlusOne@ < Y, Y < _@BPlusOne@,
+               -_@CPlusOne@ < Z, Z < _@CPlusOne@ ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res;
+        '@Name@'(int_vvv_minus_z, X, Y, Z)
+          when is_integer(X), is_integer(Y), is_integer(Z),
+               -_@APlusOne@ < X, X < _@APlusOne@,
+               -_@BPlusOne@ < Y, Y < _@BPlusOne@,
+               -_@CPlusOne@ < Z, Z < _@CPlusOne@ ->
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res;
+        '@Name@'(pos_int_vvv_plus_z, X, Y, Z)
+          when is_integer(X), is_integer(Y), is_integer(Z),
+               0 =< X, X < _@APlusOne@,
+               0 =< Y, Y < _@BPlusOne@,
+               0 =< Z, Z < _@CPlusOne@ ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res;
+        '@Name@'(neg_int_vvv_plus_z, X, Y, Z)
+          when is_integer(X), is_integer(Y), is_integer(Z),
+               -_@APlusOne@ < X, X < 0,
+               -_@BPlusOne@ < Y, Y < 0,
+               -_@CPlusOne@ < Z, Z < 0 ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res;
+        '@Name@'(any_vvv_plus_z, X, Y, Z) ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res = '@Name@'(int_vvv_plus_z, id(X), id(Y), id(Z)),
+           Res;
+        '@Name@'(any_vvv_minus_z, X, Y, Z) ->
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res = '@Name@'(int_vvv_minus_z, id(X), id(Y), id(Z)),
+           Res;
+        '@Name@'(any_vvi_plus_z, X, Y, _Z) ->
+           Z = _@C@,
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res = '@Name@'(any_vvv_plus_z, X, Y, id(Z)),
+           Res = '@Name@'(any_vvv_minus_z, X, Y, id(-Z)),
+           Res;
+        '@Name@'(any_vvi_minus_z, X, Y, _Z) ->
+           Z = _@C@,
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res = id(-Z + X * Y),
+           Res = id(-Z + Y * X),
+           Res = '@Name@'(any_vvv_plus_z, X, Y, id(-Z)),
+           Res = '@Name@'(any_vvv_minus_z, X, Y, id(Z)),
+           Res;
+        '@Name@'(any_vii_plus_z, X, fixed, fixed) ->
+           Y = _@B@,
+           Z = _@C@,
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res = '@Name@'(any_vvi_plus_z, X, id(Y), fixed),
+           Res = '@Name@'(any_vvv_minus_z, X, id(Y), id(-Z)),
+           Res;
+        '@Name@'(any_vii_minus_z, X, fixed, fixed) ->
+           Y = _@B@,
+           Z = _@C@,
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res = id(-Z + X * Y),
+           Res = id(-Z + Y * X),
+           Res = '@Name@'(any_vvi_minus_z, X, id(Y), fixed),
+           Res = '@Name@'(any_vvv_plus_z, X, Y, id(-Z)),
+           Res;
+        '@Name@'({guard_plus_z,Res}, X, Y, Z) when X * Y + Z =:= Res ->
+           ok;
+        '@Name@'({guard_minus_z,Res}, X, Y, Z) when X * Y - Z =:= Res ->
+           ok. ").                              % any_* clauses are unguarded; vvi splices C in as a literal, vii splices both B and C
+
+test_mul_add([{Name,{A,B,C}}|T], Mod) ->        % Checks each generated function against values computed here.
+    F = fun Mod:Name/4,
+    try
+        Res0 = A * B + C,                       % expected value
+        Res0 = F(any_vii_plus_z, A, fixed, fixed), % this clause calls on to the vvi, vvv and int variants
+        Res0 = F(pos_int_vvv_plus_z, A, B, C),
+        ok = F({guard_plus_z,Res0}, A, B, C),   % same expression evaluated in a guard
+        ok = F({guard_plus_z,Res0}, -A, -B, C), % (-A) * (-B) equals A * B
+
+        Res1 = A * B - C,
+        Res1 = F(any_vii_minus_z, A, fixed, fixed),
+        Res1 = if
+                   A > 0, B > 0, C > 0 ->       % the neg_int clause requires all three arguments to be negative
+                       F(neg_int_vvv_plus_z, -A, -B, -C);
+                   true ->
+                       Res1                     % skipped: the match then succeeds trivially
+               end,
+        ok = F({guard_minus_z,Res1}, A, B, C),
+        ok = F({guard_minus_z,Res1}, -A, -B, C),
+
+        Res2 = -A * B + C,                      % one negated factor
+        Res2 = A * -B + C,
+        Res2 = F(any_vii_plus_z, -A, fixed, fixed),
+        ok = F({guard_plus_z,Res2}, -A, B, C),
+
+        Res3 = -A * B - C,
+        Res3 = A * -B - C,
+        Res3 = F(any_vii_minus_z, -A, fixed, fixed),
+        ok = F({guard_minus_z,Res3}, -A, B, C)
+    catch
+        Class:R:Stk ->
+            io:format("~p failed. numbers: ~p ~p ~p\n", [Name,A,B,C]), % report the failing inputs...
+            erlang:raise(Class, R, Stk)         % ...then re-raise with the original stacktrace
+    end,
+    test_mul_add(T, Mod);
+test_mul_add([], _) ->
+    ok.
+
+test_mul_add_float() ->                         % Float operands, first alone and then mixed with an integer addend.
+    Res = madd(id(2.0), id(3.0), id(7.0)),
+    Res = madd(id(2.0), id(3.0), id(7)),        % integer addend must give the same float result
+    ok = madd(id(2.0), id(3.0), id(7), id(Res)). % madd/4 evaluates the expression in a guard
+
+test_mul_add_exceptions() ->                    % Non-numeric operands: the guard form fails quietly, the body form raises badarith.
+    error = madd(id(a), id(2), id(3), id(whatever)), % madd/4: a failing guard falls through to the 'error' clause
+    error = madd(id(7), id(b), id(3), id(whatever)),
+    error = madd(id(7), id(15), id(c), id(whatever)),
+
+    {'EXIT',{badarith,[{erlang,'*',[a,2],_}|_]}} = catch madd(id(a), id(2), id(0)), % madd/3: the stacktrace names the failing operator
+    {'EXIT',{badarith,[{erlang,'*',[a,2],_}|_]}} = catch madd(id(a), id(2), id(42)),
+    {'EXIT',{badarith,[{erlang,'*',[a,2],_}|_]}} = catch madd(id(a), id(2), id(c)), % '*' is reported even though the addend is bad too
+    {'EXIT',{badarith,[{erlang,'*',[3,b],_}|_]}} = catch madd(id(3), id(b), id(c)),
+    {'EXIT',{badarith,[{erlang,'+',[6,c],_}|_]}} = catch madd(id(2), id(3), id(c)), % 2 * 3 succeeds, so '+' is reported with 6 and c
+
+    ok.
+
+madd(A, B, C) -> A * B + C.                     % multiply-add in a function body
+
+madd(A, B, C, Res) when Res =:= A * B + C -> ok; % the same expression in a guard
+madd(_, _, _, _) -> error.                      % guard was false or failed
+
+
 %% Test that the JIT only omits the overflow check when it's safe.
+division() ->                                   % test case info function for division/1 (Common Test)
+    [{timetrap, {minutes, 5}}].                 % time limit for the test case: 5 minutes
 division(_Config) ->
     _ = rand:uniform(),                         %Seed generator
     io:format("Seed: ~p", [rand:export_seed()]),
@@ -507,6 +731,8 @@ division(_Config) ->
     3 = ignore_rem(ignore, 10, 3),
     1 = ignore_div(ignore, 16, 5),
 
+    unload(Mod),                                % delete and purge Mod (see unload/1)
+
     ok.
 
 ignore_rem(_, X, Y) ->
@@ -721,6 +947,7 @@ gen_div_function({Name,{A,B}}) ->
           R = X rem Y,
           {Q, R}. ").
 
+
 test_division([{Name,{A,B}}|T], Mod) ->
     F = fun Mod:Name/3,
     try
@@ -802,6 +1029,7 @@ test_bitwise(_Config) ->
     merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_bitwise(Fs0, Mod),
+    unload(Mod),                                % delete and purge Mod (see unload/1)
 
     %% Test invalid operands.
     expect_badarith(fun(X) -> 42 band X end),
@@ -932,6 +1160,7 @@ test_bsl(_Config) ->
     %% merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_bsl(Fs0, Mod),
+    unload(Mod),                                % delete and purge Mod (see unload/1)
     ok.
 
 bsl_gen_pairs() ->
@@ -990,6 +1219,93 @@ test_bsl([{Name,{N,S}}|T], Mod) ->
 test_bsl([], _) ->
     ok.
 
+test_bsr(_Config) ->                            % Test case: generate, load and run functions built around 'bsr'.
+    _ = rand:uniform(),                         %Seed generator
+    io:format("Seed: ~p", [rand:export_seed()]), % log the seed (presumably so that a failing run can be replayed)
+    Mod = list_to_atom(lists:concat([?MODULE,"_",?FUNCTION_NAME])), % scratch module name: <this module>_test_bsr
+    Pairs = bsr_gen_pairs(),
+    Fs0 = gen_func_names(Pairs, 0),             % entries are consumed below as {Name,{N,S}}
+    Fs = [gen_bsr_function(F) || F <- Fs0],
+    Tree = ?Q(["-module('@Mod@').",
+               "-compile([export_all,nowarn_export_all]).",
+               "id(I) -> I."]) ++ Fs,           % module header and id/1, followed by the generated functions
+    %% merl:print(Tree),
+    {ok,_Bin} = merl:compile_and_load(Tree, []),
+    test_bsr(Fs0, Mod),
+    unload(Mod),                                % delete and purge the generated module
+    ok.
+
+bsr_gen_pairs() ->                              % Returns {N,S} pairs: N is the value to shift, S the shift count.
+    {_MinSmall, MaxSmall} = determine_small_limits(0),
+    SmallBits = num_bits(MaxSmall),             % presumably the number of bits needed for MaxSmall
+
+    {Powers,Shifts} =
+        if
+            SmallBits < 32 ->                   % narrower smalls get smaller exponents and shift counts
+                {lists:seq(15, SmallBits+2),
+                 lists:seq(0, 7) ++ lists:seq(24, 36)};
+            true ->
+                {lists:seq(30, SmallBits+2),    % exponents run to two past SmallBits
+                 lists:seq(0, 7) ++ lists:seq(56, 72)}
+        end,
+
+    [{N,S} ||
+        P <- Powers,
+        N <- [rand:uniform(1 bsl P), (1 bsl P)-1], % a random value in 1..2^P, then 2^P - 1
+        S <- Shifts].
+
+gen_bsr_function({Name,{N,S}}) ->               % Builds Name/3: clause 1 shifts by the literal S, clause 2 by its argument.
+    Mask = (1 bsl num_bits(N)) - 1,             % spliced in as the literal operand of 'band' in both clauses
+    ?Q("'@Name@'(N0, fixed, More) ->
+           Res = N0 bsr _@S@,
+           if
+               More ->
+                   N = N0 band _@Mask@,
+                   Res = N0 bsr _@S@,
+                   Res = N bsr _@S@;
+               true ->
+                   Res
+           end;
+        '@Name@'(N0, S, More) ->
+           Res = id(N0 bsr S),
+           if
+               More ->
+                   N = N0 band _@Mask@,
+                   Res = id(N0 bsr S),
+                   Res = id(N bsr S),
+                   if
+                       S >= 0 ->
+                           Res = id(N bsr S);
+                       true ->
+                           Res
+                   end;
+               true ->
+                   Res
+           end. ").                             % the 'More' branches repeat the shift on N0 and on the masked N and match the results
+
+test_bsr([{Name,{N,S}}|T], Mod) ->              % Compares each generated function with 'bsr' evaluated here.
+    try
+        Res = N bsr S,
+        Res = Mod:Name(N, fixed, true),         % clause with the shift count as a literal
+        Res = Mod:Name(N, S, true),             % clause with the shift count as an argument
+
+        NegRes = -N bsr S,                      % negative N: called with More = false, so no masking is done
+        NegRes = Mod:Name(-N, fixed, false),
+
+        NegRes = -N bsr S,                      % NOTE(review): repeats the match made a few lines above
+        NegRes = Mod:Name(-N, S, false),
+
+        BslRes = N bsr -S,                      % negated shift count (the name suggests it acts as a left shift)
+        BslRes = Mod:Name(N, -S, false)
+    catch
+        C:R:Stk ->
+            io:format("~p failed. numbers: ~p ~p\n", [Name,N,S]), % report the failing inputs...
+            erlang:raise(C, R, Stk)             % ...then re-raise with the original stacktrace
+    end,
+    test_bsr(T, Mod);
+test_bsr([], _) ->
+    ok.
+
 element(_Config) ->
     %% Test element_1: Can't fail for integer arguments.
 zero = element_1(0),
@@ -1198,4 +1514,8 @@ determine_small_limits(N) ->
         false -> {-1 bsl (N - 1), (1 bsl (N - 1)) - 1}
     end.
 
+unload(Mod) ->                                  % Removes a module loaded by a test case; returns the result of code:purge/1.
+    _ = code:delete(Mod),                       % mark the current code as old; the result is ignored
+    code:purge(Mod).                            % then remove the old code
+
 id(I) -> I.