diff --git a/erts/emulator/beam/big.c b/erts/emulator/beam/big.c
index 3c5a2bed4be0..06dc9b02200c 100644
--- a/erts/emulator/beam/big.c
+++ b/erts/emulator/beam/big.c
@@ -52,18 +52,6 @@
         }							\
     } while(0)
 
-/* add a and b with carry in + out */
-#define DSUMc(a,b,c,s) do {						\
-	ErtsDigit ___cr = (c);						\
-	ErtsDigit ___xr = (a)+(___cr);					\
-	ErtsDigit ___yr = (b);						\
-	___cr = (___xr < ___cr);					\
-	___xr = ___yr + ___xr;						\
-	___cr += (___xr < ___yr);					\
-	s = ___xr;							\
-	c = ___cr;							\
-    }  while(0)
-
 /* add a and b with carry out */
 #define DSUM(a,b,c,s) do {					\
 	ErtsDigit ___xr = (a);					\
@@ -136,6 +124,13 @@
 	r = _t % (b);					\
     } while(0)
 
+/* add a, b and carry-in c; s gets the sum digit and c the carry out */
+#define DSUMc(a,b,c,s) do {                                     \
+        ErtsDoubleDigit _t = (ErtsDoubleDigit)(a) + (b) + (c);  \
+        s = DLOW(_t);                                           \
+        c = DHIGH(_t);                                          \
+    }  while(0)
+
 #else
 
 /* If we do not have double digit then we have some more work to do */
@@ -422,6 +417,18 @@
 	D2DIVREM(a1,a0,b1,b0,q,_tmp_r1,_tmp_r0); \
     } while(0)
 
+/* add a, b and carry-in c using single digits only; s gets the sum, c the carry out */
+#define DSUMc(a,b,c,s) do {                     \
+        ErtsDigit ___cr = (c);                  \
+        ErtsDigit ___xr = (a)+(___cr);          \
+        ErtsDigit ___yr = (b);                  \
+        ___cr = (___xr < ___cr);                \
+        ___xr = ___yr + ___xr;                  \
+        ___cr += (___xr < ___yr);               \
+        s = ___xr;                              \
+        c = ___cr;                              \
+    } while(0)
+
 #endif
 
 /* Forward declaration of lookup tables (See below in this file) used in list to
@@ -487,12 +494,10 @@ static dsize_t I_add(ErtsDigit* x, dsize_t xl, ErtsDigit* y, dsize_t yl, ErtsDig
 
     xl -= yl;
     do {
-	xr = *x++ + c;
-	yr = *y++;
-	c = (xr < c);
-	xr = yr + xr;
-	c += (xr < yr);
-	*r++ = xr;
+        xr = *x++;
+        yr = *y++;
+        DSUMc(xr, yr, c, xr);
+        *r++ = xr;
     } while(--yl);
 
     while(xl--) {
@@ -687,44 +692,53 @@ static dsize_t I_sqr(ErtsDigit* x, dsize_t xl, ErtsDigit* r)
 	*x = 0;
 
     while(xl--) {
-	ErtsDigit* y;
-	ErtsDigit y_0 = 0, y_1 = 0, y_2 = 0, y_3 = 0;
-	ErtsDigit b0, b1;
-	ErtsDigit z0, z1, z2;
-	ErtsDigit t;
 	dsize_t y_l = xl;
 
-        d = *x;
-        x++;
-        y = x;
-	s = r;
-
-	DMUL(d, d, b1, b0);
-	DSUMc(*s, b0, y_3, t);
-	*s++ = t;
-	z1 = b1;
-	while(y_l--) {
-	    DMUL(d, *y, b1, b0);
-	    y++;
-	    DSUMc(b0, b0, y_0, z0);
-	    DSUMc(z0, z1, y_2, z2);
-	    DSUMc(*s, z2, y_3, t);
-	    *s++ = t;
-	    DSUMc(b1, b1, y_1, z1);
-	}
-	z0 = y_0;
-	DSUMc(z0, z1, y_2, z2);
-	DSUMc(*s, z2, y_3, t);
-	*s = t;
-	if (xl != 0) {
-	    s++;
-	    t = (y_1+y_2+y_3);
-	    *s = t;
-	    r += 2;
-	}
-	else {
-	    ASSERT((y_1+y_2+y_3) == 0);
-	}
+        d = *x++;
+        s = r;
+
+        if (d == 0) {
+            s += y_l + 1;
+            if (xl != 0) {
+                *++s = 0;
+                r += 2;
+            }
+        } else {
+            ErtsDigit* y;
+            ErtsDigit y_0 = 0, y_1 = 0, y_2 = 0, y_3 = 0;
+            ErtsDigit b0, b1;
+            ErtsDigit z0, z1, z2;
+            ErtsDigit t;
+
+            y = x;
+
+            DMUL(d, d, b1, b0);
+            DSUMc(*s, b0, y_3, t);
+            *s++ = t;
+            z1 = b1;
+            while(y_l--) {
+                DMUL(d, *y, b1, b0);
+                y++;
+                DSUMc(b0, b0, y_0, z0);
+                DSUMc(z0, z1, y_2, z2);
+                DSUMc(*s, z2, y_3, t);
+                *s++ = t;
+                DSUMc(b1, b1, y_1, z1);
+            }
+            z0 = y_0;
+            DSUMc(z0, z1, y_2, z2);
+            DSUMc(*s, z2, y_3, t);
+            *s = t;
+            if (xl != 0) {
+                s++;
+                t = (y_1+y_2+y_3);
+                *s = t;
+                r += 2;
+            }
+            else {
+                ASSERT((y_1+y_2+y_3) == 0);
+            }
+        }
     }
     if (*s == 0)
 	return (s - r0);
@@ -744,7 +758,7 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
 
     if (yl < 16) {
         /* Use the basic algorithm. */
-        if (x == y) {
+        if (x == y && xl > 1) {
             ASSERT(xl == yl);
             return I_sqr(x, xl, r);
         } else {
@@ -754,22 +768,27 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
         /* Use the Karatsuba algorithm. */
         Eterm *heap;
         Uint temp_heap_size;
-        Uint z0_len, z1_len, z2_len, sum0_len, sum1_len, res_len;
+        Uint z0_len, z1_len, z2_len, tmp_len, diff0_len, diff1_len, res_len;
         Uint low_x_len, low_y_len, high_x_len, high_y_len;
-        Eterm *z0_buf, *z1_buf, *z2_buf, *z_res_buf;
-        Eterm *sum0_buf, *sum1_buf;
+        Eterm *z0_buf, *z1_buf, *z2_buf, *tmp_buf;
+        Eterm *diff0_buf, *diff1_buf;
 #ifdef DEBUG
-        Eterm *sum_buf_end, *z_buf_end;
+        Eterm *alloc_end;
 #endif
         Eterm *low_x, *low_y, *high_x, *high_y;
         ErtsDigit zero = 0;
         Uint m = (xl+1) / 2;
+        int tmp_prod_negative = 0;
+        int i;
 
         /* Set up pointers and sizes. */
         low_x = x;
         low_x_len = m;
         high_x = x + m;
         high_x_len = xl - m;
+        while (low_x_len > 1 && low_x[low_x_len-1] == 0) {
+            low_x_len--;
+        }
 
         low_y = y;
         if (yl <= m) {
@@ -782,45 +801,49 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
             high_y = y + m;
             high_y_len = yl - m;
         }
+        while (low_y_len > 1 && low_y[low_y_len-1] == 0) {
+            low_y_len--;
+        }
 
         ASSERT(low_x_len <= m);
         ASSERT(high_x_len <= m);
         ASSERT(low_y_len <= m);
         ASSERT(high_y_len <= m);
 
-        /* Set up buffers for the sums for z1 in the result area. */
-        sum0_buf = r;
-        sum1_buf = r + m + 1;
-
+        /*
+         * Set up temporary buffers in allocated memory.
+         *
+         * z1_buf is not used at the same time as diff0_buf
+         * and diff1_buf, so they can share memory.
+         */
+        temp_heap_size = (4*m + 1) * sizeof(Eterm);
 #ifdef DEBUG
-        sum_buf_end = sum1_buf + m + 1;
-        ASSERT(sum_buf_end - sum0_buf + 1 <= xl + yl);
-        sum1_buf[0] = ERTS_HOLE_MARKER;
-        sum_buf_end[0] = ERTS_HOLE_MARKER;
+        temp_heap_size += sizeof(Eterm);
 #endif
-
-        /* Set up temporary buffers in the allocated memory. */
-        temp_heap_size = (3*(2*m+2) + (xl+yl+1) + 1) * sizeof(Eterm);
         heap = (Eterm *) erts_alloc(ERTS_ALC_T_TMP, temp_heap_size);
-        z0_buf = heap;
-        z1_buf = z0_buf + 2*m + 2;
-        z2_buf = z1_buf + 2*m + 2;
-        z_res_buf = z2_buf + 2*m + 2;
-#ifdef DEBUG
-        z_buf_end = z_res_buf + xl+yl+1;
-#endif
+        z1_buf = heap;
+        diff0_buf = z1_buf + 1;
+        diff1_buf = diff0_buf + m;
+        tmp_buf = diff1_buf + m;
 
 #ifdef DEBUG
         z1_buf[0] = ERTS_HOLE_MARKER;
-        z2_buf[0] = ERTS_HOLE_MARKER;
-        z_res_buf[0] = ERTS_HOLE_MARKER;
-        z_buf_end[0] = ERTS_HOLE_MARKER;
+        diff0_buf[0] = ERTS_HOLE_MARKER;
+        diff1_buf[0] = ERTS_HOLE_MARKER;
+        tmp_buf[0] = ERTS_HOLE_MARKER;
+
+        alloc_end = tmp_buf + 2*m;
+        alloc_end[0] = ERTS_HOLE_MARKER;
+        ASSERT(alloc_end - heap + 1 == temp_heap_size / sizeof(Eterm));
 #endif
 
-        /* z0 = low_x * low_y */
-        z0_len = I_mul_karatsuba(low_x, low_x_len, low_y, low_y_len, z0_buf);
+        /* Set up pointers for the result. */
+        z0_buf = r;
+        z2_buf = r + 2*m;
 
-        ASSERT(z1_buf[0] == ERTS_HOLE_MARKER);
+#ifdef DEBUG
+        z2_buf[0] = ERTS_HOLE_MARKER;
+#endif
 
 #define I_OPERATION(_result, _op, _p1, _sz1, _p2, _sz2, _buf)   \
         do {                                                    \
@@ -832,73 +855,154 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
         } while (0)
 
         /*
-         * z1 = (low1 + high1) * (low2 + high2)
+         * The Karatsuba algorithm is a divide and conquer algorithm
+         * for multi-word integer multiplication. The numbers to be
+         * multiplied are split up like this:
+         *
+         *   high     low
+         *  +--------+--------+
+         *  | high_x | low_x  |
+         *  +--------+--------+
+         *
+         *  +--------+--------+
+         *  | high_y | low_y  |
+         *  +--------+--------+
+         *
+         * Then the following values are calculated:
+         *
+         *  z0 = low_x * low_y
+         *  z2 = high_x * high_y
+         *  z1 = (low_x - high_x) * (high_y - low_y) + z2 + z0
+         *
+         * Note that this expression for z1 produces the same result
+         * as:
+         *
+         *    low_x * high_y + high_x * low_y
+         *
+         * Finally, the z2, z1, z0 values are combined to form the
+         * product of x and y:
+         *
+         *   high     low
+         *  +--+--+ +--+--+
+         *  | z2  | | z0  |
+         *  +--+--+ +--+--+
+         *      +--+--+
+         *  add | z1  |
+         *      +--+--+
+         *
+         * There is an alternate way to calculate z1 (commonly found
+         * in descriptions of the Karatsuba algorithm):
+         *
+         *  z1 = (high_x + low_x) * (high_y + low_y) - z2 - z0
+         *
+         * But this way can lead to more additions and carry handling.
          */
-        I_OPERATION(sum0_len, I_add, low_x, low_x_len, high_x, high_x_len, sum0_buf);
-        ASSERT(sum1_buf[0] == ERTS_HOLE_MARKER);
 
-        I_OPERATION(sum1_len, I_add, low_y, low_y_len, high_y, high_y_len, sum1_buf);
-        ASSERT(sum_buf_end[0] == ERTS_HOLE_MARKER);
-
-        I_OPERATION(z1_len, I_mul_karatsuba, sum0_buf, sum0_len, sum1_buf, sum1_len, z1_buf);
+        /*
+         * z0 = low_x * low_y
+         *
+         * Store this product in its final location in the result buffer.
+         */
+        I_OPERATION(z0_len, I_mul_karatsuba, low_x, low_x_len, low_y, low_y_len, z0_buf);
         ASSERT(z2_buf[0] == ERTS_HOLE_MARKER);
+        for (i = z0_len; i < 2*m; i++) {
+            z0_buf[i] = 0;
+        }
+        while (z0_len > 1 && z0_buf[z0_len - 1] == 0) {
+            z0_len--;
+        }
+        ASSERT(z0_len == 1 || z0_buf[z0_len-1] != 0);
+        ASSERT(z0_len <= low_x_len + low_y_len);
 
         /*
          * z2 = high_x * high_y
+         *
+         * Store this product in its final location in the result buffer.
          */
-
         if (high_y != &zero) {
-            I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len, high_y, high_y_len, z2_buf);
+            I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len,
+                        high_y, high_y_len, z2_buf);
+            while (z2_len > 1 && z2_buf[z2_len - 1] == 0) {
+                z2_len--;
+            }
+            ASSERT(z2_len == 1 || z2_buf[z2_len-1] != 0);
         } else {
             z2_buf[0] = 0;
             z2_len = 1;
         }
-        ASSERT(z_res_buf[0] == ERTS_HOLE_MARKER);
+        ASSERT(z2_len <= high_x_len + high_y_len);
 
         /*
-         * z0 + (z1 × base ^ m) + (z2 × base ^ (m × 2)) - ((z0 + z2) × base ^ m)
+         * tmp = abs(low_x - high_x) * abs(high_y - low_y)
+         *
+         * The absolute value of each difference will fit in m words.
          *
-         * Note that the result of expression before normalization is
-         * not guaranteed to fit in the result buffer provided by the
-         * caller (r). Therefore, we must use a temporary buffer when
-         * calculating it.
+         * Save the sign of the product so that we later can choose to
+         * subtract or add this value.
          */
-
-        /* Copy z0 to temporary result buffer. */
-        res_len = I_add(z0_buf, z0_len, &zero, 1, z_res_buf);
-
-        while (res_len <= m) {
-            z_res_buf[res_len++] = 0;
+        if (I_comp(low_x, low_x_len, high_x, high_x_len) >= 0) {
+            diff0_len = I_sub(low_x, low_x_len, high_x, high_x_len, diff0_buf);
+        } else {
+            tmp_prod_negative = !tmp_prod_negative;
+            diff0_len = I_sub(high_x, high_x_len, low_x, low_x_len, diff0_buf);
         }
+        ASSERT(diff1_buf[0] == ERTS_HOLE_MARKER);
+        ASSERT(diff0_len == 1 || diff0_buf[diff0_len-1] != 0);
+        ASSERT(diff0_len <= m);
 
-        /* Add z1 × base ^ m */
-        I_OPERATION(res_len, I_add, z_res_buf+m, res_len-m, z1_buf, z1_len, z_res_buf+m);
-
-        while (res_len <= m) {
-            z_res_buf[m+res_len++] = 0;
+        if (x == y) {
+            ASSERT(xl == yl);
+            tmp_prod_negative = 1;
+            diff1_buf = diff0_buf;
+            diff1_len = diff0_len;
+        } else if (I_comp(high_y, high_y_len, low_y, low_y_len) >= 0) {
+            diff1_len = I_sub(high_y, high_y_len, low_y, low_y_len, diff1_buf);
+        } else {
+            tmp_prod_negative = !tmp_prod_negative;
+            if (high_y != &zero) {
+                diff1_len = I_sub(low_y, low_y_len, high_y, high_y_len, diff1_buf);
+            } else {
+                diff1_buf = low_y;
+                diff1_len = low_y_len;
+            }
         }
+        ASSERT(tmp_buf[0] == ERTS_HOLE_MARKER);
+        ASSERT(diff1_len == 1 || diff1_buf[diff1_len-1] != 0);
+        ASSERT(diff1_len <= m);
+
+        I_OPERATION(tmp_len, I_mul_karatsuba, diff0_buf, diff0_len, diff1_buf, diff1_len, tmp_buf);
+        ASSERT(alloc_end[0] == ERTS_HOLE_MARKER);
+        while (tmp_len > 1 && tmp_buf[tmp_len - 1] == 0) {
+            tmp_len--;
+        }
+        ASSERT(tmp_len == 1 || tmp_buf[tmp_len-1] != 0);
+        ASSERT(tmp_len <= diff0_len + diff1_len);
 
-        /* Add z2 × base ^ (m × 2) */
-        I_OPERATION(res_len, I_add, z_res_buf+2*m, res_len-m, z2_buf, z2_len, z_res_buf+2*m);
-
-        /* Calculate z0 + z2 */
-        I_OPERATION(z0_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z0_buf);
+        /*
+         * z1 = z0 + z2
+         */
+        I_OPERATION(z1_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z1_buf);
+        ASSERT(z1_len == 1 || z1_buf[z1_len-1] != 0);
 
-        /* Subtract (z0 + z2) × base ^ m */
-        res_len = I_sub(z_res_buf+m, res_len+m, z0_buf, z0_len, z_res_buf+m);
+        if (tmp_prod_negative) {
+            /* z1 = z1 - tmp */
+            z1_len = I_sub(z1_buf, z1_len, tmp_buf, tmp_len, z1_buf);
+        } else {
+            /* z1 = z1 + tmp */
+            I_OPERATION(z1_len, I_add, z1_buf, z1_len, tmp_buf, tmp_len, z1_buf);
+        }
 
-        ASSERT(z_buf_end[0] == ERTS_HOLE_MARKER);
+        /* Add z1 shifted into the result */
+        I_OPERATION(res_len, I_add, z0_buf+m, z2_len+m, z1_buf, z1_len, z0_buf+m);
 
         /* Normalize */
-        while (z_res_buf[m + res_len - 1] == 0 && res_len > 0) {
+        res_len += m;
+        while (res_len > 1 && r[res_len - 1] == 0) {
             res_len--;
         }
-        res_len += m;
+        ASSERT(res_len == 1 || r[res_len-1] != 0);
         ASSERT(res_len <= xl + yl);
 
-        /* Copy result to the the final result buffer. */
-        (void) I_add(z_res_buf, res_len, &zero, 1, r);
-
         erts_free(ERTS_ALC_T_TMP, (void *) heap);
         return res_len;
     }
@@ -2560,6 +2664,36 @@ Eterm big_times(Eterm x, Eterm y, Eterm *r)
     return big_norm(r, rsz, sign);
 }
 
+/*
+** Fused multiplication and addition of bignums: x * y + z, computed in r
+*/
+
+Eterm big_mul_add(Eterm x, Eterm y, Eterm z, Eterm *r)
+{
+    Eterm* xp = big_val(x);
+    Eterm* yp = big_val(y);
+    Eterm* zp = big_val(z);
+
+    short sign = BIG_SIGN(xp) != BIG_SIGN(yp); /* set when the factors' signs differ */
+    dsize_t xsz = BIG_SIZE(xp);
+    dsize_t ysz = BIG_SIZE(yp);
+    dsize_t rsz;
+
+    if (ysz == 1) /* y has size 1: multiply x by that one digit */
+        rsz = D_mul(BIG_V(xp), xsz, BIG_DIGIT(yp, 0), BIG_V(r));
+    else if (xsz == 1) /* x has size 1: multiply y by that one digit */
+        rsz = D_mul(BIG_V(yp), ysz, BIG_DIGIT(xp, 0), BIG_V(r));
+    else if (xsz >= ysz) { /* the longer operand is always passed first */
+        rsz = I_mul_karatsuba(BIG_V(xp), xsz, BIG_V(yp), ysz, BIG_V(r));
+    }
+    else {
+        rsz = I_mul_karatsuba(BIG_V(yp), ysz, BIG_V(xp), xsz, BIG_V(r));
+    }
+    return B_plus_minus(BIG_V(r), rsz, sign, /* product digits in r, then z */
+                        BIG_V(zp), BIG_SIZE(zp), (short) BIG_SIGN(zp),
+                        r);
+}
+
 /*
 ** Fused div_rem for bignums
 */
diff --git a/erts/emulator/beam/big.h b/erts/emulator/beam/big.h
index ceb35a84b83f..b705421ca907 100644
--- a/erts/emulator/beam/big.h
+++ b/erts/emulator/beam/big.h
@@ -135,6 +135,7 @@ Eterm small_times(Sint, Sint, Eterm*);
 Eterm big_plus(Wterm, Wterm, Eterm*);
 Eterm big_minus(Eterm, Eterm, Eterm*);
 Eterm big_times(Eterm, Eterm, Eterm*);
+Eterm big_mul_add(Eterm x, Eterm y, Eterm z, Eterm *r);
 
 int big_div_rem(Eterm lhs, Eterm rhs,
                 Eterm *q_hp, Eterm *q,
diff --git a/erts/emulator/beam/erl_arith.c b/erts/emulator/beam/erl_arith.c
index 3e7f023e5ada..88223778daf0 100644
--- a/erts/emulator/beam/erl_arith.c
+++ b/erts/emulator/beam/erl_arith.c
@@ -867,6 +867,98 @@ erts_mixed_times(Process* p, Eterm arg1, Eterm arg2)
     }
 }
 
+/*
+ * Computes arg1 * arg2 + arg3, using the fused big_mul_add() when all three
+ * are integers and erts_mixed_times() + erts_mixed_plus() otherwise. *pp gets
+ * the product on the fallback path and THE_NON_VALUE on the bignum path.
+ */
+Eterm
+erts_mul_add(Process* p, Eterm arg1, Eterm arg2, Eterm arg3, Eterm* pp)
+{
+    Eterm tmp_big1[2], tmp_big2[2], tmp_big3[2];
+    Eterm hdr, res;
+    Eterm big_arg1 = arg1, big_arg2 = arg2, big_arg3 = arg3;
+    dsize_t sz1, sz2, sz3, sz;
+    int need_heap;
+    Eterm* hp;
+    Eterm product;
+
+    *pp = THE_NON_VALUE; /* callers read *pp when this function fails */
+    switch (big_arg1 & _TAG_PRIMARY_MASK) {
+    case TAG_PRIMARY_IMMED1:
+        if (is_not_small(big_arg1)) {
+            break;
+        }
+        big_arg1 = small_to_big(signed_val(big_arg1), tmp_big1);
+        /* Fall through */
+    case TAG_PRIMARY_BOXED:
+        hdr = *boxed_val(big_arg1);
+        switch ((hdr & _TAG_HEADER_MASK) >> _TAG_PRIMARY_SIZE) {
+        case (_TAG_HEADER_POS_BIG >> _TAG_PRIMARY_SIZE):
+        case (_TAG_HEADER_NEG_BIG >> _TAG_PRIMARY_SIZE):
+            switch (big_arg2 & _TAG_PRIMARY_MASK) {
+            case TAG_PRIMARY_IMMED1:
+                if (is_not_small(big_arg2)) {
+                    break;
+                }
+                big_arg2 = small_to_big(signed_val(big_arg2), tmp_big2);
+                /* Fall through */
+            case TAG_PRIMARY_BOXED:
+                hdr = *boxed_val(big_arg2);
+                switch ((hdr & _TAG_HEADER_MASK) >> _TAG_PRIMARY_SIZE) {
+                case (_TAG_HEADER_POS_BIG >> _TAG_PRIMARY_SIZE):
+                case (_TAG_HEADER_NEG_BIG >> _TAG_PRIMARY_SIZE):
+                    switch (big_arg3 & _TAG_PRIMARY_MASK) {
+                    case TAG_PRIMARY_IMMED1:
+                        if (is_not_small(big_arg3)) {
+                            break;
+                        }
+                        big_arg3 = small_to_big(signed_val(big_arg3), tmp_big3);
+                        /* Fall through */
+                    case TAG_PRIMARY_BOXED:
+                        hdr = *boxed_val(big_arg3);
+                        switch ((hdr & _TAG_HEADER_MASK) >> _TAG_PRIMARY_SIZE) {
+                        case (_TAG_HEADER_POS_BIG >> _TAG_PRIMARY_SIZE):
+                        case (_TAG_HEADER_NEG_BIG >> _TAG_PRIMARY_SIZE):
+                            sz1 = big_size(big_arg1);
+                            sz2 = big_size(big_arg2);
+                            sz3 = big_size(big_arg3);
+                            sz = sz1 + sz2;
+                            sz = MAX(sz, sz3) + 1;
+                            need_heap = BIG_NEED_SIZE(sz);
+#ifdef DEBUG
+                            need_heap++;
+#endif
+                            hp = HeapFragOnlyAlloc(p, need_heap);
+
+#ifdef DEBUG
+                            hp[need_heap-1] = ERTS_HOLE_MARKER;
+#endif
+                            res = big_mul_add(big_arg1, big_arg2, big_arg3, hp);
+                            ASSERT(hp[need_heap-1] == ERTS_HOLE_MARKER);
+                            maybe_shrink(p, hp, res, need_heap);
+                            if (is_nil(res)) {
+                                p->freason = SYSTEM_LIMIT;
+                                return THE_NON_VALUE;
+                            }
+                            return res;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /* At least one of the arguments is a float or invalid. */
+    product = erts_mixed_times(p, arg1, arg2);
+    *pp = product;
+    if (is_non_value(product)) {
+        return product;
+    } else {
+        return erts_mixed_plus(p, product, arg3);
+    }
+}
+
 Eterm
 erts_mixed_div(Process* p, Eterm arg1, Eterm arg2)
 {
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 75db8fe79217..7f8e54b949b1 100644
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1598,6 +1598,7 @@ Eterm erts_unary_minus(Process* p, Eterm arg1);
 Eterm erts_mixed_plus(Process* p, Eterm arg1, Eterm arg2);
 Eterm erts_mixed_minus(Process* p, Eterm arg1, Eterm arg2);
 Eterm erts_mixed_times(Process* p, Eterm arg1, Eterm arg2);
+Eterm erts_mul_add(Process* p, Eterm arg1, Eterm arg2, Eterm arg3, Eterm* pp);
 Eterm erts_mixed_div(Process* p, Eterm arg1, Eterm arg2);
 
 int erts_int_div_rem(Process* p, Eterm arg1, Eterm arg2, Eterm *q, Eterm *r);
diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp
index ab162f951a06..38f1c6875f07 100644
--- a/erts/emulator/beam/jit/arm/beam_asm.hpp
+++ b/erts/emulator/beam/jit/arm/beam_asm.hpp
@@ -1128,6 +1128,15 @@ class BeamModuleAssembler : public BeamAssembler,
                              const a64::Gp rhs_reg,
                              const Label next);
 
+    void emit_div_rem_literal(Sint divisor,
+                              const ArgSource &Dividend,
+                              arm::Gp dividend,
+                              arm::Gp quotient,
+                              arm::Gp remainder,
+                              const Label &generic,
+                              bool need_div,
+                              bool need_rem);
+
     void emit_div_rem(const ArgLabel &Fail,
                       const ArgSource &LHS,
                       const ArgSource &RHS,
diff --git a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
index 59524b32c73b..93b239ddbdb7 100644
--- a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
+++ b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
@@ -92,11 +92,16 @@
     i_loop_rec_shared
     i_test_yield_shared
     i_bxor_body_shared
+    int128_to_big_shared
     int_div_rem_body_shared
     int_div_rem_guard_shared
     is_in_range_shared
     is_ge_lt_shared
     minus_body_shared
+    mul_add_body_shared
+    mul_add_guard_shared
+    mul_body_shared
+    mul_guard_shared
     new_map_shared
     update_map_assoc_shared
     unloaded_fun
@@ -106,8 +111,6 @@
     raise_exception
     raise_exception_shared
     store_unaligned
-    times_body_shared
-    times_guard_shared
     unary_minus_body_shared
     update_map_exact_guard_shared
     update_map_exact_body_shared
diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp
index 485f93956d1e..dbdb05b86abe 100644
--- a/erts/emulator/beam/jit/arm/instr_arith.cpp
+++ b/erts/emulator/beam/jit/arm/instr_arith.cpp
@@ -82,9 +82,15 @@ void BeamModuleAssembler::emit_are_both_small(const ArgSource &LHS,
         a.and_(TMP1, lhs_reg, rhs_reg);
         emit_is_boxed(next, TMP1);
     } else {
-        ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
-        a.and_(TMP1, lhs_reg, rhs_reg);
-        a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK));
+        if (always_small(RHS)) {
+            a.and_(TMP1, lhs_reg, imm(_TAG_IMMED1_MASK));
+        } else if (always_small(LHS)) {
+            a.and_(TMP1, rhs_reg, imm(_TAG_IMMED1_MASK));
+        } else {
+            ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+            a.and_(TMP1, lhs_reg, rhs_reg);
+            a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK));
+        }
         a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
         a.b_eq(next);
     }
@@ -376,45 +382,35 @@ void BeamModuleAssembler::emit_i_minus(const ArgLabel &Fail,
     mov_arg(Dst, ARG1);
 }
 
-/* ARG2 = LHS
- * ARG3 = RHS
+/*
+ * Create a bignum from the 128-bit product of two smalls shifted
+ * left _TAG_IMMED1_SIZE bits.
  *
- * The result is returned in ARG1 (set to THE_NON_VALUE if
- * the call failed).
+ * ARG1 = low 64 bits
+ * TMP2 = high 64 bits
+ *
+ * The result is returned in ARG1.
  */
-void BeamGlobalAssembler::emit_times_guard_shared() {
-    Label generic = a.newLabel();
+void BeamGlobalAssembler::emit_int128_to_big_shared() {
+    Label positive = a.newLabel();
 
-    /* Speculatively untag and multiply. */
-    a.and_(TMP1, ARG2, imm(~_TAG_IMMED1_MASK));
-    a.asr(TMP2, ARG3, imm(_TAG_IMMED1_SIZE));
-    a.mul(TMP3, TMP1, TMP2);
-    a.smulh(TMP4, TMP1, TMP2);
-
-    /* Check that both operands are small integers. */
-    ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
-    a.and_(TMP1, ARG2, ARG3);
-    a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK));
-    a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
-    a.b_ne(generic);
+    a.extr(ARG3, TMP2, ARG1, imm(_TAG_IMMED1_SIZE));
+    a.asr(ARG4, TMP2, imm(_TAG_IMMED1_SIZE));
 
-    /* The high 65 bits of result will all be the same if no overflow
-     * occurred. Another way to say that is that the sign bit of the
-     * low 64 bits repeated 64 times must be equal to the high 64 bits
-     * of the product. */
-    a.cmp(TMP4, TMP3, arm::asr(63));
-    a.b_ne(generic);
+    a.mov(ARG1, c_p);
 
-    a.orr(ARG1, TMP3, imm(_TAG_IMMED1_SMALL));
-    a.ret(a64::x30);
+    a.cmp(ARG4, imm(0));
+    a.cset(ARG2, arm::CondCode::kMI);
 
-    a.bind(generic);
+    a.b_pl(positive);
+    a.negs(ARG3, ARG3);
+    a.ngc(ARG4, ARG4);
 
+    a.bind(positive);
     emit_enter_runtime_frame();
     emit_enter_runtime();
 
-    a.mov(ARG1, c_p);
-    runtime_call<3>(erts_mixed_times);
+    runtime_call<4>(beam_jit_int128_to_big);
 
     emit_leave_runtime();
     emit_leave_runtime_frame();
@@ -422,111 +418,295 @@ void BeamGlobalAssembler::emit_times_guard_shared() {
     a.ret(a64::x30);
 }
 
-/* ARG2 = LHS
- * ARG3 = RHS
+/* ARG2 = Src1
+ * ARG3 = Src2
+ * ARG4 = Src4
  *
  * The result is returned in ARG1.
  */
-void BeamGlobalAssembler::emit_times_body_shared() {
-    Label generic = a.newLabel(), error = a.newLabel();
+void BeamGlobalAssembler::emit_mul_add_body_shared() {
+    Label mul_only = a.newLabel(), error = a.newLabel(),
+          mul_error = a.newLabel(), do_error = a.newLabel();
 
-    /* Speculatively untag and multiply. */
-    a.and_(TMP1, ARG2, imm(~_TAG_IMMED1_MASK));
-    a.asr(TMP2, ARG3, imm(_TAG_IMMED1_SIZE));
-    a.mul(TMP3, TMP1, TMP2);
-    a.smulh(TMP4, TMP1, TMP2);
+    emit_enter_runtime_frame();
+    emit_enter_runtime();
 
-    /* Check that both operands are integers. */
-    ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
-    a.and_(TMP1, ARG2, ARG3);
-    a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK));
-    a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
-    a.b_ne(generic);
+    /* Save original arguments. */
+    a.stp(ARG2, ARG3, TMP_MEM1q);
+    a.mov(ARG1, c_p);
+    a.cmp(ARG4, imm(make_small(0)));
+    a.b_eq(mul_only);
+    a.str(ARG4, TMP_MEM4q);
 
-    /* The high 65 bits of result will all be the same if no overflow
-     * occurred. Another way to say that is that the sign bit of the
-     * low 64 bits repeated 64 times must be equal to the high 64 bits
-     * of the product. */
-    a.cmp(TMP4, TMP3, arm::asr(63));
-    a.b_ne(generic);
+    lea(ARG5, TMP_MEM3q);
+    runtime_call<5>(erts_mul_add);
 
-    a.orr(ARG1, TMP3, imm(_TAG_IMMED1_SMALL));
+    emit_leave_runtime();
+    emit_leave_runtime_frame();
+
+    emit_branch_if_not_value(ARG1, error);
     a.ret(a64::x30);
 
-    a.bind(generic);
+    a.bind(mul_only);
+    {
+        runtime_call<3>(erts_mixed_times);
 
-    /* Save original arguments for the error path. */
-    a.stp(ARG2, ARG3, TMP_MEM1q);
+        emit_leave_runtime();
+        emit_leave_runtime_frame();
+
+        emit_branch_if_not_value(ARG1, mul_error);
+        a.ret(a64::x30);
+    }
+
+    a.bind(error);
+    {
+        static const ErtsCodeMFA mul_mfa = {am_erlang, am_Times, 2};
+        static const ErtsCodeMFA add_mfa = {am_erlang, am_Plus, 2};
+
+        a.ldp(XREG0, XREG1, TMP_MEM3q);
+        mov_imm(ARG4, &add_mfa);
+        emit_branch_if_value(XREG0, do_error);
+
+        a.bind(mul_error);
+        a.ldp(XREG0, XREG1, TMP_MEM1q);
+        mov_imm(ARG4, &mul_mfa);
+
+        a.bind(do_error);
+        a.b(labels[raise_exception]);
+    }
+}
+
+/* ARG2 = Src1
+ * ARG3 = Src2
+ * ARG4 = Src4
+ *
+ * The result is returned in ARG1 (set to THE_NON_VALUE if
+ * the call failed).
+ */
+void BeamGlobalAssembler::emit_mul_add_guard_shared() {
+    Label mul_failed = a.newLabel();
+
+    a.str(ARG4, TMP_MEM1q);
 
     emit_enter_runtime_frame();
     emit_enter_runtime();
 
     a.mov(ARG1, c_p);
     runtime_call<3>(erts_mixed_times);
+    emit_branch_if_not_value(ARG1, mul_failed);
 
+    a.ldr(ARG3, TMP_MEM1q);
+    a.mov(ARG2, ARG1);
+    a.mov(ARG1, c_p);
+    runtime_call<3>(erts_mixed_plus);
+
+    a.bind(mul_failed);
     emit_leave_runtime();
     emit_leave_runtime_frame();
 
-    emit_branch_if_not_value(ARG1, error);
-
     a.ret(a64::x30);
+}
 
-    a.bind(error);
-    {
-        static const ErtsCodeMFA bif_mfa = {am_erlang, am_Times, 2};
+/* ARG2 = Src1
+ * ARG3 = Src2
+ *
+ * Sets ARG4 to small 0 and branches to mul_add_body_shared; result in ARG1.
+ */
+void BeamGlobalAssembler::emit_mul_body_shared() {
+    mov_imm(ARG4, make_small(0));
+    a.b(labels[mul_add_body_shared]);
+}
 
-        /* Place the original arguments in x-registers. */
-        a.ldp(XREG0, XREG1, TMP_MEM1q);
-        mov_imm(ARG4, &bif_mfa);
-        a.b(labels[raise_exception]);
-    }
+/* ARG2 = Src1
+ * ARG3 = Src2
+ *
+ * Sets ARG4 to the small integer 0 and branches to mul_add_guard_shared. The
+ * result is returned in ARG1 (set to THE_NON_VALUE if the call failed).
+ */
+void BeamGlobalAssembler::emit_mul_guard_shared() {
+    mov_imm(ARG4, make_small(0));
+    a.b(labels[mul_add_guard_shared]);
 }
 
-void BeamModuleAssembler::emit_i_times(const ArgLabel &Fail,
-                                       const ArgWord &Live,
-                                       const ArgSource &LHS,
-                                       const ArgSource &RHS,
-                                       const ArgRegister &Dst) {
-    bool is_small_result = is_product_small_if_args_are_small(LHS, RHS);
+void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail,
+                                         const ArgSource &Src1,
+                                         const ArgSource &Src2,
+                                         const ArgSource &Src3,
+                                         const ArgSource &Src4,
+                                         const ArgRegister &Dst) {
+    bool is_product_small = is_product_small_if_args_are_small(Src1, Src2);
+    bool is_sum_small = is_sum_small_if_args_are_small(Src3, Src4);
+    bool is_increment_zero =
+            Src4.isSmall() && Src4.as<ArgSmall>().getSigned() == 0;
+    Sint factor = 0;
+    int left_shift = -1;
+
+    if (is_increment_zero) {
+        comment("(adding zero)");
+    }
 
-    if (always_small(LHS) && always_small(RHS) && is_small_result) {
+    if (Src2.isSmall()) {
+        factor = Src2.as<ArgSmall>().getSigned();
+        if (Support::isPowerOf2(factor)) {
+            left_shift = Support::ctz<Eterm>(factor);
+        }
+    }
+
+    if (always_small(Src1) && Src2.isSmall() && always_small(Src4) &&
+        is_product_small && is_sum_small) {
         auto dst = init_destination(Dst, ARG1);
-        comment("multiplication without overflow check");
-        if (RHS.isSmall()) {
-            auto lhs = load_source(LHS, ARG2);
-            Sint factor = RHS.as<ArgSmall>().getSigned();
+        auto [src1, src4] = load_sources(Src1, ARG2, Src4, ARG3);
+
+        comment("multiplication and addition without overflow check");
+        a.and_(TMP1, src1.reg, imm(~_TAG_IMMED1_MASK));
+        if (left_shift > 0) {
+            comment("optimized multiplication by replacing with left "
+                    "shift");
+            a.add(dst.reg, src4.reg, TMP1, arm::lsl(left_shift));
+        } else {
+            mov_imm(TMP2, factor);
+            a.madd(dst.reg, TMP1, TMP2, src4.reg);
+        }
+        flush_var(dst);
+    } else {
+        Label small = a.newLabel();
+        Label store_result = a.newLabel();
+        auto [src1, src2] = load_sources(Src1, ARG2, Src2, ARG3);
+        auto src4 = load_source(ArgXRegister(0), XREG0);
 
-            a.and_(TMP1, lhs.reg, imm(~_TAG_IMMED1_MASK));
-            if (Support::isPowerOf2(factor)) {
-                int trailing_bits = Support::ctz<Eterm>(factor);
-                comment("optimized multiplication by replacing with left "
-                        "shift");
-                a.lsl(TMP1, TMP1, imm(trailing_bits));
+        if (!is_increment_zero) {
+            src4 = load_source(Src4, ARG4);
+        }
+
+        if (always_small(Src1) && always_small(Src2) && always_small(Src4)) {
+            comment("skipped test for small operands since they are always "
+                    "small");
+        } else {
+            if (always_small(Src4)) {
+                emit_are_both_small(Src1, src1.reg, Src2, src2.reg, small);
+            } else if (always_small(Src2)) {
+                emit_are_both_small(Src1, src1.reg, Src4, src4.reg, small);
             } else {
-                mov_imm(TMP2, factor);
-                a.mul(TMP1, TMP1, TMP2);
+                ASSERT(!is_increment_zero);
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+                a.and_(TMP1, src1.reg, src2.reg);
+                a.and_(TMP1, TMP1, src4.reg);
+                if (always_one_of<BeamTypeId::Integer, BeamTypeId::AlwaysBoxed>(
+                            Src1) &&
+                    always_one_of<BeamTypeId::Integer, BeamTypeId::AlwaysBoxed>(
+                            Src2) &&
+                    always_one_of<BeamTypeId::Integer, BeamTypeId::AlwaysBoxed>(
+                            Src4)) {
+                    emit_is_boxed(small, TMP1);
+                } else {
+                    a.and_(TMP1, TMP1, imm(_TAG_IMMED1_MASK));
+                    a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
+                    a.b_eq(small);
+                }
             }
+
+            mov_var(ARG2, src1);
+            mov_var(ARG3, src2);
+
+            if (Fail.get() != 0) {
+                if (is_increment_zero) {
+                    fragment_call(ga->get_mul_guard_shared());
+                } else {
+                    mov_var(ARG4, src4);
+                    fragment_call(ga->get_mul_add_guard_shared());
+                }
+                emit_branch_if_not_value(ARG1,
+                                         resolve_beam_label(Fail, dispUnknown));
+            } else {
+                if (is_increment_zero) {
+                    fragment_call(ga->get_mul_body_shared());
+                } else {
+                    mov_var(ARG4, src4);
+                    fragment_call(ga->get_mul_add_body_shared());
+                }
+            }
+
+            a.b(store_result);
+        }
+
+        a.bind(small);
+        if (is_increment_zero) {
+            comment("multiply smalls");
         } else {
-            auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3);
-            a.and_(TMP1, lhs.reg, imm(~_TAG_IMMED1_MASK));
-            a.asr(TMP2, rhs.reg, imm(_TAG_IMMED1_SIZE));
-            a.mul(TMP1, TMP1, TMP2);
+            comment("multiply and add smalls");
         }
-        a.orr(dst.reg, TMP1, imm(_TAG_IMMED1_SMALL));
-        flush_var(dst);
-    } else {
-        auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3);
-        mov_var(ARG2, lhs);
-        mov_var(ARG3, rhs);
 
-        if (Fail.get() != 0) {
-            fragment_call(ga->get_times_guard_shared());
-            emit_branch_if_not_value(ARG1,
-                                     resolve_beam_label(Fail, dispUnknown));
+        if (is_product_small && is_sum_small) {
+            arm::Gp increment_reg;
+
+            a.and_(TMP3, src1.reg, imm(~_TAG_IMMED1_MASK));
+
+            if (is_increment_zero) {
+                mov_imm(TMP1, make_small(0));
+                increment_reg = TMP1;
+            } else {
+                increment_reg = src4.reg;
+            }
+
+            if (left_shift > 0) {
+                comment("optimized multiplication by replacing with left "
+                        "shift");
+                a.add(ARG1, increment_reg, TMP3, arm::lsl(left_shift));
+            } else {
+                a.asr(TMP4, src2.reg, imm(_TAG_IMMED1_SIZE));
+                a.madd(ARG1, TMP3, TMP4, increment_reg);
+            }
+
+            comment("skipped test for small result");
         } else {
-            fragment_call(ga->get_times_body_shared());
+            auto min_increment = std::get<0>(getClampedRange(Src4));
+
+            a.and_(TMP3, src1.reg, imm(~_TAG_IMMED1_MASK));
+            if (left_shift == 0) {
+                comment("optimized multiplication by one");
+                a.mov(ARG1, TMP3);
+                a.asr(TMP2, TMP3, imm(63));
+            } else if (left_shift > 0) {
+                comment("optimized multiplication by replacing with left "
+                        "shift");
+                a.lsl(ARG1, TMP3, imm(left_shift));
+                a.asr(TMP2, TMP3, imm(64 - left_shift));
+            } else {
+                ASSERT(left_shift == -1);
+                a.asr(TMP4, src2.reg, imm(_TAG_IMMED1_SIZE));
+                a.mul(ARG1, TMP3, TMP4);
+                a.smulh(TMP2, TMP3, TMP4);
+            }
+
+            if (is_increment_zero) {
+                a.add(ARG1, ARG1, imm(_TAG_IMMED1_SMALL));
+            } else {
+                arm::Gp sign_reg;
+
+                if (min_increment > 0) {
+                    sign_reg = ZERO;
+                } else {
+                    sign_reg = TMP3;
+                    a.asr(sign_reg, src4.reg, imm(63));
+                }
+
+                a.adds(ARG1, ARG1, src4.reg);
+                a.adc(TMP2, TMP2, sign_reg);
+            }
+
+            comment("test whether the result fits in a small");
+            /* The high 65 bits of result will all be the same if no
+             * overflow occurred. Another way to say that is that the
+             * sign bit of the low 64 bits repeated 64 times must be
+             * equal to the high 64 bits of the result. */
+            a.asr(TMP3, ARG1, imm(SMALL_BITS + _TAG_IMMED1_SIZE - 1));
+            a.cmp(TMP2, TMP3);
+            a.b_eq(store_result);
+
+            fragment_call(ga->get_int128_to_big_shared());
         }
 
+        a.bind(store_result);
         mov_arg(Dst, ARG1);
     }
 }
@@ -673,6 +853,97 @@ void BeamGlobalAssembler::emit_int_div_rem_body_shared() {
     }
 }
 
+void BeamModuleAssembler::emit_div_rem_literal(Sint divisor,
+                                               const ArgSource &Dividend,
+                                               arm::Gp dividend,
+                                               arm::Gp quotient,
+                                               arm::Gp remainder,
+                                               const Label &generic,
+                                               bool need_div,
+                                               bool need_rem) {
+    arm::Gp small_tag = TMP6;
+    bool small_dividend = !generic.isValid();
+
+    ASSERT(divisor != (Sint)0);
+
+    if (!small_dividend) {
+        a.and_(small_tag, dividend, imm(_TAG_IMMED1_MASK));
+        a.cmp(small_tag, imm(_TAG_IMMED1_SMALL));
+        a.b_ne(generic);
+    }
+
+    if (Support::isPowerOf2(divisor)) {
+        arm::Gp original_dividend = dividend;
+        int shift = Support::ctz<Eterm>(divisor);
+
+        if (need_div && small_dividend) {
+            mov_imm(small_tag, _TAG_IMMED1_SMALL);
+        }
+
+        ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+        if (std::get<0>(getClampedRange(Dividend)) >= 0) {
+            /* Non-negative dividend. */
+            if (need_div) {
+                comment("optimized div by replacing with right shift");
+                if (need_rem && quotient == dividend) {
+                    original_dividend = TMP5;
+                    a.mov(original_dividend, dividend);
+                }
+                a.orr(quotient, small_tag, dividend, arm::lsr(shift));
+            }
+            if (need_rem) {
+                auto mask = Support::lsbMask<Uint>(shift + _TAG_IMMED1_SIZE);
+                comment("optimized rem by replacing with masking");
+                a.and_(remainder, original_dividend, imm(mask));
+            }
+        } else {
+            /* Possibly negative dividend. */
+            if (need_div) {
+                comment("optimized div by replacing with right shift");
+            }
+            if (divisor == 2) {
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+                a.add(TMP3, dividend, dividend, arm::lsr(63));
+            } else {
+                add(TMP1, dividend, (divisor - 1) << _TAG_IMMED1_SIZE);
+                a.cmp(dividend, imm(0));
+                a.csel(TMP3, TMP1, dividend, imm(arm::CondCode::kLT));
+            }
+            if (need_div) {
+                if (need_rem && quotient == dividend) {
+                    original_dividend = TMP5;
+                    a.mov(original_dividend, dividend);
+                }
+                a.orr(quotient, small_tag, TMP3, arm::asr(shift));
+            }
+            if (need_rem) {
+                Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE);
+                comment("optimized rem by replacing with subtraction");
+                a.and_(TMP1, TMP3, imm(mask));
+                a.sub(remainder, original_dividend, TMP1);
+            }
+        }
+    } else {
+        a.asr(TMP1, dividend, imm(_TAG_IMMED1_SIZE));
+        mov_imm(TMP2, divisor);
+        a.sdiv(quotient, TMP1, TMP2);
+        if (need_rem) {
+            a.msub(remainder, quotient, TMP2, TMP1);
+        }
+
+        if (small_dividend) {
+            mov_imm(small_tag, _TAG_IMMED1_SMALL);
+        }
+        const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE);
+        if (need_div) {
+            a.orr(quotient, small_tag, quotient, tagShift);
+        }
+        if (need_rem) {
+            a.orr(remainder, small_tag, remainder, tagShift);
+        }
+    }
+}
+
 void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
                                        const ArgSource &LHS,
                                        const ArgSource &RHS,
@@ -685,52 +956,26 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
 
     if (RHS.isSmall()) {
         divisor = RHS.as<ArgSmall>().getSigned();
+        if (divisor == -1) {
+            divisor = 0;
+        }
     }
 
-    if (always_small(LHS) && divisor != (Sint)0 && divisor != (Sint)-1) {
+    if (always_small(LHS) && divisor != 0) {
         auto lhs = load_source(LHS, ARG3);
         auto quotient = init_destination(Quotient, ARG1);
         auto remainder = init_destination(Remainder, ARG2);
+        Label invalidLabel; /* Intentionally not initialized */
 
         comment("skipped test for smalls operands and overflow");
-        if (Support::isPowerOf2(divisor) &&
-            std::get<0>(getClampedRange(LHS)) >= 0) {
-            int trailing_bits = Support::ctz<Eterm>(divisor);
-            arm::Gp LHS_reg = lhs.reg;
-            if (need_div) {
-                comment("optimized div by replacing with right shift");
-                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
-                if (need_rem && quotient.reg == lhs.reg) {
-                    LHS_reg = TMP1;
-                    a.mov(LHS_reg, lhs.reg);
-                }
-                a.lsr(quotient.reg, lhs.reg, imm(trailing_bits));
-                a.orr(quotient.reg, quotient.reg, imm(_TAG_IMMED1_SMALL));
-            }
-            if (need_rem) {
-                comment("optimized rem by replacing with masking");
-                auto mask = Support::lsbMask<Uint>(trailing_bits +
-                                                   _TAG_IMMED1_SIZE);
-                a.and_(remainder.reg, LHS_reg, imm(mask));
-            }
-        } else {
-            a.asr(TMP1, lhs.reg, imm(_TAG_IMMED1_SIZE));
-            mov_imm(TMP2, divisor);
-            a.sdiv(quotient.reg, TMP1, TMP2);
-            if (need_rem) {
-                a.msub(remainder.reg, quotient.reg, TMP2, TMP1);
-            }
-
-            mov_imm(TMP3, _TAG_IMMED1_SMALL);
-            const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE);
-            if (need_div) {
-                a.orr(quotient.reg, TMP3, quotient.reg, tagShift);
-            }
-            if (need_rem) {
-                a.orr(remainder.reg, TMP3, remainder.reg, tagShift);
-            }
-        }
-
+        emit_div_rem_literal(divisor,
+                             LHS,
+                             lhs.reg,
+                             quotient.reg,
+                             remainder.reg,
+                             invalidLabel,
+                             need_div,
+                             need_rem);
         if (need_div) {
             flush_var(quotient);
         }
@@ -738,11 +983,24 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
             flush_var(remainder);
         }
     } else {
+        Label generic = a.newLabel(), done = a.newLabel();
         auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3);
 
+        if (divisor != (Sint)0) {
+            emit_div_rem_literal(divisor,
+                                 LHS,
+                                 lhs.reg,
+                                 ARG1,
+                                 ARG2,
+                                 generic,
+                                 need_div,
+                                 need_rem);
+            a.b(done);
+        }
+
+        a.bind(generic);
         mov_var(ARG2, lhs);
         mov_var(ARG3, rhs);
-
         if (Fail.get() != 0) {
             fragment_call(ga->get_int_div_rem_guard_shared());
             a.b_eq(resolve_beam_label(Fail, disp1MB));
@@ -751,6 +1009,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
             fragment_call(ga->get_int_div_rem_body_shared());
         }
 
+        a.bind(done);
         if (need_div) {
             mov_arg(Quotient, ARG1);
         }
@@ -1226,34 +1485,62 @@ void BeamModuleAssembler::emit_i_bsr(const ArgLabel &Fail,
     if (RHS.isSmall()) {
         Sint shift = RHS.as<ArgSmall>().getSigned();
 
-        if (shift >= 0 && shift < SMALL_BITS - 1) {
+        if (shift >= 0) {
+            arm::Gp small_tag = TMP1;
             if (always_small(LHS)) {
                 comment("skipped test for small left operand because it is "
                         "always small");
                 need_generic = false;
+                mov_imm(small_tag, _TAG_IMMED1_SMALL);
             } else if (always_one_of<BeamTypeId::Number>(LHS)) {
                 comment("simplified test for small operand since it is a "
                         "number");
                 emit_is_not_boxed(generic, lhs.reg);
+                mov_imm(small_tag, _TAG_IMMED1_SMALL);
             } else {
-                a.and_(TMP1, lhs.reg, imm(_TAG_IMMED1_MASK));
-                a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
+                a.and_(small_tag, lhs.reg, imm(_TAG_IMMED1_MASK));
+                a.cmp(small_tag, imm(_TAG_IMMED1_SMALL));
                 a.b_ne(generic);
             }
 
             /* We don't need to clear the mask after shifting because
              * _TAG_IMMED1_SMALL will set all the bits anyway. */
             ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
-            a.asr(TMP1, lhs.reg, imm(shift));
-            a.orr(dst.reg, TMP1, imm(_TAG_IMMED1_SMALL));
+            shift = std::min<Sint>(shift, 63);
+            a.orr(dst.reg, small_tag, lhs.reg, arm::asr(shift));
 
             if (need_generic) {
                 a.b(next);
             }
         } else {
-            /* Constant shift is negative or too big to fit the `asr`
-             * instruction; fall back to the generic path. */
+            /* Constant shift is negative; fall back to the generic
+             * path. */
         }
+    } else {
+        auto rhs = load_source(RHS, ARG3);
+
+        /* Ensure that both operands are small and that the shift
+         * count is non-negative. */
+        ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+        a.ands(TMP1, rhs.reg, imm((1ull << 63) | _TAG_IMMED1_MASK));
+        a.and_(TMP1, lhs.reg, TMP1);
+        a.ccmp(TMP1,
+               imm(_TAG_IMMED1_SMALL),
+               imm(NZCV::kNone),
+               arm::CondCode::kPL);
+        a.b_ne(generic);
+
+        /* Untag the shift count and clamp it to 63. */
+        a.asr(TMP1, rhs.reg, imm(_TAG_IMMED1_SIZE));
+        mov_imm(TMP2, 63);
+        a.cmp(TMP1, TMP2);
+        a.csel(TMP1, TMP1, TMP2, imm(arm::CondCode::kLE));
+
+        /* Shift right. */
+        ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
+        a.asr(dst.reg, lhs.reg, TMP1);
+        a.orr(dst.reg, dst.reg, imm(_TAG_IMMED1_SMALL));
+        a.b(next);
     }
 
     a.bind(generic);
diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab
index ed8c51ae3ac5..b0c79d3e2c25 100644
--- a/erts/emulator/beam/jit/arm/ops.tab
+++ b/erts/emulator/beam/jit/arm/ops.tab
@@ -1256,6 +1256,23 @@ i_get_map_element f S S S
 # Arithmetic instructions.
 #
 
+gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 |
+  gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 |
+  equal(Dst1, S3) |
+  equal(Dst1, Dst2) |
+  equal(Fail1, Fail2) =>
+    i_mul_add Fail1 S1 S2 S3 S4 Dst1
+
+gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 |
+  gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 |
+  equal(Dst1, S4) |
+  equal(Dst1, Dst2) |
+  equal(Fail1, Fail2) =>
+    i_mul_add Fail1 S1 S2 S4 S3 Dst1
+
+gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst =>
+    i_mul_add Fail S1 S2 Dst i Dst
+
 gc_bif2 Fail Live u$bif:erlang:splus/2 Src1 Src2 Dst =>
     i_plus Fail Live Src1 Src2 Dst
 
@@ -1265,9 +1282,6 @@ gc_bif1 Fail Live u$bif:erlang:sminus/1 Src Dst =>
 gc_bif2 Fail Live u$bif:erlang:sminus/2 Src1 Src2 Dst =>
     i_minus Fail Live Src1 Src2 Dst
 
-gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst =>
-    i_times Fail Live S1 S2 Dst
-
 gc_bif2 Fail Live u$bif:erlang:div/2 S1 S2 Dst =>
     i_m_div Fail Live S1 S2 Dst
 
@@ -1332,10 +1346,11 @@ gc_bif2 Fail Live u$bif:erlang:bsr/2 S1 S2 Dst =>
 gc_bif2 Fail Live u$bif:erlang:bsl/2 S1 S2 Dst =>
     i_bsl Fail Live S1 S2 Dst
 
+i_mul_add j s s s s d
+
 i_plus j I s s d
 i_unary_minus j I s d
 i_minus j I s s d
-i_times j I s s d
 
 i_m_div j I s s d
 
diff --git a/erts/emulator/beam/jit/beam_jit_common.cpp b/erts/emulator/beam/jit/beam_jit_common.cpp
index 1465c3842f78..8e78e2cf1656 100644
--- a/erts/emulator/beam/jit/beam_jit_common.cpp
+++ b/erts/emulator/beam/jit/beam_jit_common.cpp
@@ -1087,6 +1087,24 @@ Sint beam_jit_bs_bit_size(Eterm term) {
     return (Sint)-1;
 }
 
+Eterm beam_jit_int128_to_big(Process *p, Uint sign, Uint low, Uint high) {
+    Eterm *hp;
+    Uint arity;
+
+    arity = high ? 2 : 1;
+    hp = HeapFragOnlyAlloc(p, BIG_NEED_SIZE(arity));
+    if (sign) {
+        hp[0] = make_neg_bignum_header(arity);
+    } else {
+        hp[0] = make_pos_bignum_header(arity);
+    }
+    BIG_DIGIT(hp, 0) = low;
+    if (arity == 2) {
+        BIG_DIGIT(hp, 1) = high;
+    }
+    return make_big(hp);
+}
+
 ErtsMessage *beam_jit_decode_dist(Process *c_p, ErtsMessage *msgp) {
     if (!erts_proc_sig_decode_dist(c_p, ERTS_PROC_LOCK_MAIN, msgp, 0)) {
         /*
diff --git a/erts/emulator/beam/jit/beam_jit_common.hpp b/erts/emulator/beam/jit/beam_jit_common.hpp
index b6f7239faecb..ddd9c245dc4e 100644
--- a/erts/emulator/beam/jit/beam_jit_common.hpp
+++ b/erts/emulator/beam/jit/beam_jit_common.hpp
@@ -628,6 +628,8 @@ void beam_jit_bs_construct_fail_info(Process *c_p,
                                      Eterm arg1);
 Sint beam_jit_bs_bit_size(Eterm term);
 
+Eterm beam_jit_int128_to_big(Process *p, Uint sign, Uint low, Uint high);
+
 void beam_jit_take_receive_lock(Process *c_p);
 void beam_jit_wait_locked(Process *c_p, ErtsCodePtr cp);
 void beam_jit_wait_unlocked(Process *c_p, ErtsCodePtr cp);
diff --git a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
index 3c620462b35a..9782bbb2269e 100755
--- a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
+++ b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
@@ -90,6 +90,10 @@
     is_ge_lt_shared
     minus_body_shared
     minus_guard_shared
+    mul_add_body_shared
+    mul_add_guard_shared
+    mul_body_shared
+    mul_guard_shared
     new_map_shared
     plus_body_shared
     plus_guard_shared
@@ -98,8 +102,6 @@
     raise_exception
     raise_exception_shared
     store_unaligned
-    times_body_shared
-    times_guard_shared
     unary_minus_body_shared
     unary_minus_guard_shared
     unloaded_fun
diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp
index 888f3109f165..35976cc048d7 100644
--- a/erts/emulator/beam/jit/x86/instr_arith.cpp
+++ b/erts/emulator/beam/jit/x86/instr_arith.cpp
@@ -652,10 +652,10 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
         divisor = RHS.as<ArgSmall>().getSigned();
     }
 
-    if (divisor != (Sint)0 && divisor != (Sint)-1) {
+    mov_arg(x86::rax, LHS);
+
+    if (divisor != 0 && divisor != -1) {
         /* There is no possibility of overflow. */
-        a.mov(ARG6, imm(divisor));
-        mov_arg(x86::rax, LHS);
         if (always_small(LHS)) {
             comment("skipped test for small dividend since it is always small");
             need_generic = false;
@@ -672,10 +672,9 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
             a.short_().jne(generic_div);
         }
 
-        /* Sign-extend and divide. The result is implicitly placed in
-         * RAX and the remainder in RDX (ARG3). */
         if (Support::isPowerOf2(divisor) &&
             std::get<0>(getClampedRange(LHS)) >= 0) {
+            /* Unsigned integer division. */
             int trailing_bits = Support::ctz<Eterm>(divisor);
 
             if (need_rem) {
@@ -692,8 +691,52 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
                 a.shr(x86::rax, imm(trailing_bits));
                 a.or_(x86::rax, imm(_TAG_IMMED1_SMALL));
             }
+        } else if (Support::isPowerOf2(divisor)) {
+            /* Signed integer division. */
+            int shift = Support::ctz<Eterm>(divisor);
+            Sint offset = (divisor - 1) << _TAG_IMMED1_SIZE;
+
+            if (need_rem) {
+                a.mov(x86::rdx, x86::rax);
+                ASSERT(x86::rdx != ARG1);
+            }
+
+            if (need_div) {
+                comment("optimized div by replacing with right shift");
+            }
+
+            if (divisor == 2) {
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+                a.mov(ARG1, x86::rax);
+                a.shr(ARG1, imm(63));
+                a.add(x86::rax, ARG1);
+            } else {
+                if (Support::isInt32(offset)) {
+                    a.lea(ARG1, x86::qword_ptr(x86::rax, offset));
+                } else {
+                    a.mov(ARG1, offset);
+                    a.add(ARG1, x86::rax);
+                }
+                a.test(x86::rax, x86::rax);
+                a.cmovs(x86::rax, ARG1);
+            }
+
+            if (need_rem) {
+                Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE);
+                comment("optimized rem by replacing with subtraction");
+                mov_imm(ARG1, mask);
+                a.and_(ARG1, x86::rax);
+                a.sub(x86::rdx, ARG1);
+            }
+
+            if (need_div) {
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+                a.sar(x86::rax, imm(shift));
+                a.or_(x86::rax, imm(_TAG_IMMED1_SMALL));
+            }
         } else {
             comment("divide with inlined code");
+            a.mov(ARG6, imm(divisor));
             a.sar(x86::rax, imm(_TAG_IMMED1_SIZE));
             a.cqo();
             a.idiv(ARG6);
@@ -723,7 +766,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
     a.bind(generic_div);
     if (need_generic) {
         mov_arg(ARG4, RHS); /* Done first as mov_arg may clobber ARG1 */
-        mov_arg(ARG1, LHS);
+        a.mov(ARG1, x86::rax);
 
         if (Fail.get() != 0) {
             safe_fragment_call(ga->get_int_div_rem_guard_shared());
@@ -823,16 +866,32 @@ void BeamModuleAssembler::emit_i_m_div(const ArgLabel &Fail,
     mov_arg(Dst, RET);
 }
 
-/* ARG2 = LHS, ARG3 (!) = RHS
+/* ARG2 = Src1
+ * ARG3 = Src2
+ * ARG4 = Increment
  *
  * Result is returned in RET, error is indicated by ZF. */
-void BeamGlobalAssembler::emit_times_guard_shared() {
+void BeamGlobalAssembler::emit_mul_add_guard_shared() {
+    Label done = a.newLabel();
+
     emit_enter_frame();
     emit_enter_runtime();
 
+    a.mov(TMP_MEM1q, ARG4);
+
     a.mov(ARG1, c_p);
     runtime_call<3>(erts_mixed_times);
+    emit_test_the_non_value(RET);
+    a.short_().je(done);
+
+    a.mov(ARG3, TMP_MEM1q);
+    a.mov(ARG2, RET);
+    a.mov(ARG1, c_p);
+    a.cmp(ARG3, imm(make_small(0)));
+    a.short_().je(done);
+    runtime_call<3>(erts_mixed_plus);
 
+    a.bind(done);
     emit_leave_runtime();
     emit_leave_frame();
 
@@ -841,13 +900,14 @@ void BeamGlobalAssembler::emit_times_guard_shared() {
     a.ret();
 }
 
-/* ARG2 = LHS, ARG3 (!) = RHS
+/* ARG2 = Src1
+ * ARG3 = Src2
+ * ARG4 = Increment
  *
  * Result is returned in RET. */
-void BeamGlobalAssembler::emit_times_body_shared() {
-    static const ErtsCodeMFA bif_mfa = {am_erlang, am_Times, 2};
-
-    Label error = a.newLabel();
+void BeamGlobalAssembler::emit_mul_add_body_shared() {
+    Label mul_only = a.newLabel(), error = a.newLabel(),
+          mul_error = a.newLabel(), do_error = a.newLabel();
 
     emit_enter_frame();
     emit_enter_runtime();
@@ -855,61 +915,166 @@ void BeamGlobalAssembler::emit_times_body_shared() {
     /* Save original arguments for the error path. */
     a.mov(TMP_MEM1q, ARG2);
     a.mov(TMP_MEM2q, ARG3);
-
     a.mov(ARG1, c_p);
-    runtime_call<3>(erts_mixed_times);
+    a.cmp(ARG4, imm(make_small(0)));
+    a.short_().je(mul_only);
+    a.mov(TMP_MEM4q, ARG4);
+
+    a.lea(ARG5, TMP_MEM3q);
+    runtime_call<5>(erts_mul_add);
 
     emit_leave_runtime();
     emit_leave_frame();
 
     emit_test_the_non_value(RET);
     a.short_().je(error);
+
     a.ret();
 
+    a.bind(mul_only);
+    {
+        runtime_call<3>(erts_mixed_times);
+
+        emit_leave_runtime();
+        emit_leave_frame();
+
+        emit_test_the_non_value(RET);
+        a.short_().je(mul_error);
+
+        a.ret();
+    }
+
     a.bind(error);
     {
-        /* Place the original arguments in x-registers. */
+        static const ErtsCodeMFA mul_mfa = {am_erlang, am_Times, 2};
+        static const ErtsCodeMFA add_mfa = {am_erlang, am_Plus, 2};
+
+        a.mov(ARG1, TMP_MEM3q);
+        a.mov(ARG2, TMP_MEM4q);
+        mov_imm(ARG4, &add_mfa);
+        emit_test_the_non_value(ARG1);
+        a.short_().jne(do_error);
+
+        a.bind(mul_error);
         a.mov(ARG1, TMP_MEM1q);
         a.mov(ARG2, TMP_MEM2q);
+        mov_imm(ARG4, &mul_mfa);
+
+        a.bind(do_error);
         a.mov(getXRef(0), ARG1);
         a.mov(getXRef(1), ARG2);
-
-        a.mov(ARG4, imm(&bif_mfa));
         a.jmp(labels[raise_exception]);
     }
 }
 
-void BeamModuleAssembler::emit_i_times(const ArgLabel &Fail,
-                                       const ArgSource &LHS,
-                                       const ArgSource &RHS,
-                                       const ArgRegister &Dst) {
-    bool small_result = is_product_small_if_args_are_small(LHS, RHS);
+/* ARG2 = Src1
+ * ARG3 = Src2
+ *
+ * The result is returned in RET.
+ */
+void BeamGlobalAssembler::emit_mul_body_shared() {
+    mov_imm(ARG4, make_small(0));
+    a.jmp(labels[mul_add_body_shared]);
+}
 
-    if (always_small(LHS) && always_small(RHS) && small_result) {
-        comment("multiplication without overflow check");
-        if (RHS.isSmall()) {
-            Sint factor = RHS.as<ArgSmall>().getSigned();
+/* ARG2 = Src1
+ * ARG3 = Src2
+ *
+ * Result is returned in RET, error is indicated by ZF.
+ */
+void BeamGlobalAssembler::emit_mul_guard_shared() {
+    mov_imm(ARG4, make_small(0));
+    a.jmp(labels[mul_add_guard_shared]);
+}
+
+void BeamModuleAssembler::emit_i_mul_add(const ArgLabel &Fail,
+                                         const ArgSource &Src1,
+                                         const ArgSource &Src2,
+                                         const ArgSource &Src3,
+                                         const ArgSource &Src4,
+                                         const ArgRegister &Dst) {
+    bool is_product_small = is_product_small_if_args_are_small(Src1, Src2);
+    bool is_sum_small = is_sum_small_if_args_are_small(Src3, Src4);
+    bool is_increment_zero =
+            Src4.isSmall() && Src4.as<ArgSmall>().getSigned() == 0;
+    Sint factor = 0;
+    int left_shift = -1;
+
+    if (is_increment_zero) {
+        comment("(adding zero)");
+    }
+
+    if (Src2.isSmall()) {
+        factor = Src2.as<ArgSmall>().getSigned();
+        if (Support::isPowerOf2(factor)) {
+            left_shift = Support::ctz<Eterm>(factor);
+        }
+    }
+
+    if (always_small(Src1) && Src2.isSmall() && Src4.isSmall() &&
+        is_product_small && is_sum_small) {
+        x86::Mem p;
+        Sint increment = Src4.as<ArgSmall>().get();
+        increment -= factor * _TAG_IMMED1_SMALL;
+
+        switch (factor) {
+        case 2:
+            p = ptr(RET, RET, 0, increment);
+            break;
+        case 3:
+            p = ptr(RET, RET, 1, increment);
+            break;
+        case 4:
+            p = ptr(x86::Gp(), RET, 2, increment);
+            break;
+        case 5:
+            p = ptr(RET, RET, 2, increment);
+            break;
+        case 8:
+            p = ptr(x86::Gp(), RET, 3, increment);
+            break;
+        case 9:
+            p = ptr(RET, RET, 3, increment);
+            break;
+        }
+
+        if (Support::isInt32(increment) && p.hasIndex()) {
+            comment("optimizing multiplication and addition using LEA");
+            mov_arg(RET, Src1);
+            a.lea(RET, p);
+            mov_arg(Dst, RET);
+            return;
+        }
+    }
 
-            mov_arg(RET, LHS);
+    if (always_small(Src1) && Src2.isSmall() && always_small(Src4) &&
+        is_product_small && is_sum_small) {
+        comment("multiplication and addition without overflow check");
+        if (Src2.isSmall()) {
+            mov_arg(RET, Src1);
             a.and_(RET, imm(~_TAG_IMMED1_MASK));
             if (Support::isPowerOf2(factor)) {
-                int trailing_bits = Support::ctz<Eterm>(factor);
                 comment("optimized multiplication by replacing with left "
                         "shift");
-                a.shl(RET, imm(trailing_bits));
+                a.shl(RET, imm(left_shift));
             } else {
                 mov_imm(ARG2, factor);
                 a.imul(RET, ARG2);
             }
         } else {
-            mov_arg(RET, LHS);
-            mov_arg(ARG2, RHS);
+            mov_arg(RET, Src1);
+            mov_arg(ARG2, Src2);
             a.and_(RET, imm(~_TAG_IMMED1_MASK));
             a.sar(ARG2, imm(_TAG_IMMED1_SIZE));
             a.imul(RET, ARG2);
         }
 
-        a.or_(RET, imm(_TAG_IMMED1_SMALL));
+        if (is_increment_zero) {
+            a.or_(RET, imm(_TAG_IMMED1_SMALL));
+        } else {
+            mov_arg(ARG2, Src4);
+            a.add(RET, ARG2);
+        }
         mov_arg(Dst, RET);
 
         return;
@@ -917,39 +1082,81 @@ void BeamModuleAssembler::emit_i_times(const ArgLabel &Fail,
 
     Label next = a.newLabel(), mixed = a.newLabel();
 
-    mov_arg(ARG2, LHS); /* Used by erts_mixed_times in this slot */
-    mov_arg(ARG3, RHS); /* Used by erts_mixed_times in this slot */
+    mov_arg(ARG2, Src1);
+    mov_arg(ARG3, Src2);
+    if (!is_increment_zero) {
+        mov_arg(ARG4, Src4);
+    }
 
-    if (RHS.isSmall()) {
-        Sint val = RHS.as<ArgSmall>().getSigned();
-        emit_is_small(mixed, LHS, ARG2);
+    if (Src2.isSmall()) {
+        Sint val = Src2.as<ArgSmall>().getSigned();
+        emit_are_both_small(mixed, Src1, ARG2, Src4, ARG4);
         a.mov(RET, ARG2);
-        a.mov(ARG4, imm(val));
+        mov_imm(ARG5, val);
     } else {
-        emit_are_both_small(mixed, LHS, ARG2, RHS, ARG3);
+        if (is_increment_zero) {
+            emit_are_both_small(mixed, Src1, ARG2, Src2, ARG3);
+        } else if (always_small(Src1)) {
+            emit_are_both_small(mixed, Src2, ARG3, Src4, ARG4);
+        } else {
+            a.mov(RETd, ARG2.r32());
+            a.and_(RETd, ARG3.r32());
+            a.and_(RETd, ARG4.r32());
+            if (always_one_of<BeamTypeId::Integer, BeamTypeId::AlwaysBoxed>(
+                        Src1) &&
+                always_one_of<BeamTypeId::Integer, BeamTypeId::AlwaysBoxed>(
+                        Src2) &&
+                always_one_of<BeamTypeId::Integer, BeamTypeId::AlwaysBoxed>(
+                        Src4)) {
+                emit_is_not_boxed(mixed, RET);
+            } else {
+                a.and_(RETb, imm(_TAG_IMMED1_MASK));
+                a.cmp(RETb, imm(_TAG_IMMED1_SMALL));
+                a.short_().jne(mixed);
+            }
+        }
         a.mov(RET, ARG2);
-        a.mov(ARG4, ARG3);
-        a.sar(ARG4, imm(_TAG_IMMED1_SIZE));
+        a.mov(ARG5, ARG3);
+        a.sar(ARG5, imm(_TAG_IMMED1_SIZE));
     }
 
     a.and_(RET, imm(~_TAG_IMMED1_MASK));
-    a.imul(RET, ARG4);
-    if (small_result) {
-        comment("skipped overflow check because the result is always small");
+    a.imul(RET, ARG5);
+    if (is_product_small) {
+        comment("skipped overflow check because product is always small");
     } else {
         a.short_().jo(mixed);
     }
-    a.or_(RET, imm(_TAG_IMMED1_SMALL));
+
+    if (is_increment_zero) {
+        a.or_(RET, imm(_TAG_IMMED1_SMALL));
+    } else {
+        a.add(RET, ARG4);
+        if (is_sum_small) {
+            comment("skipped overflow check because sum is always small");
+        } else {
+            a.short_().jo(mixed);
+        }
+    }
+
     a.short_().jmp(next);
 
     /* Call mixed multiplication. */
     a.bind(mixed);
     {
         if (Fail.get() != 0) {
-            safe_fragment_call(ga->get_times_guard_shared());
+            if (is_increment_zero) {
+                safe_fragment_call(ga->get_mul_guard_shared());
+            } else {
+                safe_fragment_call(ga->get_mul_add_guard_shared());
+            }
             a.je(resolve_beam_label(Fail));
         } else {
-            safe_fragment_call(ga->get_times_body_shared());
+            if (is_increment_zero) {
+                safe_fragment_call(ga->get_mul_body_shared());
+            } else {
+                safe_fragment_call(ga->get_mul_add_body_shared());
+            }
         }
     }
 
@@ -1305,13 +1512,14 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
                                      const ArgRegister &Dst) {
     Label generic = a.newLabel(), next = a.newLabel();
     bool need_generic = true;
+    bool need_register_load = true;
 
     mov_arg(ARG2, LHS);
 
     if (RHS.isSmall()) {
         Sint shift = RHS.as<ArgSmall>().getSigned();
 
-        if (shift >= 0 && shift < SMALL_BITS - 1) {
+        if (shift >= 0) {
             if (always_small(LHS)) {
                 comment("skipped test for small left operand because it is "
                         "always small");
@@ -1325,6 +1533,7 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
             /* We don't need to clear the mask after shifting because
              * _TAG_IMMED1_SMALL will set all the bits anyway. */
             ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
+            shift = std::min<Sint>(shift, 63);
             a.sar(RET, imm(shift));
             a.or_(RET, imm(_TAG_IMMED1_SMALL));
 
@@ -1332,14 +1541,33 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
                 a.short_().jmp(next);
             }
         } else {
-            /* Constant shift is negative or too big to fit the `sar`
-             * instruction, fall back to the generic path. */
+            /* Constant shift is negative; fall back to the generic
+             * path. */
         }
+    } else if (hasCpuFeature(CpuFeatures::X86::kBMI2)) {
+        mov_arg(RET, RHS);
+        need_register_load = false;
+
+        emit_are_both_small(generic, LHS, ARG2, RHS, RET);
+
+        a.mov(ARG1, RET);
+        a.sar(ARG1, imm(_TAG_IMMED1_SIZE));
+        a.js(generic);
+
+        mov_imm(RET, 63);
+        a.cmp(ARG1, RET);
+        a.cmova(ARG1, RET);
+
+        a.sarx(RET, ARG2, ARG1);
+        a.or_(RET, imm(_TAG_IMMED1_SMALL));
+        a.short_().jmp(next);
     }
 
     a.bind(generic);
     if (need_generic) {
-        mov_arg(RET, RHS);
+        if (need_register_load) {
+            mov_arg(RET, RHS);
+        }
 
         if (Fail.get() != 0) {
             safe_fragment_call(ga->get_i_bsr_guard_shared());
diff --git a/erts/emulator/beam/jit/x86/ops.tab b/erts/emulator/beam/jit/x86/ops.tab
index e96590b5344c..bbc231311801 100644
--- a/erts/emulator/beam/jit/x86/ops.tab
+++ b/erts/emulator/beam/jit/x86/ops.tab
@@ -1229,13 +1229,27 @@ gc_bif2 Fail Live u$bif:erlang:sminus/2 S1 S2 Dst =>
 # Arithmetic instructions.
 #
 
+gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 |
+  gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 |
+  equal(Dst1, S3) |
+  equal(Dst1, Dst2) |
+  equal(Fail1, Fail2) =>
+    i_mul_add Fail1 S1 S2 S3 S4 Dst1
+
+gc_bif2 Fail1 Live1 u$bif:erlang:stimes/2 S1 S2 Dst1 |
+  gc_bif2 Fail2 Live2 u$bif:erlang:splus/2 S3 S4 Dst2 |
+  equal(Dst1, S4) |
+  equal(Dst1, Dst2) |
+  equal(Fail1, Fail2) =>
+    i_mul_add Fail1 S1 S2 S4 S3 Dst1
+
+gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst =>
+    i_mul_add Fail S1 S2 Dst i Dst
+
 gen_plus Fail Live S1 S2 Dst => i_plus S1 S2 Fail Dst
 
 gen_minus Fail Live S1 S2 Dst => i_minus S1 S2 Fail Dst
 
-gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst =>
-    i_times Fail S1 S2 Dst
-
 gc_bif2 Fail Live u$bif:erlang:div/2 S1 S2 Dst =>
     i_m_div Fail S1 S2 Dst
 
@@ -1304,7 +1318,7 @@ i_minus s s j d
 
 i_unary_minus s j d
 
-i_times j s s d
+i_mul_add j s s s s d
 
 i_m_div j s s d
 
diff --git a/erts/emulator/test/big_SUITE.erl b/erts/emulator/test/big_SUITE.erl
index 2839a970616e..635abc880087 100644
--- a/erts/emulator/test/big_SUITE.erl
+++ b/erts/emulator/test/big_SUITE.erl
@@ -177,6 +177,7 @@ eval({op,_,Op,A0}, LFH) ->
 eval({op,_,Op,A0,B0}, LFH) ->
     [A,B] = eval_list([A0,B0], LFH),
     Res = eval_op(Op, A, B),
+    ok = eval_op_guard(Op, A, B, Res),
     erlang:garbage_collect(),
     Res;
 eval({integer,_,I}, _) ->
@@ -207,6 +208,18 @@ eval_op('bxor', A, B) -> A bxor B;
 eval_op('bsl', A, B) -> A bsl B;
 eval_op('bsr', A, B) -> A bsr B.
 
+eval_op_guard('-', X, Y, Expected) when Expected =:= X - Y -> ok; %Guard twin of eval_op/3.
+eval_op_guard('+', X, Y, Expected) when Expected =:= X + Y -> ok;
+eval_op_guard('*', X, Y, Expected) when Expected =:= X * Y -> ok;
+eval_op_guard('div', X, Y, Expected) when Expected =:= X div Y -> ok;
+eval_op_guard('rem', X, Y, Expected) when Expected =:= X rem Y -> ok;
+eval_op_guard('band', X, Y, Expected) when Expected =:= X band Y -> ok;
+eval_op_guard('bor', X, Y, Expected) when Expected =:= X bor Y -> ok;
+eval_op_guard('bxor', X, Y, Expected) when Expected =:= X bxor Y -> ok;
+eval_op_guard('bsl', X, Y, Expected) when Expected =:= X bsl Y -> ok;
+eval_op_guard('bsr', X, Y, Expected) when Expected =:= X bsr Y -> ok;
+eval_op_guard(Op, X, Y, Expected) -> {error,{Op,X,Y,Expected}}. %No guard agreed.
+
 test_squaring(I) ->
     %% Multiplying an integer by itself is specially optimized, so we
     %% should take special care to test squaring.  The optimization
@@ -520,12 +533,13 @@ properties(_Config) ->
     _ = [begin
              A = id(rand_int()),
              B = id(rand_int()),
-             io:format("~.36#\n~.36#\n", [A,B]),
-             test_properties(A, B)
+             C = id(rand_int()),
+             io:format("~.36#\n~.36#\n~.36#\n", [A,B,C]),
+             test_properties(A, B, C)
          end || _ <- lists:seq(1, 1000)],
     ok.
 
-test_properties(A, B) ->
+test_properties(A, B, C) ->
     SquaredA = id(A * A),
     SquaredB = id(B * B),
 
@@ -543,6 +557,11 @@ test_properties(A, B) ->
     A = id(Sum - B),
     B = id(Sum - A),
     0 = Sum - A - B,
+    C = id(A + B + C) - Sum,
+
+    PS = id(A * B + C),
+    PS = P + C,
+    ok = test_mul_add_guard(A, B, C, PS),
 
     NegA = id(-A),
     A = -NegA,
@@ -563,6 +582,7 @@ test_properties(A, B) ->
 
     ok.
 
+test_mul_add_guard(X, Y, Z, Expected) when Expected =:= X * Y + Z -> ok. %X * Y + Z in a guard.
 
 rand_int() ->
     Sz = max(floor(rand:normal() * 512 + 256), 7),
diff --git a/erts/emulator/test/big_SUITE_data/karatsuba.dat b/erts/emulator/test/big_SUITE_data/karatsuba.dat
index c0a0a7264775..d3eeb1edda63 100644
--- a/erts/emulator/test/big_SUITE_data/karatsuba.dat
+++ b/erts/emulator/test/big_SUITE_data/karatsuba.dat
@@ -2,3 +2,5 @@
 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041.
 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260.
 111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111063449349767560406261235142392341647649025372723765646039413496854749810861561328494820951780143140256625626909176959305680458786553388553625107116000046915830257816726584619722724548281507789670906469943008403108740971884016290536011259624824529432103615715080701414809437629318331795425002605292373621627076391706037344544674952930655508373167888426163639927117607800531055216745789633238812571951209824449373098738720474325546029838911536645854016658738287810538248535167943048401518088275884360994522447081283976255652056784266085927715913003002338073550339930729586019785550085812920084971952784494047195550542735273493532002238413738149969014369426810026122179115108673871125270161926777717396432145405190153796887093089464334888549712739386389592196847353891969079332035195409407770223110583450176399233173852721971468580226906388907065682291829282704839192753805483705821353760692262887904886094845663298931749956912234282496355841821173993089465864518791095006027639953496042767023689710197973028632817727184498866638467324567019639370889678900544190692386873018896672246736368046857567131910710307733567633067262431422838734848 = 227244752509208666244300049023218501664936565624639514576438496566054518678090377206901554539288856996266272083253480866328891556584179795376043393179397439111001075307836347541226766323267029349186809052518632576630669501495499405707466858562586370759397778341785328471459982484138277798132149400741413820728000336730772425983653136275550466388207243285757977918 * 
488949073121539043178895838851406898867685260673611653386208527301320256750945098442460084709113182789107256214464892052884956575049058585335894004824225556978848052700870382486432811126512032810019093103514678580988403100655092773497194921652262878660895819095201202957727846410635903499346091564032100080922029110666198856874372068909543752725127446446275130141704384403153173213642973260347547120927518593494548154693042687813126757638259919480596660015785042800818110628659177941014457037586017066097793890372685757581701160719984224943404424801688527463891923678834379546779645545606419569427574824563111913480237499286872496822079247464512671595600027881545516790726794520875456846025695904828856824122680691695622222780388482248652558345646407544371724549307907546148267185543556530668197490478774267191002528938567372851525085232721844598755176527243119768654561142351728852269053413174476938571816914419435443895662361267309023154705765158795604804766486473731465844362909575530822196003981963565054860681271890620405191718478159479477685388913902388171544282368568608479247952664643776459165494556000797676507398085908822491136.
+16#aeb17ba36a5a62ac6f0aad2b264d0787363825b9f0edf1ddd6e3d06eb970b70c90d5a43da0e234d85a2bd692ac118318965a1fa855019b8c65f32487755dc5677e27863aa4e4a6a82a76884c4d5d78f5b7807151b0179ee3b387b2118211610d832d1e7367a0e3cd50cce3ce2810e3567fc3fddf180c5ccd0572dc0f8662ef54e864e6182c3f951deff6d4a6cead4322e9bf3d55276f9dbdab649fa18fbdeaa89c002e037bb9090b1a5907ab6d18de09f8f376efdc0341ae360aa732405bf83cfe8342d644443208cfb8ef0568cd597de1ce7389878e48863bf0ebf1538ce2c317d8ac9f81976ae51617d7f6939582a8c28375caab30052d8ddf1b2995fb3891ea4541ef3d92bff37b6726052e8d7530b1f64a3cdfbba9cc320b55b2504417ff21986ceaaab8d4f73fafca6076e04fda786562571c5482b1f06b9b2762f51f3c1734284916153b377f147feb9ab398cf9ee46ba272c0ec8685f5a3832ff4e32aca370591f68bf38523839bd7367ebe02170150e87c69c3ff0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 = 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d3734c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 * 16#aeb17ba36a5a62ace6406c497e8f672a2868e5136afa7da54eeb841bd62cc3c37a19b21ca3421a2bcb9adc043a78dc70d23fcb0bf761ee620df5f22122beadfc580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c8cb370962d3b988382e6491831cb48fa4ddf32deb2fdbca9e64763a862beef086fb51183fb9e4af5b71ca36ee3159551bcbfdf55a685e238faef19254d9350dcd811af2dbf8859e4ce82bdb8632091e0100000000000000000000000000000000000000000000000000000000000000000000000000000000.
+16#34c8f69d2c4677c6d19b6e7ce34b705bd0be4db83a7e980e81ca31c352a076a32d17ccd3b115ce49dd214d2da4d36ea7ae1bbcc23ae3f69c1ca949af6143cea35124d82ffedc501525ca169af0b58ffb580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c9 = 16#34c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 - 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d37.
diff --git a/erts/emulator/test/small_SUITE.erl b/erts/emulator/test/small_SUITE.erl
index 7d49522f0592..aa732c1e24d9 100644
--- a/erts/emulator/test/small_SUITE.erl
+++ b/erts/emulator/test/small_SUITE.erl
@@ -23,10 +23,12 @@
 
 -export([all/0, suite/0, groups/0]).
 -export([edge_cases/1,
-         addition/1, subtraction/1, negation/1, multiplication/1, division/1,
-         test_bitwise/1, test_bsl/1,
+         addition/1, subtraction/1, negation/1,
+         multiplication/1, mul_add/1, division/1,
+         test_bitwise/1, test_bsl/1, test_bsr/1,
          element/1,
          range_optimization/1]).
+-export([mul_add/0, division/0]).
 
 -include_lib("common_test/include/ct.hrl").
 
@@ -40,8 +42,8 @@ all() ->
 groups() ->
     [{p, [parallel],
       [edge_cases,
-       addition, subtraction, negation, multiplication, division,
-       test_bitwise, test_bsl,
+       addition, subtraction, negation, multiplication, mul_add, division,
+       test_bitwise, test_bsl, test_bsr,
        element,
        range_optimization]}].
 
@@ -139,6 +141,7 @@ addition(_Config) ->
     %% merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_addition(Fs0, Mod),
+    unload(Mod),
     ok.
 
 add_gen_pairs() ->
@@ -247,6 +250,7 @@ subtraction(_Config) ->
     %% merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_subtraction(Fs0, Mod),
+    unload(Mod),
     ok.
 
 sub_gen_pairs() ->
@@ -340,6 +344,7 @@ negation(_Config) ->
     merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_negation(Fs0, Mod),
+    unload(Mod),
     ok.
 
 neg_gen_integers() ->
@@ -405,6 +410,7 @@ multiplication(_Config) ->
     %% merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_multiplication(Fs0, Mod),
+    unload(Mod),
     ok.
 
 mul_gen_pairs() ->
@@ -416,7 +422,9 @@ mul_gen_pairs() ->
                  _ <- lists:seq(1, 75)],
 
     %% Generate pairs of numbers whose product is small.
-    Pairs1 = [{N, MaxSmall div N} || N <- [1,2,3,5,17,63,64,1111,22222]] ++ Pairs0,
+    SmallPairs = [{N, MaxSmall div N} ||
+                     N <- [1,2,3,4,5,8,16,17,32,63,64,1111,22222]],
+    Pairs1 = [{N,M-1} || {N,M} <- SmallPairs] ++ SmallPairs ++ Pairs0,
 
     %% Add prime factors of 2^59 - 1 (MAX_SMALL for 64-bit architecture
     %% at the time of writing).
@@ -456,7 +464,11 @@ gen_mul_function({Name,{A,B}}) ->
            Res = Y * X;
         '@Name@'(X, fixed, number) when -_@APlusOne@ < X, X < _@APlusOne@ ->
            X * _@B@;
+        '@Name@'(X, fixed, any) ->
+           X * _@B@;
         '@Name@'(fixed, Y, number) when -_@BPlusOne@ < Y, Y < _@BPlusOne@ ->
+           _@A@ * Y;
+        '@Name@'(fixed, Y, any) ->
            _@A@ * Y. ").
 
 test_multiplication([{Name,{A,B}}|T], Mod) ->
@@ -470,7 +482,9 @@ test_multiplication([{Name,{A,B}}|T], Mod) ->
         Res0 = F(-A, -B, false),
         Res0 = F(A, B, number),
         Res0 = F(fixed, B, number),
+        Res0 = F(fixed, B, any),
         Res0 = F(A, fixed, number),
+        Res0 = F(A, fixed, any),
         Res0 = F(-A, -B, number),
 
         Res1 = -(A * B),
@@ -479,7 +493,9 @@ test_multiplication([{Name,{A,B}}|T], Mod) ->
         Res1 = F(-A, B, number),
         Res1 = F(A, -B, number),
         Res1 = F(-A, fixed, number),
-        Res1 = F(fixed, -B, number)
+        Res1 = F(-A, fixed, any),
+        Res1 = F(fixed, -B, number),
+        Res1 = F(fixed, -B, any)
     catch
         C:R:Stk ->
             io:format("~p failed. numbers: ~p ~p\n", [Name,A,B]),
@@ -490,7 +506,215 @@ test_multiplication([{Name,{A,B}}|T], Mod) ->
 test_multiplication([], _) ->
     ok.
 
+mul_add() ->                                    %Test case info for mul_add/1.
+    [{timetrap, {minutes, 5}}].
+mul_add(_Config) ->
+    _ = rand:uniform(),				%Seed generator
+    io:format("Seed: ~p", [rand:export_seed()]), %Log the seed used by this run.
+    Mod = list_to_atom(lists:concat([?MODULE,"_",?FUNCTION_NAME])),
+    Triples = mul_add_triples(),
+    Fs0 = gen_func_names(Triples, 0),           %[{Name,{A,B,C}}], as matched by its users.
+    Fs = [gen_mul_add_function(F) || F <- Fs0],
+    Tree = ?Q(["-module('@Mod@').",
+               "-compile([export_all,nowarn_export_all]).",
+               "id(I) -> I."]) ++ Fs,
+    %% merl:print(Tree),
+    {ok,_Bin} = merl:compile_and_load(Tree, []), %Build and load the generated module.
+    test_mul_add(Fs0, Mod),
+    unload(Mod),                                %Drop the generated module again.
+
+    test_mul_add_float(),
+    test_mul_add_exceptions(),
+
+    ok.
+
+mul_add_triples() ->
+    {_Min, Max} = determine_small_limits(0),
+    Root = floor(math:sqrt(Max)),
+
+    Fixed = [1,2,3,4,5,8,9,
+             (Max div 2) band -2,
+             Max band -2,
+             Max * 2],
+    Factors = [rand:uniform(Root) || _ <- lists:seq(1, 5)] ++ Fixed,
+
+    %% For each factor M, the product of {Max div M,M} is at most Max.
+    Fitting = [{Max div M,M} || M <- Factors],
+    AllPairs = [{N+M,M} || {N,M} <- Fitting] ++ Fitting,
+
+    Random = [{A,B,rand:uniform(Max)} || {A,B} <- AllPairs],
+    NearMax = [{A,B,abs(Max - A * B)} || {A,B} <- AllPairs],
+    Around = [{A,B,C+Delta} ||
+                 {A,B,C} <- NearMax,
+                 Delta <- [-2,-1,0,1,2],
+                 C + Delta >= 0],
+    Beyond = [{A,B,Max+1} || {A,B} <- AllPairs],
+    [{3,4,5},
+     {Max div 2,2,42},                          %Result is not small.
+     {Max,Max,Max}|Random ++ Around ++ Beyond].
+
+gen_mul_add_function({Name,{A,B,C}}) ->         %Quoted clauses of test function Name/4.
+    APlusOne = A + 1,                           %Exclusive bounds for the range guards.
+    BPlusOne = B + 1,
+    CPlusOne = C + 1,
+    ?Q("'@Name@'(int_vvv_plus_z, X, Y, Z)
+          when is_integer(X), is_integer(Y), is_integer(Z),
+               -_@APlusOne@ < X, X < _@APlusOne@,
+               -_@BPlusOne@ < Y, Y < _@BPlusOne@,
+               -_@CPlusOne@ < Z, Z < _@CPlusOne@ ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res;
+        '@Name@'(int_vvv_minus_z, X, Y, Z)
+           when is_integer(X), is_integer(Y), is_integer(Z),
+               -_@APlusOne@ < X, X < _@APlusOne@,
+               -_@BPlusOne@ < Y, Y < _@BPlusOne@,
+               -_@CPlusOne@ < Z, Z < _@CPlusOne@ ->
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res;
+        '@Name@'(pos_int_vvv_plus_z, X, Y, Z)
+          when is_integer(X), is_integer(Y), is_integer(Z),
+               0 =< X, X < _@APlusOne@,
+               0 =< Y, Y < _@BPlusOne@,
+               0 =< Z, Z < _@CPlusOne@ ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res;
+        '@Name@'(neg_int_vvv_plus_z, X, Y, Z)
+          when is_integer(X), is_integer(Y), is_integer(Z),
+               -_@APlusOne@ < X, X < 0,
+               -_@BPlusOne@ < Y, Y < 0,
+               -_@CPlusOne@ < Z, Z < 0 ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res;
+        '@Name@'(any_vvv_plus_z, X, Y, Z) ->
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res = '@Name@'(int_vvv_plus_z, id(X), id(Y), id(Z)),
+           Res;
+        '@Name@'(any_vvv_minus_z, X, Y, Z) ->
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res = '@Name@'(int_vvv_minus_z, id(X), id(Y), id(Z)),
+           Res;
+        '@Name@'(any_vvi_plus_z, X, Y, _Z) ->
+           Z = _@C@,
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res = '@Name@'(any_vvv_plus_z, X, Y, id(Z)),
+           Res = '@Name@'(any_vvv_minus_z, X, Y, id(-Z)),
+           Res;
+        '@Name@'(any_vvi_minus_z, X, Y, _Z) ->
+           Z = _@C@,
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res = id(-Z + X * Y),
+           Res = id(-Z + Y * X),
+           Res = '@Name@'(any_vvv_plus_z, X, Y, id(-Z)),
+           Res = '@Name@'(any_vvv_minus_z, X, Y, id(Z)),
+           Res;
+        '@Name@'(any_vii_plus_z, X, fixed, fixed) ->
+           Y = _@B@,
+           Z = _@C@,
+           Res = id(X * Y + Z),
+           Res = id(Y * X + Z),
+           Res = id(Z + X * Y),
+           Res = id(Z + Y * X),
+           Res = '@Name@'(any_vvi_plus_z, X, id(Y), fixed),
+           Res = '@Name@'(any_vvv_minus_z, X, id(Y), id(-Z)),
+           Res;
+        '@Name@'(any_vii_minus_z, X, fixed, fixed) ->
+           Y = _@B@,
+           Z = _@C@,
+           Res = id(X * Y - Z),
+           Res = id(Y * X - Z),
+           Res = id(-Z + X * Y),
+           Res = id(-Z + Y * X),
+           Res = '@Name@'(any_vvi_minus_z, X, id(Y), fixed),
+           Res = '@Name@'(any_vvv_plus_z, X, Y, id(-Z)),
+           Res;
+        '@Name@'({guard_plus_z,Res}, X, Y, Z) when X * Y + Z =:= Res ->
+           ok;
+        '@Name@'({guard_minus_z,Res}, X, Y, Z) when X * Y - Z =:= Res ->
+           ok. ").
+
+test_mul_add([{Name,{A,B,C}}|Rest], Mod) ->
+    Fun = fun Mod:Name/4,
+    try
+        Sum = A * B + C,                        %Reference value.
+        Sum = Fun(any_vii_plus_z, A, fixed, fixed),
+        Sum = Fun(pos_int_vvv_plus_z, A, B, C),
+        ok = Fun({guard_plus_z,Sum}, A, B, C),
+        ok = Fun({guard_plus_z,Sum}, -A, -B, C),
+
+        Diff = A * B - C,
+        Diff = Fun(any_vii_minus_z, A, fixed, fixed),
+        Diff = if
+                   A > 0, B > 0, C > 0 ->       %Clause needs negative arguments.
+                       Fun(neg_int_vvv_plus_z, -A, -B, -C);
+                   true ->
+                       Diff
+               end,
+        ok = Fun({guard_minus_z,Diff}, A, B, C),
+        ok = Fun({guard_minus_z,Diff}, -A, -B, C),
+
+        NegSum = -A * B + C,
+        NegSum = A * -B + C,
+        NegSum = Fun(any_vii_plus_z, -A, fixed, fixed),
+        ok = Fun({guard_plus_z,NegSum}, -A, B, C),
+
+        NegDiff = -A * B - C,
+        NegDiff = A * -B - C,
+        NegDiff = Fun(any_vii_minus_z, -A, fixed, fixed),
+        ok = Fun({guard_minus_z,NegDiff}, -A, B, C)
+    catch
+        Class:Reason:Stack ->
+            io:format("~p failed. numbers: ~p ~p ~p\n", [Name,A,B,C]),
+            erlang:raise(Class, Reason, Stack)
+    end,
+    test_mul_add(Rest, Mod);
+test_mul_add([], _) ->
+    ok.
+
+test_mul_add_float() ->
+    Expected = madd(id(2.0), id(3.0), id(7.0)),         %All floats.
+    Expected = madd(id(2.0), id(3.0), id(7)),           %Integer addend.
+    ok = madd(id(2.0), id(3.0), id(7), id(Expected)).   %Guard variant.
+
+test_mul_add_exceptions() ->                    %Non-numeric operands.
+    error = madd(id(a), id(2), id(3), id(whatever)),    %In a guard: fails quietly.
+    error = madd(id(7), id(b), id(3), id(whatever)),
+    error = madd(id(7), id(15), id(c), id(whatever)),
+    %% In a body: badarith naming the failing operator and its operands.
+    {'EXIT',{badarith,[{erlang,'*',[a,2],_}|_]}} = catch madd(id(a), id(2), id(0)),
+    {'EXIT',{badarith,[{erlang,'*',[a,2],_}|_]}} = catch madd(id(a), id(2), id(42)),
+    {'EXIT',{badarith,[{erlang,'*',[a,2],_}|_]}} = catch madd(id(a), id(2), id(c)),
+    {'EXIT',{badarith,[{erlang,'*',[3,b],_}|_]}} = catch madd(id(3), id(b), id(c)),
+    {'EXIT',{badarith,[{erlang,'+',[6,c],_}|_]}} = catch madd(id(2), id(3), id(c)),
+
+    ok.
+
+madd(X, Y, Z) -> X * Y + Z.                     %Multiply-add in a function body.
+
+madd(X, Y, Z, Expected) when Expected =:= X * Y + Z -> ok; %Multiply-add in a guard.
+madd(_X, _Y, _Z, _Expected) -> error.
+
+
 %% Test that the JIT only omits the overflow check when it's safe.
+division() ->                                   %Test case info for division/1.
+    [{timetrap, {minutes, 5}}].
 division(_Config) ->
     _ = rand:uniform(),				%Seed generator
     io:format("Seed: ~p", [rand:export_seed()]),
@@ -507,6 +731,8 @@ division(_Config) ->
     3 = ignore_rem(ignore, 10, 3),
     1 = ignore_div(ignore, 16, 5),
 
+    unload(Mod),
+
     ok.
 
 ignore_rem(_, X, Y) ->
@@ -721,6 +947,7 @@ gen_div_function({Name,{A,B}}) ->
            R = X rem Y,
            {Q, R}. ").
 
+
 test_division([{Name,{A,B}}|T], Mod) ->
     F = fun Mod:Name/3,
     try
@@ -802,6 +1029,7 @@ test_bitwise(_Config) ->
     merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_bitwise(Fs0, Mod),
+    unload(Mod),
 
     %% Test invalid operands.
     expect_badarith(fun(X) -> 42 band X end),
@@ -932,6 +1160,7 @@ test_bsl(_Config) ->
     %% merl:print(Tree),
     {ok,_Bin} = merl:compile_and_load(Tree, []),
     test_bsl(Fs0, Mod),
+    unload(Mod),
     ok.
 
 bsl_gen_pairs() ->
@@ -990,6 +1219,93 @@ test_bsl([{Name,{N,S}}|T], Mod) ->
 test_bsl([], _) ->
     ok.
 
+test_bsr(_Config) ->                            %Generate, load and run bsr tests.
+    _ = rand:uniform(),				%Seed generator
+    io:format("Seed: ~p", [rand:export_seed()]), %Log the seed used by this run.
+    Mod = list_to_atom(lists:concat([?MODULE,"_",?FUNCTION_NAME])),
+    Pairs = bsr_gen_pairs(),
+    Fs0 = gen_func_names(Pairs, 0),             %[{Name,{N,S}}], as matched by its users.
+    Fs = [gen_bsr_function(F) || F <- Fs0],
+    Tree = ?Q(["-module('@Mod@').",
+               "-compile([export_all,nowarn_export_all]).",
+               "id(I) -> I."]) ++ Fs,
+    %% merl:print(Tree),
+    {ok,_Bin} = merl:compile_and_load(Tree, []), %Build and load the generated module.
+    test_bsr(Fs0, Mod),
+    unload(Mod),                                %Drop the generated module again.
+    ok.
+
+bsr_gen_pairs() ->
+    {_Min, Max} = determine_small_limits(0),
+    Bits = num_bits(Max),
+
+    {Exponents,Amounts} =
+        case Bits < 32 of
+            true ->                             %Shift counts straddle 32.
+                {lists:seq(15, Bits+2),
+                 lists:seq(0, 7) ++ lists:seq(24, 36)};
+            false ->                            %Shift counts straddle 64.
+                {lists:seq(30, Bits+2),
+                 lists:seq(0, 7) ++ lists:seq(56, 72)}
+        end,
+
+    [{Value,Shift} ||
+        Exp <- Exponents,
+        Value <- [rand:uniform(1 bsl Exp), (1 bsl Exp)-1],
+        Shift <- Amounts].
+
+gen_bsr_function({Name,{N,S}}) ->               %Quoted clauses of test function Name/3.
+    Mask = (1 bsl num_bits(N)) - 1,             %All ones, num_bits(N) bits wide.
+    ?Q("'@Name@'(N0, fixed, More) ->
+           Res = N0 bsr _@S@,
+           if
+               More ->
+                   N = N0 band _@Mask@,
+                   Res = N0 bsr _@S@,
+                   Res = N bsr _@S@;
+               true ->
+                   Res
+           end;
+        '@Name@'(N0, S, More) ->
+           Res = id(N0 bsr S),
+           if
+               More ->
+                   N = N0 band _@Mask@,
+                   Res = id(N0 bsr S),
+                   Res = id(N bsr S),
+                   if
+                      S >= 0 ->
+                          Res = id(N bsr S);
+                      true ->
+                           Res
+                   end;
+               true ->
+                   Res
+           end. ").
+
+test_bsr([{Name,{N,S}}|Rest], Mod) ->
+    try
+        Shifted = N bsr S,                      %Reference value.
+        Shifted = Mod:Name(N, fixed, true),
+        Shifted = Mod:Name(N, S, true),
+
+        NegShifted = -N bsr S,
+        NegShifted = Mod:Name(-N, fixed, false),
+
+        NegShifted = -N bsr S,
+        NegShifted = Mod:Name(-N, S, false),
+
+        Left = N bsr -S,                        %Negated shift count.
+        Left = Mod:Name(N, -S, false)
+    catch
+        Class:Reason:Stack ->
+            io:format("~p failed. numbers: ~p ~p\n", [Name,N,S]),
+            erlang:raise(Class, Reason, Stack)
+    end,
+    test_bsr(Rest, Mod);
+test_bsr([], _) ->
+    ok.
+
 element(_Config) ->
     %% Test element_1: Can't fail for integer arguments.
     zero = element_1(0),
@@ -1198,4 +1514,8 @@ determine_small_limits(N) ->
         false -> {-1 bsl (N - 1), (1 bsl (N - 1)) - 1}
     end.
 
+unload(Module) ->
+    _ = code:delete(Module),                    %Make the current code old.
+    code:purge(Module).                         %Remove the old code.
+
 id(I) -> I.