Skip to content

Commit

Permalink
Optimize the implementation of the Karatsuba algorithm
Browse files Browse the repository at this point in the history
This commit implements the Karatsuba algorithm in a way
that reduces the number of additions, resulting in a measureable
performance improvement for multiplication of large integers.
  • Loading branch information
bjorng committed Aug 18, 2023
1 parent 4253b86 commit 4ca352e
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 67 deletions.
224 changes: 157 additions & 67 deletions erts/emulator/beam/big.c
Original file line number Diff line number Diff line change
Expand Up @@ -768,22 +768,27 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
/* Use the Karatsuba algorithm. */
Eterm *heap;
Uint temp_heap_size;
Uint z0_len, z1_len, z2_len, sum0_len, sum1_len, res_len;
Uint z0_len, z1_len, z2_len, tmp_len, diff0_len, diff1_len, res_len;
Uint low_x_len, low_y_len, high_x_len, high_y_len;
Eterm *z0_buf, *z1_buf, *z2_buf, *z_res_buf;
Eterm *sum0_buf, *sum1_buf;
Eterm *z0_buf, *z1_buf, *z2_buf, *tmp_buf;
Eterm *diff0_buf, *diff1_buf;
#ifdef DEBUG
Eterm *sum_buf_end, *z_buf_end;
Eterm *alloc_end;
#endif
Eterm *low_x, *low_y, *high_x, *high_y;
ErtsDigit zero = 0;
Uint m = (xl+1) / 2;
int tmp_prod_negative = 0;
int i;

/* Set up pointers and sizes. */
low_x = x;
low_x_len = m;
high_x = x + m;
high_x_len = xl - m;
while (low_x_len > 1 && low_x[low_x_len-1] == 0) {
low_x_len--;
}

low_y = y;
if (yl <= m) {
Expand All @@ -796,45 +801,49 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
high_y = y + m;
high_y_len = yl - m;
}
while (low_y_len > 1 && low_y[low_y_len-1] == 0) {
low_y_len--;
}

ASSERT(low_x_len <= m);
ASSERT(high_x_len <= m);
ASSERT(low_y_len <= m);
ASSERT(high_y_len <= m);

/* Set up buffers for the sums for z1 in the result area. */
sum0_buf = r;
sum1_buf = r + m + 1;

/*
* Set up temporary buffers in allocated memory.
*
* z1_buf is not used at the same time as diff0_buf
* and diff1_buf, so they can share memory.
*/
temp_heap_size = (4*m + 1) * sizeof(Eterm);
#ifdef DEBUG
sum_buf_end = sum1_buf + m + 1;
ASSERT(sum_buf_end - sum0_buf + 1 <= xl + yl);
sum1_buf[0] = ERTS_HOLE_MARKER;
sum_buf_end[0] = ERTS_HOLE_MARKER;
temp_heap_size += sizeof(Eterm);
#endif

/* Set up temporary buffers in the allocated memory. */
temp_heap_size = (3*(2*m+2) + (xl+yl+1) + 1) * sizeof(Eterm);
heap = (Eterm *) erts_alloc(ERTS_ALC_T_TMP, temp_heap_size);
z0_buf = heap;
z1_buf = z0_buf + 2*m + 2;
z2_buf = z1_buf + 2*m + 2;
z_res_buf = z2_buf + 2*m + 2;
#ifdef DEBUG
z_buf_end = z_res_buf + xl+yl+1;
#endif
z1_buf = heap;
diff0_buf = z1_buf + 1;
diff1_buf = diff0_buf + m;
tmp_buf = diff1_buf + m;

#ifdef DEBUG
z1_buf[0] = ERTS_HOLE_MARKER;
z2_buf[0] = ERTS_HOLE_MARKER;
z_res_buf[0] = ERTS_HOLE_MARKER;
z_buf_end[0] = ERTS_HOLE_MARKER;
diff0_buf[0] = ERTS_HOLE_MARKER;
diff1_buf[0] = ERTS_HOLE_MARKER;
tmp_buf[0] = ERTS_HOLE_MARKER;

alloc_end = tmp_buf + 2*m;
alloc_end[0] = ERTS_HOLE_MARKER;
ASSERT(alloc_end - heap + 1 == temp_heap_size / sizeof(Eterm));
#endif

/* z0 = low_x * low_y */
z0_len = I_mul_karatsuba(low_x, low_x_len, low_y, low_y_len, z0_buf);
/* Set up pointers for the result. */
z0_buf = r;
z2_buf = r + 2*m;

ASSERT(z1_buf[0] == ERTS_HOLE_MARKER);
#ifdef DEBUG
z2_buf[0] = ERTS_HOLE_MARKER;
#endif

#define I_OPERATION(_result, _op, _p1, _sz1, _p2, _sz2, _buf) \
do { \
Expand All @@ -846,73 +855,154 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
} while (0)

/*
* z1 = (low1 + high1) * (low2 + high2)
* The Karatsuba algorithm is a divide and conquer algorithm
* for multi-word integer multiplication. The numbers to be
* multiplied are split up like this:
*
* high low
* +--------+--------+
* | high_x | low_x |
* +--------+--------+
*
* +--------+--------+
* | high_y | low_y |
* +--------+--------+
*
* Then the following values are calculated:
*
* z0 = low_x * low_y
* z2 = high_x + high_y
* z1 = (low_x - high_x) * (high_y - low_y) + z2 + z0
*
* Note that this expression for z1 produces the same result
* as:
*
* low_x * high_y + high_x * low_y
*
* Finally, the z2, z1, z0 values are combined to form the
* product of x and y:
*
* high low
* +--+--+ +--+--+
* | z2 | | z0 |
* +--+--+ +--+--+
* +--+--+
* add | z1 |
* +--+--+
*
* There is an alternate way to calculate z1 (commonly found
* in descriptions of the Karatsuba algorithm);
*
* z1 = (high_x + low_x) * (high_y + low_y) - z2 - z0
*
* But this way can lead to more additions and carry handling.
*/
I_OPERATION(sum0_len, I_add, low_x, low_x_len, high_x, high_x_len, sum0_buf);
ASSERT(sum1_buf[0] == ERTS_HOLE_MARKER);

I_OPERATION(sum1_len, I_add, low_y, low_y_len, high_y, high_y_len, sum1_buf);
ASSERT(sum_buf_end[0] == ERTS_HOLE_MARKER);

I_OPERATION(z1_len, I_mul_karatsuba, sum0_buf, sum0_len, sum1_buf, sum1_len, z1_buf);
/*
* z0 = low_x * low_y
*
* Store this product in its final location in the result buffer.
*/
I_OPERATION(z0_len, I_mul_karatsuba, low_x, low_x_len, low_y, low_y_len, z0_buf);
ASSERT(z2_buf[0] == ERTS_HOLE_MARKER);
for (i = z0_len; i < 2*m; i++) {
z0_buf[i] = 0;
}
while (z0_len > 1 && z0_buf[z0_len - 1] == 0) {
z0_len--;
}
ASSERT(z0_len == 1 || z0_buf[z0_len-1] != 0);
ASSERT(z0_len <= low_x_len + low_y_len);

/*
* z2 = high_x * high_y
*
* Store this product in its final location in the result buffer.
*/

if (high_y != &zero) {
I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len, high_y, high_y_len, z2_buf);
I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len,
high_y, high_y_len, z2_buf);
while (z2_len > 1 && z2_buf[z2_len - 1] == 0) {
z2_len--;
}
ASSERT(z2_len == 1 || z2_buf[z2_len-1] != 0);
} else {
z2_buf[0] = 0;
z2_len = 1;
}
ASSERT(z_res_buf[0] == ERTS_HOLE_MARKER);
ASSERT(z2_len <= high_x_len + high_y_len);

/*
* z0 + (z1 × base ^ m) + (z2 × base ^ (m × 2)) - ((z0 + z2) × base ^ m)
* tmp = abs(low_x - high_x) * abs(high_y - low_y)
*
* The absolute value of each difference will fit in m words.
*
* Note that the result of expression before normalization is
* not guaranteed to fit in the result buffer provided by the
* caller (r). Therefore, we must use a temporary buffer when
* calculating it.
* Save the sign of the product so that we later can choose to
* subtract or add this value.
*/

/* Copy z0 to temporary result buffer. */
res_len = I_add(z0_buf, z0_len, &zero, 1, z_res_buf);

while (res_len <= m) {
z_res_buf[res_len++] = 0;
if (I_comp(low_x, low_x_len, high_x, high_x_len) >= 0) {
diff0_len = I_sub(low_x, low_x_len, high_x, high_x_len, diff0_buf);
} else {
tmp_prod_negative = !tmp_prod_negative;
diff0_len = I_sub(high_x, high_x_len, low_x, low_x_len, diff0_buf);
}
ASSERT(diff1_buf[0] == ERTS_HOLE_MARKER);
ASSERT(diff0_len == 1 || diff0_buf[diff0_len-1] != 0);
ASSERT(diff0_len <= m);

/* Add z1 × base ^ m */
I_OPERATION(res_len, I_add, z_res_buf+m, res_len-m, z1_buf, z1_len, z_res_buf+m);

while (res_len <= m) {
z_res_buf[m+res_len++] = 0;
if (x == y) {
ASSERT(xl == yl);
tmp_prod_negative = 1;
diff1_buf = diff0_buf;
diff1_len = diff0_len;
} else if (I_comp(high_y, high_y_len, low_y, low_y_len) >= 0) {
diff1_len = I_sub(high_y, high_y_len, low_y, low_y_len, diff1_buf);
} else {
tmp_prod_negative = !tmp_prod_negative;
if (high_y != &zero) {
diff1_len = I_sub(low_y, low_y_len, high_y, high_y_len, diff1_buf);
} else {
diff1_buf = low_y;
diff1_len = low_y_len;
}
}
ASSERT(tmp_buf[0] == ERTS_HOLE_MARKER);
ASSERT(diff1_len == 1 || diff1_buf[diff1_len-1] != 0);
ASSERT(diff1_len <= m);

I_OPERATION(tmp_len, I_mul_karatsuba, diff0_buf, diff0_len, diff1_buf, diff1_len, tmp_buf);
ASSERT(alloc_end[0] == ERTS_HOLE_MARKER);
while (tmp_len > 1 && tmp_buf[tmp_len - 1] == 0) {
tmp_len--;
}
ASSERT(tmp_len == 1 || tmp_buf[tmp_len-1] != 0);
ASSERT(tmp_len <= diff0_len + diff1_len);

/* Add z2 × base ^ (m × 2) */
I_OPERATION(res_len, I_add, z_res_buf+2*m, res_len-m, z2_buf, z2_len, z_res_buf+2*m);

/* Calculate z0 + z2 */
I_OPERATION(z0_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z0_buf);
/*
* z1 = z0 + z2
*/
I_OPERATION(z1_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z1_buf);
ASSERT(z1_len == 1 || z1_buf[z1_len-1] != 0);

/* Subtract (z0 + z2) × base ^ m */
res_len = I_sub(z_res_buf+m, res_len+m, z0_buf, z0_len, z_res_buf+m);
if (tmp_prod_negative) {
/* z1 = z1 - tmp */
z1_len = I_sub(z1_buf, z1_len, tmp_buf, tmp_len, z1_buf);
} else {
/* z1 = z1 + tmp */
I_OPERATION(z1_len, I_add, z1_buf, z1_len, tmp_buf, tmp_len, z1_buf);
}

ASSERT(z_buf_end[0] == ERTS_HOLE_MARKER);
/* Add z1 shifted into the result */
I_OPERATION(res_len, I_add, z0_buf+m, z2_len+m, z1_buf, z1_len, z0_buf+m);

/* Normalize */
while (z_res_buf[m + res_len - 1] == 0 && res_len > 0) {
res_len += m;
while (res_len > 1 && r[res_len - 1] == 0) {
res_len--;
}
res_len += m;
ASSERT(res_len == 1 || r[res_len-1] != 0);
ASSERT(res_len <= xl + yl);

/* Copy result to the the final result buffer. */
(void) I_add(z_res_buf, res_len, &zero, 1, r);

erts_free(ERTS_ALC_T_TMP, (void *) heap);
return res_len;
}
Expand Down
2 changes: 2 additions & 0 deletions erts/emulator/test/big_SUITE_data/karatsuba.dat
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041.
2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260.
111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111063449349767560406261235142392341647649025372723765646039413496854749810861561328494820951780143140256625626909176959305680458786553388553625107116000046915830257816726584619722724548281507789670906469943008403108740971884016290536011259624824529432103615715080701414809437629318331795425002605292373621627076391706037344544674952930655508373167888426163639927117607800531055216745789633238812571951209824449373098738720474325546029838911536645854016658738287810538248535167943048401518088275884360994522447081283976255652056784266085927715913003002338073550339930729586019785550085812920084971952784494047195550542735273493532002238413738149969014369426810026122179115108673871125270161926777717396432145405190153796887093089464334888549712739386389592196847353891969079332035195409407770223110583450176399233173852721971468580226906388907065682291829282704839192753805483705821353760692262887904886094845663298931749956912234282496355841821173993089465864518791095006027639953496042767023689710197973028632817727184498866638467324567019639370889678900544190692386873018896672246736368046857567131910710307733567633067262431422838734848 = 227244752509208666244300049023218501664936565624639514576438496566054518678090377206901554539288856996266272083253480866328891556584179795376043393179397439111001075307836347541226766323267029349186809052518632576630669501495499405707466858562586370759397778341785328471459982484138277798132149400741413820728000336730772425983653136275550466388207243285757977918 * 488949073121539043178895838851406898867685260673611653386208527301320256750945098442460084709113182789107256214464892052884956575049058585335894004824225556978848052700870382486432811126512032810019093103514678580988403100655092773497194921652262878660895819095201202957727846410635903499346091564032100080922029110666198856874372068909543752725127446446275130141704384403153173213642973260347547120927518593494548154693042687813126757638259919480596660015785042800818110628659177941014457037586017066097793890372685757581701160719984224943404424801688527463891923678834379546779645545606419569427574824563111913480237499286872496822079247464512671595600027881545516790726794520875456846025695904828856824122680691695622222780388482248652558345646407544371724549307907546148267185543556530668197490478774267191002528938567372851525085232721844598755176527243119768654561142351728852269053413174476938571816914419435443895662361267309023154705765158795604804766486473731465844362909575530822196003981963565054860681271890620405191718478159479477685388913902388171544282368568608479247952664643776459165494556000797676507398085908822491136.
16#aeb17ba36a5a62ac6f0aad2b264d0787363825b9f0edf1ddd6e3d06eb970b70c90d5a43da0e234d85a2bd692ac118318965a1fa855019b8c65f32487755dc5677e27863aa4e4a6a82a76884c4d5d78f5b7807151b0179ee3b387b2118211610d832d1e7367a0e3cd50cce3ce2810e3567fc3fddf180c5ccd0572dc0f8662ef54e864e6182c3f951deff6d4a6cead4322e9bf3d55276f9dbdab649fa18fbdeaa89c002e037bb9090b1a5907ab6d18de09f8f376efdc0341ae360aa732405bf83cfe8342d644443208cfb8ef0568cd597de1ce7389878e48863bf0ebf1538ce2c317d8ac9f81976ae51617d7f6939582a8c28375caab30052d8ddf1b2995fb3891ea4541ef3d92bff37b6726052e8d7530b1f64a3cdfbba9cc320b55b2504417ff21986ceaaab8d4f73fafca6076e04fda786562571c5482b1f06b9b2762f51f3c1734284916153b377f147feb9ab398cf9ee46ba272c0ec8685f5a3832ff4e32aca370591f68bf38523839bd7367ebe02170150e87c69c3ff0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 = 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d3734c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 * 16#aeb17ba36a5a62ace6406c497e8f672a2868e5136afa7da54eeb841bd62cc3c37a19b21ca3421a2bcb9adc043a78dc70d23fcb0bf761ee620df5f22122beadfc580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c8cb370962d3b988382e6491831cb48fa4ddf32deb2fdbca9e64763a862beef086fb51183fb9e4af5b71ca36ee3159551bcbfdf55a685e238faef19254d9350dcd811af2dbf8859e4ce82bdb8632091e0100000000000000000000000000000000000000000000000000000000000000000000000000000000.
16#34c8f69d2c4677c6d19b6e7ce34b705bd0be4db83a7e980e81ca31c352a076a32d17ccd3b115ce49dd214d2da4d36ea7ae1bbcc23ae3f69c1ca949af6143cea35124d82ffedc501525ca169af0b58ffb580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c9 = 16#34c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 - 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d37.

0 comments on commit 4ca352e

Please sign in to comment.