From ecb54e8afc2375074ed65d0b274d9abf081ad9f7 Mon Sep 17 00:00:00 2001
From: Marshall Ward <marshall.ward@noaa.gov>
Date: Sun, 28 Jan 2024 13:53:55 -0500
Subject: [PATCH 1/3] Intrinsics: Faster cuberoot scaling functions

This patch replaces the intrinsic-based exponent rescaling with explicit
bit manipulation of the floating point number.

This appears to produce a ~2.5x speedup of the solver, reducing its time
from embarassingly slow to disappointingly slow.  It is slightly faster
than the GNU cbrt function, but still about 3x slower than the Intel
SVML cbrt function.

Timings (s) (16M array, -O3 -mavx -mfma)

| Solver              |  -O2  |  -O3  |
|---------------------|-------|-------|
| GNU x**1/3          | 0.225 | 0.198 |
| GNU cuberoot before | 0.418 | 0.412 |
| GNU cuberoot after  | 0.208 | 0.187 |
| Intel x**1/3        | 0.068 | 0.067 |
| Intel before        | 0.514 | 0.507 |
| Intel after         | 0.213 | 0.189 |

At least one issue here is that Intel SVML is using fast vectorized
logic operators whereas the Fortran intrinsics are replaced with slower
legacy scalar versions.  Not sure there is much we could even do about
that without complaining to vendors.

Also, I'm sure there's magic in their solvers which we are not
capturing.  Regardless, I think this is a major improvement.

I do not believe it will change answers, but probably a good idea to
verify this and get it in before committing any solutions using
cuberoot().
---
 src/framework/MOM_intrinsic_functions.F90 | 104 ++++++++++++++++++++--
 1 file changed, 98 insertions(+), 6 deletions(-)

diff --git a/src/framework/MOM_intrinsic_functions.F90 b/src/framework/MOM_intrinsic_functions.F90
index 4327cfa5a6..5d420057d4 100644
--- a/src/framework/MOM_intrinsic_functions.F90
+++ b/src/framework/MOM_intrinsic_functions.F90
@@ -5,6 +5,7 @@ module MOM_intrinsic_functions
 ! This file is part of MOM6. See LICENSE.md for the license.
 
 use iso_fortran_env, only : stdout => output_unit, stderr => error_unit
+use iso_fortran_env, only : int64, real64
 
 implicit none ; private
 
@@ -28,6 +29,7 @@ function invcosh(x)
 
 end function invcosh
 
+
 !> Returns the cube root of a real argument at roundoff accuracy, in a form that works properly with
 !! rescaling of the argument by integer powers of 8.  If the argument is a NaN, a NaN is returned.
 elemental function cuberoot(x) result(root)
@@ -45,16 +47,15 @@ elemental function cuberoot(x) result(root)
               ! of the cube root of asx in arbitrary units that can grow or shrink with each iteration [B D]
   real :: den_prev ! The denominator of an expression for the previous iteration of the evolving estimate of
               ! the cube root of asx in arbitrary units that can grow or shrink with each iteration [D]
-  integer :: ex_3 ! One third of the exponent part of x, used to rescale x to get a.
   integer :: itt
 
+  integer(kind=int64) :: e_x, s_x
+
   if ((x >= 0.0) .eqv. (x <= 0.0)) then
     ! Return 0 for an input of 0, or NaN for a NaN input.
     root = x
   else
-    ex_3 = ceiling(exponent(x) / 3.)
-    ! Here asx is in the range of 0.125 <= asx < 1.0
-    asx = scale(abs(x), -3*ex_3)
+    call rescale_exp(x, asx, e_x, s_x)
 
     !   Iteratively determine root_asx = asx**1/3 using Halley's method and then Newton's method,
     ! noting that Halley's method onverges monotonically and needs no bounding.  Halley's method is
@@ -82,11 +83,102 @@ elemental function cuberoot(x) result(root)
     ! that is within the last bit of the true solution.
     root_asx = root_asx - (root_asx**3 - asx) / (3.0 * (root_asx**2))
 
-    root = sign(scale(root_asx, ex_3), x)
+    root = descale_cbrt(root_asx, e_x, s_x)
   endif
-
 end function cuberoot
 
+
+!> Rescale `a` to the range [0.125, 1) while preserving its fractional term.
+pure subroutine rescale_exp(a, x, e_a, s_a)
+  real, intent(in) :: a
+    !< The value to be rescaled
+  real, intent(out) :: x
+    !< The rescaled value of `a`
+  integer(kind=int64), intent(out) :: e_a
+    !< The biased exponent of `a`
+  integer(kind=int64), intent(out) :: s_a
+    !< The sign bit of `a`
+
+  ! Floating point model, if format is (sign, exp, frac)
+  integer, parameter :: bias = maxexponent(1.) - 1
+    !< The double precision exponent offset (assuming a balanced range)
+  integer, parameter :: signbit = storage_size(1.) - 1
+    !< Position of sign bit
+  integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
+    !< Bit size of exponent
+  integer, parameter :: expbit = signbit - explen
+    !< Position of lowest exponent bit
+  integer, parameter :: fraclen = expbit
+    !< Length of fractional part
+
+  integer(kind=int64) :: xb
+    !< A floating point number, bit-packed as an integer
+  integer(kind=int64) :: e_scaled
+    !< The new rescaled exponent of `a` (i.e. the exponent of `x`)
+
+  ! Pack bits of `a` into `xb` and extract its exponent and sign
+  xb = transfer(a, 1_int64)
+  s_a = ibits(xb, signbit, 1)
+  e_a = ibits(xb, expbit, explen)
+
+  ! Decompose the exponent as `e = modulo(e,3) + 3*(e/3)` and extract the
+  ! rescaled exponent, now in {-3,-2,-1}
+  e_scaled = modulo(e_a, 3) - 3 + bias
+
+  ! Insert the new 11-bit exponent into `xb`, while also setting the sign bit
+  ! to zero, ensuring that `xb` is always positive.
+  call mvbits(e_scaled, 0, explen + 1, xb, fraclen)
+
+  ! Transfer the final modified value to `x`
+  x = transfer(xb, 1.)
+end subroutine rescale_exp
+
+
+!> Descale a real number to its original base, and apply the cube root to the
+!! remaining exponent.
+pure function descale_cbrt(x, e_a, s_a) result(r)
+  real, intent(in) :: x
+    !< Cube root of the rescaled value, which was rescaled to [0.125, 1.0)
+  integer(kind=int64), intent(in) :: e_a
+    !< Exponent of the original value to be cube rooted
+  integer(kind=int64), intent(in) :: s_a
+    !< Sign bit of the original value to be cube rooted
+  real :: r
+    !< Restored value with the cube root applied to its exponent
+
+  ! Floating point model, if format is (sign, exp, frac)
+  integer, parameter :: bias = maxexponent(1.) - 1
+    !< The double precision exponent offset (assuming a balanced range)
+  integer, parameter :: signbit = storage_size(1.) - 1
+    !< Position of sign bit
+  integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
+    !< Bit size of exponent
+  integer, parameter :: expbit = signbit - explen
+    !< Position of lowest exponent bit
+  integer, parameter :: fraclen = expbit
+    !< Length of fractional part
+
+  integer(kind=int64) :: xb
+    ! Bit-packed real number into integer form
+  integer(kind=int64) :: e_r
+    ! Exponent of the descaled value
+
+  ! Extract the exponent of the rescaled value, in {-3, -2, -1}
+  xb = transfer(x, 1_8)
+  e_r = ibits(xb, expbit, explen)
+
+  ! Apply the cube root to the old exponent (after removing its bias) and add
+  ! to the rescaled exponent.  Correct the previous -3 with a +1.
+  e_r = e_r + (e_a/3 - bias/3 + 1)
+
+  ! Apply the corrected exponent and sign and convert back to real
+  call mvbits(e_r, 0, explen, xb, expbit)
+  call mvbits(s_a, 0, 1, xb, signbit)
+  r = transfer(xb, 1.)
+end function descale_cbrt
+
+
+
 !> Returns true if any unit test of intrinsic_functions fails, or false if they all pass.
 logical function intrinsic_functions_unit_tests(verbose)
   logical, intent(in) :: verbose !< If true, write results to stdout

From a74d6022760afcc48aa4201b694ee35d5919c948 Mon Sep 17 00:00:00 2001
From: Marshall Ward <marshall.ward@noaa.gov>
Date: Tue, 30 Jan 2024 15:28:29 -0500
Subject: [PATCH 2/3] Cuberoot: Refactor (re|de)scale functions

Some modifications were made to the cuberoot rescale and descale
functions:

* The machine parameters were moved from function to module parameters.
  This could dangerously expose them to other functions, but it prevents
  multiple definitions of the same numbers.

* The exponent is now cube-rooted in rescale rather than descale.

* The exponent expressions are broken into more explicit steps, rather
  than combining multiple steps and assumptions into a single
  expression.

* The bias is no longer assumed to be a multiple of three.  This is true
  for double precision but not single precision.

A new test of quasi-random number was also added to the cuberoot test
suite.  These numbers were able to detect the differences in GNU and
Intel compiler output.  A potential error in the return value of the
test was also fixed.

The volatile test of 1 - 0.5*ULP has been added.  The cube root of this
value rounds to 1, and needs to be handled carefully.

The unit test function `cuberoot(v**3)` was reversed to `cuberoot(v)**`,
to include testing of this value.  (Cubing would wipe out the anomaly.)
---
 src/framework/MOM_intrinsic_functions.F90 | 148 +++++++++++-----------
 1 file changed, 75 insertions(+), 73 deletions(-)

diff --git a/src/framework/MOM_intrinsic_functions.F90 b/src/framework/MOM_intrinsic_functions.F90
index 5d420057d4..07c6abe3ad 100644
--- a/src/framework/MOM_intrinsic_functions.F90
+++ b/src/framework/MOM_intrinsic_functions.F90
@@ -12,6 +12,19 @@ module MOM_intrinsic_functions
 public :: invcosh, cuberoot
 public :: intrinsic_functions_unit_tests
 
+! Floating point model, if bit layout from high to low is (sign, exp, frac)
+
+integer, parameter :: bias = maxexponent(1.) - 1
+  !< The double precision exponent offset
+integer, parameter :: signbit = storage_size(1.) - 1
+  !< Position of sign bit
+integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
+  !< Bit size of exponent
+integer, parameter :: expbit = signbit - explen
+  !< Position of lowest exponent bit
+integer, parameter :: fraclen = expbit
+  !< Length of fractional part
+
 contains
 
 !> Evaluate the inverse cosh, either using a math library or an
@@ -55,7 +68,7 @@ elemental function cuberoot(x) result(root)
     ! Return 0 for an input of 0, or NaN for a NaN input.
     root = x
   else
-    call rescale_exp(x, asx, e_x, s_x)
+    call rescale_cbrt(x, asx, e_x, s_x)
 
     !   Iteratively determine root_asx = asx**1/3 using Halley's method and then Newton's method,
     ! noting that Halley's method onverges monotonically and needs no bounding.  Halley's method is
@@ -83,109 +96,90 @@ elemental function cuberoot(x) result(root)
     ! that is within the last bit of the true solution.
     root_asx = root_asx - (root_asx**3 - asx) / (3.0 * (root_asx**2))
 
-    root = descale_cbrt(root_asx, e_x, s_x)
+    root = descale(root_asx, e_x, s_x)
   endif
 end function cuberoot
 
 
-!> Rescale `a` to the range [0.125, 1) while preserving its fractional term.
-pure subroutine rescale_exp(a, x, e_a, s_a)
+!> Rescale `a` to the range [0.125, 1) and compute its cube-root exponent.
+pure subroutine rescale_cbrt(a, x, e_r, s_a)
   real, intent(in) :: a
-    !< The value to be rescaled
+    !< The real parameter to be rescaled for cube root
   real, intent(out) :: x
-    !< The rescaled value of `a`
-  integer(kind=int64), intent(out) :: e_a
-    !< The biased exponent of `a`
+    !< The rescaled value of a
+  integer(kind=int64), intent(out) :: e_r
+    !< Cube root of the exponent of the rescaling of `a`
   integer(kind=int64), intent(out) :: s_a
-    !< The sign bit of `a`
-
-  ! Floating point model, if format is (sign, exp, frac)
-  integer, parameter :: bias = maxexponent(1.) - 1
-    !< The double precision exponent offset (assuming a balanced range)
-  integer, parameter :: signbit = storage_size(1.) - 1
-    !< Position of sign bit
-  integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
-    !< Bit size of exponent
-  integer, parameter :: expbit = signbit - explen
-    !< Position of lowest exponent bit
-  integer, parameter :: fraclen = expbit
-    !< Length of fractional part
+    !< The sign bit of a
 
   integer(kind=int64) :: xb
-    !< A floating point number, bit-packed as an integer
-  integer(kind=int64) :: e_scaled
-    !< The new rescaled exponent of `a` (i.e. the exponent of `x`)
-
-  ! Pack bits of `a` into `xb` and extract its exponent and sign
+    ! Floating point value of a, bit-packed as an integer
+  integer(kind=int64) :: e_a
+    ! Unscaled exponent of a
+  integer(kind=int64) :: e_x
+    ! Exponent of x
+  integer(kind=int64) :: e_div, e_mod
+    ! Quotient and remainder of e in e = 3*(e/3) + modulo(e,3).
+
+  ! Pack bits of a into xb and extract its exponent and sign.
   xb = transfer(a, 1_int64)
   s_a = ibits(xb, signbit, 1)
-  e_a = ibits(xb, expbit, explen)
+  e_a = ibits(xb, expbit, explen) - bias
+
+  ! Compute terms of exponent decomposition e = 3*(e/3) + modulo(e,3).
+  ! (Fortran division is round-to-zero, so we must emulate floor division.)
+  e_mod = modulo(e_a, 3_int64)
+  e_div = (e_a - e_mod)/3
+
+  ! Our scaling decomposes e_a into e = {3*(e/3) + 3} + {modulo(e,3) - 3}.
 
-  ! Decompose the exponent as `e = modulo(e,3) + 3*(e/3)` and extract the
-  ! rescaled exponent, now in {-3,-2,-1}
-  e_scaled = modulo(e_a, 3) - 3 + bias
+  ! The first term is a perfect cube, whose cube root is computed below.
+  e_r = e_div + 1
 
-  ! Insert the new 11-bit exponent into `xb`, while also setting the sign bit
-  ! to zero, ensuring that `xb` is always positive.
-  call mvbits(e_scaled, 0, explen + 1, xb, fraclen)
+  ! The second term ensures that x is shifted to [0.125, 1).
+  e_x = e_mod - 3
 
-  ! Transfer the final modified value to `x`
+  ! Insert the new 11-bit exponent into xb and write to x and extend the
+  ! bitcount to 12, so that the sign bit is zero and x is always positive.
+  call mvbits(e_x + bias, 0, explen + 1, xb, fraclen)
   x = transfer(xb, 1.)
-end subroutine rescale_exp
+end subroutine rescale_cbrt
 
 
-!> Descale a real number to its original base, and apply the cube root to the
-!! remaining exponent.
-pure function descale_cbrt(x, e_a, s_a) result(r)
+!> Undo the rescaling of a real number back to its original base.
+pure function descale(x, e_a, s_a) result(a)
   real, intent(in) :: x
-    !< Cube root of the rescaled value, which was rescaled to [0.125, 1.0)
+    !< The rescaled value which is to be restored.
   integer(kind=int64), intent(in) :: e_a
-    !< Exponent of the original value to be cube rooted
+    !< Exponent of the unscaled value
   integer(kind=int64), intent(in) :: s_a
-    !< Sign bit of the original value to be cube rooted
-  real :: r
-    !< Restored value with the cube root applied to its exponent
-
-  ! Floating point model, if format is (sign, exp, frac)
-  integer, parameter :: bias = maxexponent(1.) - 1
-    !< The double precision exponent offset (assuming a balanced range)
-  integer, parameter :: signbit = storage_size(1.) - 1
-    !< Position of sign bit
-  integer, parameter :: explen = 1 + ceiling(log(real(bias))/log(2.))
-    !< Bit size of exponent
-  integer, parameter :: expbit = signbit - explen
-    !< Position of lowest exponent bit
-  integer, parameter :: fraclen = expbit
-    !< Length of fractional part
+    !< Sign bit of the unscaled value
+  real :: a
+    !< Restored value with the corrected exponent and sign
 
   integer(kind=int64) :: xb
     ! Bit-packed real number into integer form
-  integer(kind=int64) :: e_r
-    ! Exponent of the descaled value
+  integer(kind=int64) :: e_x
+    ! Biased exponent of x
 
-  ! Extract the exponent of the rescaled value, in {-3, -2, -1}
+  ! Apply the corrected exponent and sign to x.
   xb = transfer(x, 1_8)
-  e_r = ibits(xb, expbit, explen)
-
-  ! Apply the cube root to the old exponent (after removing its bias) and add
-  ! to the rescaled exponent.  Correct the previous -3 with a +1.
-  e_r = e_r + (e_a/3 - bias/3 + 1)
-
-  ! Apply the corrected exponent and sign and convert back to real
-  call mvbits(e_r, 0, explen, xb, expbit)
+  e_x = ibits(xb, expbit, explen)
+  call mvbits(e_a + e_x, 0, explen, xb, expbit)
   call mvbits(s_a, 0, 1, xb, signbit)
-  r = transfer(xb, 1.)
-end function descale_cbrt
-
+  a = transfer(xb, 1.)
+end function descale
 
 
 !> Returns true if any unit test of intrinsic_functions fails, or false if they all pass.
-logical function intrinsic_functions_unit_tests(verbose)
+function intrinsic_functions_unit_tests(verbose) result(fail)
   logical, intent(in) :: verbose !< If true, write results to stdout
+  logical :: fail !< True if any of the unit tests fail
 
   ! Local variables
   real :: testval  ! A test value for self-consistency testing [nondim]
-  logical :: fail, v
+  logical :: v
+  integer :: n
 
   fail = .false.
   v = verbose
@@ -199,7 +193,15 @@ logical function intrinsic_functions_unit_tests(verbose)
   fail = fail .or. Test_cuberoot(v, 1.0)
   fail = fail .or. Test_cuberoot(v, 0.125)
   fail = fail .or. Test_cuberoot(v, 0.965)
-
+  fail = fail .or. Test_cuberoot(v, 1.0 - epsilon(1.0))
+  fail = fail .or. Test_cuberoot(v, 1.0 - 0.5*epsilon(1.0))
+
+  testval = 1.0e-99
+  v = .false.
+  do n=-160,160
+    fail = fail .or. Test_cuberoot(v, testval)
+    testval = (-2.908 * (1.414213562373 + 1.2345678901234e-5*n)) * testval
+  enddo
 end function intrinsic_functions_unit_tests
 
 !> True if the cube of cuberoot(val) does not closely match val. False otherwise.
@@ -209,7 +211,7 @@ logical function Test_cuberoot(verbose, val)
   ! Local variables
   real :: diff ! The difference between val and the cube root of its cube.
 
-  diff = val - cuberoot(val**3)
+  diff = val - cuberoot(val)**3
   Test_cuberoot = (abs(diff) > 2.0e-15*abs(val))
 
   if (Test_cuberoot) then

From da9dd8a090e309b61fadb424c31a3a56ead5973a Mon Sep 17 00:00:00 2001
From: Marshall Ward <marshall.ward@noaa.gov>
Date: Tue, 30 Jan 2024 15:56:41 -0500
Subject: [PATCH 3/3] Cuberoot: Break **3 into explicit integer cubes

In separate testing, we observed that Intel would use the `pow()`
function to evaluate the cubes of some numbers, causing different
answers with GNU.

In this patch, I replace the cubic x**3 operations with explicit x*x*x
multiplication, which appears to avoid this substitution.

Well, for the moment, at least.
---
 src/framework/MOM_intrinsic_functions.F90 | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/framework/MOM_intrinsic_functions.F90 b/src/framework/MOM_intrinsic_functions.F90
index 07c6abe3ad..fbb1c28096 100644
--- a/src/framework/MOM_intrinsic_functions.F90
+++ b/src/framework/MOM_intrinsic_functions.F90
@@ -52,14 +52,19 @@ elemental function cuberoot(x) result(root)
   real :: asx ! The absolute value of x rescaled by an integer power of 8 to put it into
               ! the range from 0.125 < asx <= 1.0, in ambiguous units cubed [B3]
   real :: root_asx ! The cube root of asx [B]
+  real :: ra_3 ! root_asx cubed [B3]
   real :: num ! The numerator of an expression for the evolving estimate of the cube root of asx
               ! in arbitrary units that can grow or shrink with each iteration [B C]
   real :: den ! The denominator of an expression for the evolving estimate of the cube root of asx
               ! in arbitrary units that can grow or shrink with each iteration [C]
   real :: num_prev ! The numerator of an expression for the previous iteration of the evolving estimate
               ! of the cube root of asx in arbitrary units that can grow or shrink with each iteration [B D]
+  real :: np_3 ! num_prev cubed  [B3 D3]
   real :: den_prev ! The denominator of an expression for the previous iteration of the evolving estimate of
               ! the cube root of asx in arbitrary units that can grow or shrink with each iteration [D]
+  real :: dp_3 ! den_prev cubed  [C3]
+  real :: r0  ! Initial value of the iterative solver. [B C]
+  real :: r0_3 ! r0 cubed [B3 C3]
   integer :: itt
 
   integer(kind=int64) :: e_x, s_x
@@ -79,14 +84,21 @@ elemental function cuberoot(x) result(root)
 
     ! This first estimate gives the same magnitude of errors for 0.125 and 1.0 after two iterations.
     ! The first iteration is applied explicitly.
-    num = 0.707106 * (0.707106**3 + 2.0 * asx)
-    den = 2.0 * (0.707106**3) + asx
+    r0 = 0.707106
+    r0_3 = r0 * r0 * r0
+    num = r0 * (r0_3 + 2.0 * asx)
+    den = 2.0 * r0_3 + asx
 
     do itt=1,2
       ! Halley's method iterates estimates as Root = Root * (Root**3 + 2.*asx) / (2.*Root**3 + asx).
       num_prev = num ; den_prev = den
-      num = num_prev * (num_prev**3 + 2.0 * asx * (den_prev**3))
-      den = den_prev * (2.0 * num_prev**3 + asx * (den_prev**3))
+
+      ! Pre-compute these as integer powers, to avoid `pow()`-like intrinsics.
+      np_3 = num_prev * num_prev * num_prev
+      dp_3 = den_prev * den_prev * den_prev
+
+      num = num_prev * (np_3 + 2.0 * asx * dp_3)
+      den = den_prev * (2.0 * np_3 + asx * dp_3)
       ! Equivalent to:  root_asx = root_asx * (root_asx**3 + 2.*asx) / (2.*root_asx**3 + asx)
     enddo
     ! At this point the error in root_asx is better than 1 part in 3e14.
@@ -94,7 +106,8 @@ elemental function cuberoot(x) result(root)
 
     ! One final iteration with Newton's method polishes up the root and gives a solution
     ! that is within the last bit of the true solution.
-    root_asx = root_asx - (root_asx**3 - asx) / (3.0 * (root_asx**2))
+    ra_3 = root_asx * root_asx * root_asx
+    root_asx = root_asx - (ra_3 - asx) / (3.0 * (root_asx * root_asx))
 
     root = descale(root_asx, e_x, s_x)
   endif