From 974e1eacda3b8f3f7a87f8dae2805ecfd1d9823d Mon Sep 17 00:00:00 2001
From: Marshall Ward <marshall.ward@gmail.com>
Date: Mon, 5 Sep 2022 14:03:16 -0400
Subject: [PATCH] Barotropic weight vectorization

This patch modifies the calculation of barotropic weight functions
wt_[uv] to a more vectorizable form.  The if-blocks used to evaluate the
piecewise function was replaced with a sequence of min/max evaluations,
which are vectorizable.

A nondimensional subroundoff term is also introducted to address the
division by zero in the evaluation of 1 - 0.5 * Instep / visc_rem.
When visc_rem is zero, this term will never be selected by the max()
operator, since it is negatively asymptotic.  When visc_rem is
sufficiently large to accept this term, the value of subroundoff is too
small to modify the significand of the floating point value.

In our runs, the overall time of btstep was reduced by 6%.

Experiment: benchmark
Platform: AMD Ryzen 5 2600
Compiler: gfortran 12.2
Flags: -O3 -mavx

Timer: (Ocean BT pre-calcs only)

Before:
- 1.781952
- 1.787916
- 1.790447

After:
- 1.544491
- 1.553111
- 1.544491

Timer: (Ocean barotropic mode stepping)

Before:
- 4.042281
- 4.050225
- 4.051413

After:
- 3.785838
- 3.822393
- 3.835511

Speedup:
~13% of "(Ocean BT pre-calc step)"
~6% speedup of btstep
---
 src/core/MOM_barotropic.F90 | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/core/MOM_barotropic.F90 b/src/core/MOM_barotropic.F90
index 5a02f64240..83cc0b7d09 100644
--- a/src/core/MOM_barotropic.F90
+++ b/src/core/MOM_barotropic.F90
@@ -398,6 +398,10 @@ module MOM_barotropic
 character*(20), parameter :: BT_CONT_STRING = "FROM_BT_CONT"
 !>@}
 
+!> A negligible parameter which avoids division by zero, but is too small to
+!! modify physical values.
+real, parameter :: subroundoff = 1e-30
+
 contains
 
 !> This subroutine time steps the barotropic equations explicitly.
@@ -1001,23 +1005,23 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce,
 
   !$OMP parallel do default(shared) private(visc_rem)
   do k=1,nz ; do j=js,je ; do I=is-1,ie
-    ! rem needs greater than visc_rem_u and 1-Instep/visc_rem_u.
+    ! rem needs to be greater than visc_rem_u and 1-Instep/visc_rem_u.
     ! The 0.5 below is just for safety.
-    if (visc_rem_u(I,j,k) <= 0.0) then ; visc_rem = 0.0
-    elseif (visc_rem_u(I,j,k) >= 1.0) then ; visc_rem = 1.0
-    elseif (visc_rem_u(I,j,k)**2 > visc_rem_u(I,j,k) - 0.5*Instep) then
-      visc_rem = visc_rem_u(I,j,k)
-    else ; visc_rem = 1.0 - 0.5*Instep/visc_rem_u(I,j,k) ; endif
+    ! NOTE: subroundoff is a neglible value used to prevent division by zero.
+    ! When 1-0.5*Instep/visc_rem exceeds visc_rem, the subroundoff is too small
+    ! to modify the significand.  When visc_rem is small, the max() operators
+    ! select visc_rem or 0.  So subroundoff cannot impact the final value.
+    visc_rem = min(visc_rem_u(I,j,k), 1.)
+    visc_rem = max(visc_rem, 1. - 0.5 * Instep / (visc_rem + subroundoff))
+    visc_rem = max(visc_rem, 0.)
     wt_u(I,j,k) = CS%frhatu(I,j,k) * visc_rem
   enddo ; enddo ; enddo
   !$OMP parallel do default(shared) private(visc_rem)
   do k=1,nz ; do J=js-1,je ; do i=is,ie
-    ! rem needs greater than visc_rem_v and 1-Instep/visc_rem_v.
-    if (visc_rem_v(i,J,k) <= 0.0) then ; visc_rem = 0.0
-    elseif (visc_rem_v(i,J,k) >= 1.0) then ; visc_rem = 1.0
-    elseif (visc_rem_v(i,J,k)**2 > visc_rem_v(i,J,k) - 0.5*Instep) then
-      visc_rem = visc_rem_v(i,J,k)
-    else ; visc_rem = 1.0 - 0.5*Instep/visc_rem_v(i,J,k) ; endif
+    ! As above, rem must be greater than visc_rem_v and 1-Instep/visc_rem_v.
+    visc_rem = min(visc_rem_v(I,j,k), 1.)
+    visc_rem = max(visc_rem, 1. - 0.5 * Instep / (visc_rem + subroundoff))
+    visc_rem = max(visc_rem, 0.)
     wt_v(i,J,k) = CS%frhatv(i,J,k) * visc_rem
   enddo ; enddo ; enddo