From 974e1eacda3b8f3f7a87f8dae2805ecfd1d9823d Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Mon, 5 Sep 2022 14:03:16 -0400 Subject: [PATCH] Barotropic weight vectorization This patch modifies the calculation of barotropic weight functions wt_[uv] to a more vectorizable form. The if-blocks used to evaluate the piecewise function was replaced with a sequence of min/max evaluations, which are vectorizable. A nondimensional subroundoff term is also introducted to address the division by zero in the evaluation of 1 - 0.5 * Instep / visc_rem. When visc_rem is zero, this term will never be selected by the max() operator, since it is negatively asymptotic. When visc_rem is sufficiently large to accept this term, the value of subroundoff is too small to modify the significand of the floating point value. In our runs, the overall time of btstep was reduced by 6%. Experiment: benchmark Platform: AMD Ryzen 5 2600 Compiler: gfortran 12.2 Flags: -O3 -mavx Timer: (Ocean BT pre-calcs only) Before: - 1.781952 - 1.787916 - 1.790447 After: - 1.544491 - 1.553111 - 1.544491 Timer: (Ocean barotropic mode stepping) Before: - 4.042281 - 4.050225 - 4.051413 After: - 3.785838 - 3.822393 - 3.835511 Speedup: ~13% of "(Ocean BT pre-calc step)" ~6% speedup of btstep --- src/core/MOM_barotropic.F90 | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/core/MOM_barotropic.F90 b/src/core/MOM_barotropic.F90 index 5a02f64240..83cc0b7d09 100644 --- a/src/core/MOM_barotropic.F90 +++ b/src/core/MOM_barotropic.F90 @@ -398,6 +398,10 @@ module MOM_barotropic character*(20), parameter :: BT_CONT_STRING = "FROM_BT_CONT" !>@} +!> A negligible parameter which avoids division by zero, but is too small to +!! modify physical values. +real, parameter :: subroundoff = 1e-30 + contains !> This subroutine time steps the barotropic equations explicitly. @@ -1001,23 +1005,23 @@ subroutine btstep(U_in, V_in, eta_in, dt, bc_accel_u, bc_accel_v, forces, pbce, !$OMP parallel do default(shared) private(visc_rem) do k=1,nz ; do j=js,je ; do I=is-1,ie - ! rem needs greater than visc_rem_u and 1-Instep/visc_rem_u. + ! rem needs to be greater than visc_rem_u and 1-Instep/visc_rem_u. ! The 0.5 below is just for safety. - if (visc_rem_u(I,j,k) <= 0.0) then ; visc_rem = 0.0 - elseif (visc_rem_u(I,j,k) >= 1.0) then ; visc_rem = 1.0 - elseif (visc_rem_u(I,j,k)**2 > visc_rem_u(I,j,k) - 0.5*Instep) then - visc_rem = visc_rem_u(I,j,k) - else ; visc_rem = 1.0 - 0.5*Instep/visc_rem_u(I,j,k) ; endif + ! NOTE: subroundoff is a neglible value used to prevent division by zero. + ! When 1-0.5*Instep/visc_rem exceeds visc_rem, the subroundoff is too small + ! to modify the significand. When visc_rem is small, the max() operators + ! select visc_rem or 0. So subroundoff cannot impact the final value. + visc_rem = min(visc_rem_u(I,j,k), 1.) + visc_rem = max(visc_rem, 1. - 0.5 * Instep / (visc_rem + subroundoff)) + visc_rem = max(visc_rem, 0.) wt_u(I,j,k) = CS%frhatu(I,j,k) * visc_rem enddo ; enddo ; enddo !$OMP parallel do default(shared) private(visc_rem) do k=1,nz ; do J=js-1,je ; do i=is,ie - ! rem needs greater than visc_rem_v and 1-Instep/visc_rem_v. - if (visc_rem_v(i,J,k) <= 0.0) then ; visc_rem = 0.0 - elseif (visc_rem_v(i,J,k) >= 1.0) then ; visc_rem = 1.0 - elseif (visc_rem_v(i,J,k)**2 > visc_rem_v(i,J,k) - 0.5*Instep) then - visc_rem = visc_rem_v(i,J,k) - else ; visc_rem = 1.0 - 0.5*Instep/visc_rem_v(i,J,k) ; endif + ! As above, rem must be greater than visc_rem_v and 1-Instep/visc_rem_v. + visc_rem = min(visc_rem_v(I,j,k), 1.) + visc_rem = max(visc_rem, 1. - 0.5 * Instep / (visc_rem + subroundoff)) + visc_rem = max(visc_rem, 0.) wt_v(i,J,k) = CS%frhatv(i,J,k) * visc_rem enddo ; enddo ; enddo