Skip to content

Commit

Permalink
[develop] Enable workflow runs on single node linux/mac machine using…
Browse files Browse the repository at this point in the history
… rocoto. (#508)

* Increase precision of degs_per_radian to 15 digits.

* Use generic date util.

* Add fake slurm commands for rocoto usage on linux.

* Modify machine files for linux and mac.

* Modify linux and macos wflow modules.

* Fix unittest.

* Remove openmpi module loading in linux/mac build modulefile.

* Fix sacct.

* Fix crontab unspecified USER issue.

* Add EXTRN_MDL_DATA_STORES to macos.

* Add more states to squeue/sacct.

* Add a taskthrottle=1 option for linux/mac.

* Don't specify number of processes for mpirun.

* Get exit code directly instead of from log file.

* Set taskthrottle to 1000 by default.

* Fix linux lmod path bug.

* Set stack size to unlimited for linux/mac.

* Fix unittest.
  • Loading branch information
danielabdi-noaa authored Jan 10, 2023
1 parent d3b10e6 commit 70da0e8
Show file tree
Hide file tree
Showing 19 changed files with 270 additions and 39 deletions.
2 changes: 1 addition & 1 deletion etc/lmod-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ if [ "$L_MACHINE" = macos ]; then
module purge

elif [ "$L_MACHINE" = linux ]; then
export BASH_ENV="/usr/share/share/lmod/init/bash"
export BASH_ENV="/usr/share/lmod/lmod/init/bash"
source $BASH_ENV

module purge
Expand Down
1 change: 0 additions & 1 deletion modulefiles/build_linux_gnu.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ load("hpc")
load("hpc-python")

load("hpc-gnu")
load("openmpi")
load("hpc-openmpi")

load("srw_common")
Expand Down
1 change: 0 additions & 1 deletion modulefiles/build_macos_gnu.lua
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ load("hpc")
load("hpc-python")

load("hpc-gnu")
load("openmpi")
load("hpc-openmpi")

load("srw_common")
Expand Down
33 changes: 26 additions & 7 deletions modulefiles/wflow_linux.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,35 @@ This module sets a path to activate conda environment needed for running the UFS
whatis([===[This module sets a path for conda environment needed for running the UFS SRW App on Linux]===])

setenv("CMAKE_Platform", "linux")
setenv("VENV", pathJoin(os.getenv("HOME"), "condaenv/envs/regional_workflow"))

--[[
local ROCOTOmod="/Users/username/modules"
prepend_path("MODULEPATH", ROCOTOmod)
load(rocoto)
--]]
-- Conda initialization function.
-- Queues a "source <conda_path>/etc/profile.d/conda.*" command that is run
-- when the module is loaded.
--   conda_path: root directory of the conda installation
-- conda.csh is selected when myShellType() reports "csh"; every other shell
-- type gets conda.sh.
function init_conda(conda_path)
   local script_name = "conda.sh"
   if myShellType() == "csh" then
      script_name = "conda.csh"
   end
   local init_script = pathJoin(conda_path, "etc/profile.d/" .. script_name)
   execute{cmd="source " .. init_script, modeA={"load"}}
end

-- Installation locations used below.
-- NOTE(review): the "/home/username" prefixes look like placeholders to be
-- edited per machine -- confirm.
local conda_path="/home/username/miniconda3"
local rocoto_path="/home/username/rocoto"
local srw_path="/home/username/ufs-srweather-app"

-- initialize conda
init_conda(conda_path)

-- add rocoto to path
local rocoto_bin=pathJoin(rocoto_path,"bin")
prepend_path("PATH", rocoto_bin)

-- add fake slurm commands (prepended last, so they end up first in PATH)
local fake_slurm_dir=pathJoin(srw_path, "ush/rocoto_fake_slurm")
prepend_path("PATH", fake_slurm_dir)

-- display conda activation message, only while the module is being loaded
if mode() == "load" then
   LmodMsgRaw([===[Please do the following to activate conda:
> conda activate $VENV
> conda activate regional_workflow
]===])
end
33 changes: 26 additions & 7 deletions modulefiles/wflow_macos.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,36 @@ This module set a path needed to activate conda environement for running UFS SRW
whatis([===[This module activates conda environment for running the UFS SRW App on macOS]===])

setenv("CMAKE_Platform", "macos")
setenv("VENV", pathJoin(os.getenv("HOME"), "condaenv/envs/regional_workflow"))

--[[
local ROCOTOmod="/Users/username/modules"
prepend_path("MODULEPATH", ROCOTOmod)
load(rocoto)
--]]
-- Conda initialization function.
-- Queues a "source <conda_path>/etc/profile.d/conda.*" command that is run
-- when the module is loaded.
--   conda_path: root directory of the conda installation
-- conda.csh is selected when myShellType() reports "csh"; every other shell
-- type gets conda.sh.
function init_conda(conda_path)
   local script_name = "conda.sh"
   if myShellType() == "csh" then
      script_name = "conda.csh"
   end
   local init_script = pathJoin(conda_path, "etc/profile.d/" .. script_name)
   execute{cmd="source " .. init_script, modeA={"load"}}
end

-- Installation locations used below.
-- NOTE(review): the "/Users/username" prefixes look like placeholders to be
-- edited per machine -- confirm.
local conda_path="/Users/username/miniconda3"
local rocoto_path="/Users/username/rocoto"
local srw_path="/Users/username/ufs-srweather-app"

-- initialize conda
init_conda(conda_path)

-- add rocoto to path
local rocoto_bin=pathJoin(rocoto_path,"bin")
prepend_path("PATH", rocoto_bin)

-- add fake slurm commands (prepended last, so they end up first in PATH)
local fake_slurm_dir=pathJoin(srw_path, "ush/rocoto_fake_slurm")
prepend_path("PATH", fake_slurm_dir)

-- display conda activation message, only while the module is being loaded.
-- The command lines are printed verbatim for the user to copy, so they must
-- not carry a trailing double quote.
if mode() == "load" then
   LmodMsgRaw([===[Please do the following to activate conda virtual environment:
> conda activate $VENV
> conda activate regional_workflow
]===])
end

2 changes: 1 addition & 1 deletion parm/FV3LAM_wflow.xml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ tasks; and the "FCST" type is used for the RUN_FCST_TN task.

]>

<workflow realtime="F" scheduler="&SCHED;" cyclethrottle="20">
<workflow realtime="F" scheduler="&SCHED;" cyclethrottle="20" taskthrottle="{{ taskthrottle }}">
{# Double quotes are required inside the strftime! Expect an error from reading the template if using single quotes. #}
<cycledef group="at_start">{{ cdate_first_cycl.strftime("%M %H %d %m %Y *") }}</cycledef>
<cycledef group="forecast">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ platform:
workflow:
CCPP_PHYS_SUITE: FV3_GFS_2017_gfdlmp
PREDEF_GRID_NAME: RRFS_CONUS_25km
DATE_FIRST_CYCL: date --utc --date="2 days ago" +%Y%m%d00
DATE_LAST_CYCL: date --utc --date="2 days ago" +%Y%m%d00
DATE_FIRST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00
DATE_LAST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00
FCST_LEN_HRS: 6
PREEXISTING_DIR_METHOD: rename
task_get_extrn_ics:
Expand Down
5 changes: 5 additions & 0 deletions ush/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ platform:
# The number of cores available per node on the compute platform, now
# configurable for all platforms.
#
# TASKTHROTTLE:
# The maximum number of tasks that may be active simultaneously. For linux/mac,
# setting this to 1 makes sense.
#
# BUILD_MOD_FN:
# Name of alternative build module file to use if using an
# unsupported platform. Is set automatically for supported machines.
Expand Down Expand Up @@ -158,6 +162,7 @@ platform:
#
WORKFLOW_MANAGER: ""
NCORES_PER_NODE: ""
TASKTHROTTLE: 1000
BUILD_MOD_FN: 'build_{{ user.MACHINE|lower() }}_{{ workflow.COMPILER }}'
WFLOW_MOD_FN: 'wflow_{{ user.MACHINE|lower() }}'
BUILD_VER_FN: 'build.ver.{{ user.MACHINE|lower() }}'
Expand Down
2 changes: 1 addition & 1 deletion ush/constants.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ constants:
#-----------------------------------------------------------------------
#
PI_GEOM: 3.14159265358979323846264338327
DEGS_PER_RADIAN: 57.2957795131
DEGS_PER_RADIAN: 57.29577951308232087679
RADIUS_EARTH: 6371200.0
#
#-----------------------------------------------------------------------
Expand Down
25 changes: 19 additions & 6 deletions ush/machine/linux.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
platform:
WORKFLOW_MANAGER: none
WORKFLOW_MANAGER: rocoto
NCORES_PER_NODE: 8
SCHED: none
RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} '
RUN_CMD_POST: 'mpirun -n 4 '
TASKTHROTTLE: 1
SCHED: slurm
CCPA_OBS_DIR: /home/username/DATA/UFS/obs_data/ccpa/proc
MRMS_OBS_DIR: /home/username/DATA/UFS/obs_data/mrms/proc
NDAS_OBS_DIR: /home/username/DATA/UFS/obs_data/ndas/proc
METPLUS_PATH: ""
MET_BIN_EXEC: bin
MET_INSTALL_DIR: ""
DOMAIN_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen
RUN_CMD_FCST: mpirun -n ${PE_MEMBER01}
RUN_CMD_POST: mpirun
RUN_CMD_SERIAL: time
RUN_CMD_UTILS: mpirun -n 4
PRE_TASK_CMDS: '{ ulimit -a; }'
RUN_CMD_UTILS: mpirun
PRE_TASK_CMDS: '{ ulimit -a; ulimit -s unlimited; }'
TEST_EXTRN_MDL_SOURCE_BASEDIR: /home/username/DATA/UFS/input_model_data
TEST_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen
TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /home/username/DATA/UFS/dummy_FV3GFS_sys_dir
TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /home/username/DATA/UFS/dummy_FV3GFS_sys_dir
FIXaer: /home/username/DATA/UFS/fix/fix_aer
FIXgsm: /home/username/DATA/UFS/fix/fix_am
FIXlut: /home/username/DATA/UFS/fix/fix_lut
FIXorg: /home/username/DATA/UFS/fix/fix_orog
FIXsfc: /home/username/DATA/UFS/fix/fix_sfc_climo
FIXshp: /home/username/DATA/UFS/NaturalEarth
EXTRN_MDL_DATA_STORES: aws nomads
data:
ics_lbcs:
FV3GFS: /home/username/DATA/UFS/FV3GFS
25 changes: 19 additions & 6 deletions ush/machine/macos.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
platform:
WORKFLOW_MANAGER: none
WORKFLOW_MANAGER: rocoto
NCORES_PER_NODE: 8
SCHED: none
RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} '
RUN_CMD_POST: 'mpirun -n 4 '
TASKTHROTTLE: 1
SCHED: slurm
CCPA_OBS_DIR: /Users/username/DATA/UFS/obs_data/ccpa/proc
MRMS_OBS_DIR: /Users/username/DATA/UFS/obs_data/mrms/proc
NDAS_OBS_DIR: /Users/username/DATA/UFS/obs_data/ndas/proc
DOMAIN_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen
METPLUS_PATH: ""
MET_BIN_EXEC: bin
MET_INSTALL_DIR: ""
RUN_CMD_FCST: mpirun -n ${PE_MEMBER01}
RUN_CMD_POST: mpirun
RUN_CMD_SERIAL: time
RUN_CMD_UTILS: mpirun -n 4
PRE_TASK_CMDS: '{ ulimit -a; }'
RUN_CMD_UTILS: mpirun
PRE_TASK_CMDS: '{ ulimit -a; ulimit -s unlimited; }'
TEST_EXTRN_MDL_SOURCE_BASEDIR: /Users/username/DATA/UFS/input_model_data
TEST_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen
TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /Users/username/DATA/UFS/dummy_FV3GFS_sys_dir
TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /Users/username/DATA/UFS/dummy_FV3GFS_sys_dir
FIXaer: /Users/username/DATA/UFS/fix/fix_aer
FIXgsm: /Users/username/DATA/UFS/fix/fix_am
FIXlut: /Users/username/DATA/UFS/fix/fix_lut
FIXorg: /Users/username/DATA/UFS/fix/fix_orog
FIXsfc: /Users/username/DATA/UFS/fix/fix_sfc_climo
FIXshp: /Users/username/DATA/UFS/NaturalEarth
EXTRN_MDL_DATA_STORES: aws nomads
data:
ics_lbcs:
FV3GFS: /Users/username/DATA/UFS/FV3GFS
4 changes: 1 addition & 3 deletions ush/python_utils/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
except ModuleNotFoundError:
pass
# The rest of the formats: JSON/SHELL/INI/XML do not need
# external pakcages
# external packages
import json
import os
import re
Expand Down Expand Up @@ -398,8 +398,6 @@ def cfg_to_xml_str(cfg):
##################
# CONFIG utils
##################


def flatten_dict(dictionary, keys=None):
"""Flatten a recursive dictionary (e.g.yaml/json) to be one level deep
Expand Down
42 changes: 42 additions & 0 deletions ush/rocoto_fake_slurm/sacct
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash

# Emulates slurm's sacct
#
# Usage:
#   sacct --jobs=PID[,PID...]   report only the listed job ids
#   sacct [anything else]       report every job recorded as "submitted"
#
# Job records are read from the file .job_database in the current working
# directory and are looked up by the strings "pid <PID> submitted",
# "pid <PID> started" and "pid <PID> ended".

DB=.job_database

# Start from an empty list so that a PIDS value inherited from the
# environment is never reported by accident.
PIDS=""
if [[ "$1" = "--jobs="* ]]; then
  # Drop the 7-character "--jobs=" prefix and turn the comma-separated list
  # into a space-separated one. The replacement is a bare space on purpose:
  # writing it as ' ' inside the double quotes leaves literal quote
  # characters in the result on older bash releases (4.2 and earlier, see
  # the compat42 shell option).
  PIDS="${1:7}"
  PIDS="${PIDS//,/ }"
elif [[ -f "$DB" ]]; then
  PIDS=$(grep submitted "$DB" | sort -u -k1,1 | awk '{print $3}')
fi

# Output info the way rocoto calls sacct
FMT="%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n"
echo "JobID|User|JobName|Partition|Priority|Submit|Start|End|NCPUS|ExitCode|State"

for pid in ${PIDS}; do

  # Defaults reported for a job that has no record at all.
  t_sub="N/A"
  t_start=$t_sub
  t_end=$t_sub
  name=$pid
  user=${USER:-user}
  exitc=0
  state="UNKNOWN"

  # Without a database every requested job simply stays UNKNOWN.
  if [[ -f "$DB" ]]; then
    # submitted record: field 1 is the job name, field 5 the submit time
    v=$(grep "pid $pid submitted" "$DB" | awk '{print $1" "$5}')
    if [[ -n "$v" ]]; then
      state="PENDING"
      read -r name t_sub <<< "$v"
      # started record: field 5 is the start time, field 7 the planned end
      v=$(grep "pid $pid started" "$DB" | awk '{print $5" "$7}')
      if [[ -n "$v" ]]; then
        state="RUNNING"
        read -r t_start t_end <<< "$v"
      fi
      # ended record: field 5 is the end time, field 7 the exit code
      v=$(grep "pid $pid ended" "$DB" | awk '{print $5" "$7}')
      if [[ -n "$v" ]]; then
        state="COMPLETED"
        read -r t_end exitc <<< "$v"
      fi
    fi
  fi

  # Arguments are quoted so that an empty value cannot shift later columns.
  printf "$FMT" "$pid" "${user:0:30}" "${name:0:30}" linux 0.1 "$t_sub" "$t_start" "$t_end" 1 "$exitc" "$state"
done
48 changes: 48 additions & 0 deletions ush/rocoto_fake_slurm/sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash

# Emulates slurm's sbatch
#
# Usage: sbatch [SCRIPT]
#   SCRIPT is the job script; it is read from /dev/stdin when omitted.
#
# The script text is scanned for "#SBATCH -o", "#SBATCH -t" and
# "#SBATCH --job-name" lines, then run in a background bash. Progress is
# appended to .job_database in the current working directory as
# "<name> pid <pid> submitted|started|ended ..." records, and
# "Submitted batch job <pid>" is printed on stdout.

FD=${1:-/dev/stdin}

# Read the submitted script exactly once. Opening $FD separately for every
# grep below breaks when the script arrives on a pipe: the first reader
# drains it and the later ones see nothing.
SCRIPT=$(cat "$FD")

#parse log file
LOG=$(grep "#SBATCH -o" <<< "$SCRIPT" | awk '{ print $3 }')
if [ -z "$LOG" ]; then
  LOG=/dev/null
fi

#parse time (H:M:S converted to seconds for timeout)
TIM=$(grep "#SBATCH -t" <<< "$SCRIPT" | awk '{ print $3 }')
if [ -z "$TIM" ]; then
  SECS=
  CTIM=
else
  SECS=$(echo $TIM | awk 'BEGIN { FS = ":" } ; { secs = $1 * 3600 + $2 * 60 + $3; print secs };')
  CTIM="timeout ${SECS}s"
fi

#parse job name
JOBNAME=$(grep "#SBATCH --job-name" <<< "$SCRIPT" | awk 'BEGIN { FS = "=" }; { print $2 }')
if [ -z "$JOBNAME" ]; then
  JOBNAME="default"
fi

#command
CMD="$SCRIPT"

#execute job in background
# The background bash writes a "started" record, runs the job, captures its
# exit status and writes an "ended" record; \$$ is the pid of that bash.
# NOTE(review): CMD holds the whole submitted text, so ${CTIM} is placed in
# front of its first line only and &>$LOG after its last line only; if the
# text begins with a '#' line (e.g. a shebang) the timeout prefix has no
# command to run -- confirm this is intended.
bash -c "\
ds=\$(date --utc +%Y-%m-%d:%H:%M:%S); \
de=\$(date --utc -d '$SECS sec' +%Y-%m-%d:%H:%M:%S); \
echo $JOBNAME pid \$$ started \$ds ends \$de >>.job_database; \
\
${CTIM} ${CMD} &>$LOG; \
excode=\$?; \
\
de=\$(date --utc +%Y-%m-%d:%H:%M:%S); \
echo $JOBNAME pid \$$ ended \$de exitcode \$excode >>.job_database;" &

#submission info
# $! is the pid of the background bash started above; it serves as the job id.
pid=$!
dsub=$(date --utc +%Y-%m-%d:%H:%M:%S)
echo $JOBNAME pid $pid submitted $dsub >>.job_database
echo "Submitted batch job "$pid
4 changes: 4 additions & 0 deletions ush/rocoto_fake_slurm/scancel
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Emulates slurm's scancel
#
# Usage: scancel JOBID
#   Sends SIGKILL to the process group whose id is JOBID.

# Refuse to run without a job id instead of handing a bare "-" to kill.
if [ -z "$1" ]; then
  echo "usage: scancel JOBID" >&2
  exit 1
fi

# The leading "-" makes kill address a process group rather than one process.
# NOTE(review): this assumes the job pid is a process-group leader -- confirm
# against how sbatch starts jobs.
exec kill -9 "-$1"
4 changes: 4 additions & 0 deletions ush/rocoto_fake_slurm/sinfo
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Emulates slurm's sinfo by printing CPU information for the local machine.

# lscpu is the original behaviour and is used whenever it is installed.
if command -v lscpu >/dev/null 2>&1; then
  exec lscpu
fi

# lscpu ships with util-linux and is normally absent on macOS; report the
# CPU count through sysctl there instead.
exec sysctl hw.ncpu
42 changes: 42 additions & 0 deletions ush/rocoto_fake_slurm/squeue
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash

# Emulates slurm's squeue
#
# Usage:
#   squeue --jobs=PID[,PID...]   report only the listed job ids
#   squeue [anything else]       report every job recorded as "submitted"
#
# Job records are read from the file .job_database in the current working
# directory and are looked up by the strings "pid <PID> submitted",
# "pid <PID> started" and "pid <PID> ended".

DB=.job_database

# Start from an empty list so that a PIDS value inherited from the
# environment is never reported by accident.
PIDS=""
if [[ "$1" = "--jobs="* ]]; then
  # Drop the 7-character "--jobs=" prefix and turn the comma-separated list
  # into a space-separated one. The replacement is a bare space on purpose:
  # writing it as ' ' inside the double quotes leaves literal quote
  # characters in the result on older bash releases (4.2 and earlier, see
  # the compat42 shell option).
  PIDS="${1:7}"
  PIDS="${PIDS//,/ }"
elif [[ -f "$DB" ]]; then
  PIDS=$(grep submitted "$DB" | sort -u -k1,1 | awk '{print $3}')
fi

# Output info the way rocoto calls squeue
FMT="%-40s%-40s%-10s%-20s%-30s%-30s%-30s%-30s%-10s%-30s%-200s\n"
printf "$FMT" JOBID USER CPUS PARTITION SUBMIT_TIME START_TIME END_TIME PRIORITY EXIT_CODE STATE NAME

for pid in ${PIDS}; do

  # Defaults reported for a job that has no record at all.
  t_sub="N/A"
  t_start=$t_sub
  t_end=$t_sub
  name=$pid
  user=${USER:-user}
  exitc=0
  state="UNKNOWN"

  # Without a database every requested job simply stays UNKNOWN.
  if [[ -f "$DB" ]]; then
    # submitted record: field 1 is the job name, field 5 the submit time
    v=$(grep "pid $pid submitted" "$DB" | awk '{print $1" "$5}')
    if [[ -n "$v" ]]; then
      state="PENDING"
      read -r name t_sub <<< "$v"
      # started record: field 5 is the start time, field 7 the planned end
      v=$(grep "pid $pid started" "$DB" | awk '{print $5" "$7}')
      if [[ -n "$v" ]]; then
        state="RUNNING"
        read -r t_start t_end <<< "$v"
      fi
      # ended record: field 5 is the end time, field 7 the exit code
      v=$(grep "pid $pid ended" "$DB" | awk '{print $5" "$7}')
      if [[ -n "$v" ]]; then
        state="COMPLETED"
        read -r t_end exitc <<< "$v"
      fi
    fi
  fi

  # Arguments are quoted so that an empty value cannot shift later columns.
  printf "$FMT" "$pid" "$user" 1 linux "$t_sub" "$t_start" "$t_end" 0.1 "$exitc" "$state" "$name"
done
Loading

0 comments on commit 70da0e8

Please sign in to comment.