-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Ray Tune checkpointing fix, allow LR schedules for non-PCGrad opt, an…
…d more. (#142) * feat: add option to include SLURM jobid name in training dir * feat: add command-line option to enable horovod * feat: Use comet offline logging in Ray Tune runs * fix: bug in raytune command * fix: handle TF version-dependent names of the legacy optimizer * feat: add event and met losses to raytune search space * feat: added sbatch script for Horovod training on JURECA * fix: Ray Tune checkpoint saving and loading * feat: allow lr schedules when not using PCGrad * chore: add print of loaded opt weights * fix: handle TF version-dependent names of the legacy optimizer Former-commit-id: deb05ea
- Loading branch information
Showing
6 changed files
with
224 additions
and
90 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/sh
# SLURM batch script: single-node, 4-GPU Horovod training of the MLPF
# pipeline on JURECA (raise-ctp2 project). Usage:
#   sbatch <this-script> <config.yaml> <prefix>
# $1 = training config file passed to pipeline.py -c
# $2 = prefix/output name passed to pipeline.py -p

#SBATCH --account=raise-ctp2
#SBATCH --partition=dc-gpu
#SBATCH --time 24:00:00
#SBATCH --nodes 1
#SBATCH --tasks-per-node=1
#SBATCH --gres=gpu:4

# Job name
#SBATCH -J pipehorovod

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output so the exact submission is preserved in the log.
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module --force purge
module load Stages/2022
module load GCC GCCcore/.11.2.0 CMake NCCL CUDA cuDNN OpenMPI

export CUDA_VISIBLE_DEVICES=0,1,2,3
jutil env activate -p raise-ctp2

sleep 1
nvidia-smi

source /p/project/raise-ctp2/cern/miniconda3/bin/activate tf2
echo "Python used:"
which python3
python3 --version

# Dump the SLURM environment for post-mortem debugging of placement issues.
echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
echo "DEBUG: SLURM_NODEID: $SLURM_NODEID"
echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID"
echo "DEBUG: SLURM_PROCID: $SLURM_PROCID"
echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"

export NCCL_DEBUG=INFO
export OMP_NUM_THREADS=1
# Match OpenMP threads to the CPUs SLURM allocated per task, if any.
# NOTE: the original used `[ "$SLURM_CPUS_PER_TASK" > 0 ]`, where `>` is a
# shell redirection (it created a file named "0" and the test was effectively
# always true). Use a numeric -gt comparison with a safe default instead.
if [ "${SLURM_CPUS_PER_TASK:-0}" -gt 0 ] ; then
  export OMP_NUM_THREADS="$SLURM_CPUS_PER_TASK"
fi
echo "$OMP_NUM_THREADS"

echo 'Starting training.'
srun --cpu-bind=none python mlpf/pipeline.py train -c "$1" -p "$2" --comet-offline -j "$SLURM_JOBID" -m
echo 'Training done.'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.