-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
TF perf tuning, CLIC benchmarks, flatiron scripts (#185)
* fix: error in raytune search space
* enable best practice settings for optimal model performance
* only max out NVIDIA L2 cache if GPUs are found
* enable benchmarking callback for clic dataset schema
* feat: configure tensorboard profiling from config file
* Update eval script on flatiron
* Update training batch script on flatiron
* Update raytune batch script on flatiron
* Add batch scripts for 8 GPU training on flatiron
* Update raytune search space file
* Set numpy==1.23.5 in requirements.txt; later versions are incompatible with tf2onnx 1.14.0 (the latest release at the time of this commit)
- Loading branch information
Showing
10 changed files
with
209 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#!/bin/bash
#
# Slurm batch script: runs `mlpf/pipeline.py train` on a single exclusive
# node with 8 GPUs, using the `tf2` conda environment.
#
# Usage: sbatch <this script> <config> <prefix>
#   $1  value forwarded to `pipeline.py train -c`
#       (presumably the training config file -- confirm against pipeline.py)
#   $2  value forwarded to `pipeline.py train -p`
#       (presumably the experiment prefix -- confirm against pipeline.py)
#
# The script path `mlpf/pipeline.py` is relative, so the job must start in
# the directory that contains `mlpf/`.
#
# bash (not sh) is required: `source <file> <args>` below is not POSIX.

# Walltime limit
#SBATCH -t 2-00:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=8
#SBATCH --constraint=h100,ib

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat -- "$0"
echo "################# End of job submission script. #########################"

# Fail early, before loading modules, if the two required arguments are missing.
if [ "$#" -lt 2 ]; then
  echo "Usage: sbatch <this script> <config> <prefix>" >&2
  exit 2
fi

# module --force purge; module load modules/2.0-20220630
# module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7

module --force purge; module load modules/2.1.1-20230405
module load slurm gcc cmake nccl cuda/12.0.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7

nvidia-smi

# Activate the conda environment and record which interpreter is in use.
source ~/miniconda3/bin/activate tf2
command -v python3
python3 --version

echo 'Starting training.'
# Run the training of the base GNN model using e.g. 8 GPUs in a data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 mlpf/pipeline.py train -c "$1" -p "$2" \
  --seeds --comet-exp-name particleflow-tf-clic
status=$?

# Propagate a training failure so Slurm does not report the job as COMPLETED.
if [ "$status" -ne 0 ]; then
  echo "Training failed with exit status $status." >&2
  exit "$status"
fi
echo 'Training done.'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/bin/sh
#
# Slurm batch script: runs `mlpf/pipeline.py train` inside the
# `tensorflow_23.05-tf2-py3.sif` Singularity image on a single exclusive
# node with 8 GPUs.
#
# Usage: sbatch <this script> <config> <prefix>
#   $1  value forwarded to `pipeline.py train -c`
#       (presumably the training config file -- confirm against pipeline.py)
#   $2  value forwarded to `pipeline.py train -p`
#       (presumably the experiment prefix -- confirm against pipeline.py)
#
# The image path is relative and the pipeline is resolved from $PWD, so the
# job must start in the directory that contains both the .sif file and `mlpf/`.

# Walltime limit
#SBATCH -t 0-04:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=8
#SBATCH --constraint=h100,ib
# #SBATCH --mem 256G

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat -- "$0"
echo "################# End of job submission script. #########################"

# Fail early, before loading modules, if the two required arguments are missing.
if [ "$#" -lt 2 ]; then
  echo "Usage: sbatch <this script> <config> <prefix>" >&2
  exit 2
fi

export MODULEPATH=/mnt/home/gkrawezik/modules/rocky8:$MODULEPATH
module load cuda/12 cudnn/cuda12 nccl/cuda12 singularity # these names are specific to gkrawezik's modules
nvidia-smi

# ensure CPU is keeping private threads for scheduling operations on the GPUs
# https://www.tensorflow.org/guide/gpu_performance_analysis#2_gpu_host_thread_contention
export TF_GPU_THREAD_MODE=gpu_private
export TF_GPU_THREAD_COUNT=2

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
nvidia-smi

echo 'Starting training.'
# --nv exposes the host NVIDIA driver/GPUs inside the container; -B bind-mounts
# the listed host directories so they are visible inside the container.
singularity run --nv -B /mnt/ceph/users/ewulff/tensorflow_datasets,/mnt/ceph/users/ewulff/particleflow \
  tensorflow_23.05-tf2-py3.sif \
  python3 "$PWD/mlpf/pipeline.py" train -c "$1" -p "$2" \
  --seeds --comet-exp-name particleflow-tf-clic --benchmark_dir exp_dir
status=$?

# Propagate a training failure so Slurm does not report the job as COMPLETED.
if [ "$status" -ne 0 ]; then
  echo "Training failed with exit status $status." >&2
  exit "$status"
fi
echo 'Training done.'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.