-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* fix: error in raytune search space * enable best practice settings for optimal model performance * only max out NVIDIA L2 cache if GPUs are found * enable benchmarking callback for clic dataset schema * feat: configure tensorboard profiling from config file * Update eval script on flatiron * Update training batch script on flatiron * Update raytune batch script on flatiron * Add batch scripts for 8 GPU training on flatiron * Update raytune search space file * Setting numpy==1.23.5 in requirements.txt Later versions are incompatible with tf2onnx 1.14.0 (latest at time of commit) * Add inference command in TF pipeline script * Additional flatiron batch scripts Former-commit-id: 23fa559
- Loading branch information
Showing
5 changed files
with
368 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/bash
# SLURM batch script: benchmark MLPF inference over a sweep of batch sizes.
# NOTE(review): shebang must be bash (not sh) — the script uses bash arrays below.

# Walltime limit
#SBATCH -t 10:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1

#SBATCH --mem=1000G
#SBATCH --cpus-per-task=112
#SBATCH -p eval
#SBATCH --constraint=sapphire
#SBATCH -w worker6302

# Alternative GPU-partition settings, kept for reference:
# #SBATCH -p gpu
# #SBATCH --constraint=v100
# #SBATCH --gpus-per-task=1
# #SBATCH -w workergpu094

# Job name
#SBATCH -J pipeinfer

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output so the exact submitted script is archived with the log
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module --force purge; module load modules/2.1.1-20230405
module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7

nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# make tensorflow find cupti (needed for profiling)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/

# Trained model directory whose weights are used for inference
train_dir="/mnt/ceph/users/ewulff/particleflow/experiments/bsm10_1GPU_clic-gnn-tuned-v130_20230724_035617_375578.workergpu037"

# export CUDA_VISIBLE_DEVICES=0

## batch sizes to benchmark, largest first
declare -a bs=(1024 512 256 128 64 32 16 8 4 2 1)

## run one inference benchmark per batch size
for i in "${bs[@]}"; do
  echo 'Starting inference.'
  python3 mlpf/pipeline.py infer \
    --train-dir "$train_dir" \
    --nevents 4000 \
    --bs "$i" \
    --num-runs 11 \
    -o "/mnt/ceph/users/ewulff/particleflow/inference_tests/results_${SLURMD_NODENAME}.json"
  echo 'Inference done.'
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/bin/bash
# SLURM batch script: single-GPU MLPF training run.
# Usage: sbatch <this-script> <config.yaml> <prefix>
#   $1 - path to the training config file (passed to `pipeline.py train -c`)
#   $2 - experiment prefix (passed to `-p`)
# NOTE(review): shebang changed to bash — `source` is a bashism not guaranteed
# under POSIX sh, and the sibling batch scripts already use bash.

# Walltime limit
#SBATCH -t 7-00:00:00
#SBATCH -N 1
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=1
#SBATCH --constraint=a100-80gb,ib
#SBATCH --mem=200G

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output so the exact submitted script is archived with the log
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module --force purge; module load modules/2.1.1-20230405
module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# make tensorflow find cupti (needed for profiling)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/

# Dedicated GPU threads for TF input pipelines (performance best practice)
export TF_GPU_THREAD_MODE=gpu_private
export TF_GPU_THREAD_COUNT=2
nvidia-smi

echo 'Starting training.'
# Quote positional params so paths with spaces/globs don't break the command line
CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py train -c "$1" -p "$2" \
  --seeds --comet-exp-name particleflow-tf-clic
echo 'Training done.'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/bin/bash
# SLURM batch script: multi-node Ray Tune hyperparameter search (8 nodes x 4 GPUs).
# Usage: sbatch <this-script> <config.yaml> <experiment-name>
#   $1 - path to the config file (passed to `pipeline.py raytune -c`)
#   $2 - experiment name (passed to `-n`)

#SBATCH -t 168:00:00
#SBATCH -N 8
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100,ib
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=64

# Job name
#SBATCH -J raytune

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output so the exact submitted script is archived with the log
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

set -x

export TUNE_RESULT_DIR="/mnt/ceph/users/ewulff/ray_results/tune_result_dir"
# Allow up to 2 pending trials per node to keep the cluster saturated
export TUNE_MAX_PENDING_TRIALS_PG=$((SLURM_NNODES*2))

module --force purge; module load modules/2.0-20220630
module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
nvidia-smi

export CUDA_VISIBLE_DEVICES=0,1,2,3
num_gpus=4

source ~/miniconda3/bin/activate tf2
echo "Python used:"
which python3
python3 --version

################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# This script is a modification to the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password
echo "Redis password: ${redis_password}"

nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
# Intentional word-splitting: one whitespace-separated hostname per array element
nodes_array=( $nodes )

node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address
port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "STARTING HEAD at $node_1"
# NOTE(review): --node-ip-address is given the hostname, not $ip — works when the
# hostname resolves on all nodes; confirm before changing.
srun --nodes=1 --ntasks=1 -w "$node_1" \
  ray start --head --node-ip-address="$node_1" --port=$port \
  --num-cpus $((SLURM_CPUS_PER_TASK)) --num-gpus "$num_gpus" --block & # mlpf/raytune/start-head.sh $ip $port &

# Give the head node time to come up before attaching workers
sleep 10

worker_num=$(($SLURM_JOB_NUM_NODES - 1)) # number of nodes other than the head node
for (( i=1; i<=$worker_num; i++ )); do
  node_i=${nodes_array[$i]}
  echo "STARTING WORKER $i at $node_i"
  srun --nodes=1 --ntasks=1 -w "$node_i" \
    ray start --address "$node_1":"$port" \
    --num-cpus $((SLURM_CPUS_PER_TASK)) --num-gpus "$num_gpus" --block & # mlpf/raytune/start-worker.sh $ip_head &
  sleep 5
done

echo All Ray workers started.
##############################################################################################

#### call your code below
# Each trial gets 1 GPU and a quarter of the node's CPUs (4 GPUs per node)
python3 mlpf/pipeline.py raytune -c "$1" -n "$2" --cpus $((SLURM_CPUS_PER_TASK/4)) \
  --gpus 1 --seeds --comet-exp-name particleflow-raytune
# --ntrain 100 --ntest 100 #--comet-online

exit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.