-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #77 from erwulff/raytune
Raytune Former-commit-id: 30237f5
- Loading branch information
Showing
24 changed files
with
1,448 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/bin/bash
# Find a suitable learning rate for the MLPF model on one 4-GPU A100 node.
# Usage: sbatch <this script> <config.yaml>
# Note: bash (not sh) is required for the brace-free rsync excludes below and
# keeps this consistent with the other job scripts in this commit.

#SBATCH -t 1:00:00
#SBATCH -N 1
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 4
#SBATCH --constraint=a100

# Job name
#SBATCH -J find_lr

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# Copy the working tree to node-local scratch so training I/O does not hit
# the shared filesystem; .git and previous experiments are not needed.
mkdir "$TMPDIR/particleflow"
rsync -ar --exclude=".git" --exclude="experiments" . "$TMPDIR/particleflow"
if cd "$TMPDIR/particleflow"; then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi

python3 tf_list_gpus.py

CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py find-lr -c "$1"

# Copy the learning-rate scan plot back to where the job was submitted from.
cp lr_finder.jpg "$SLURM_SUBMIT_DIR/"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#!/bin/bash
# Multi-node Keras-Tuner style hyperparameter search: start a "chief" on the
# head node and one tuner worker on every other allocated node.
# Usage: sbatch <this script> <config.yaml> <run-suffix>
# Note: this script uses bash arrays, [[ ]] and <<< — the original
# #!/bin/sh shebang was wrong; bash is required.

#SBATCH -t 168:00:00
#SBATCH -N 4
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100,sxm4
#SBATCH --gpus-per-task=4

# Job name
#SBATCH -J hypertune

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# Getting the node hostnames
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

echo "$nodes"

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# If the head node reports several addresses (space-separated, e.g. IPv6 and
# IPv4), pick the IPv4 one. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<<"$head_node_ip"
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        head_node_ip=${ADDR[1]}
    else
        head_node_ip=${ADDR[0]}
    fi
    echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# Shared output directory for this search, unique per run-suffix and job id.
tune_dir="/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}"

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
    mlpf/hypertune_scripts/run_chief.sh "chief" "$head_node_ip" "$port" "$1" "$tune_dir" \
    &> "logs_slurm/chief_${SLURM_JOB_ID}" &
sleep 5

# Number of nodes other than the head node.
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    tunerID="tuner$i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        mlpf/hypertune_scripts/run_tuner.sh "$tunerID" "$head_node_ip" "$port" "$1" "$tune_dir" \
        &> "logs_slurm/tuner_${SLURM_JOB_ID}_${i}" &
    sleep 1
done
wait # keep the wait statement, it is important
echo "Done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#!/bin/bash
# Evaluate a trained MLPF model on a single A100 GPU.
# Usage: sbatch <this script> <config.yaml> <training-dir>

# Walltime limit
#SBATCH -t 03:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 1
#SBATCH --constraint=a100

# Job name
#SBATCH -J pipeeval

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

python3 tf_list_gpus.py

echo 'Starting evaluation.'
CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c "$1" -t "$2"
echo 'Evaluation done.'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/bin/bash
# Train the MLPF model on one 4-GPU A100 node, working from node-local
# scratch, then sync the results back to shared storage.
# Usage: sbatch <this script> <config.yaml> <prefix>
# Note: bash (not sh) — the rsync excludes and conventions match the other
# job scripts in this commit.

# Walltime limit
#SBATCH -t 168:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=4
#SBATCH --constraint=a100,sxm4

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# Copy the working tree to node-local scratch; .git and old experiments are
# excluded to keep the copy small.
mkdir "$TMPDIR/particleflow"
rsync -ar --exclude=".git" --exclude="experiments" . "$TMPDIR/particleflow"
if cd "$TMPDIR/particleflow"; then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi
mkdir experiments

python3 tf_list_gpus.py

echo 'Starting training.'
# Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c "$1" -p "$2"
echo 'Training done.'

# Persist results from node-local scratch to shared storage.
rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#!/bin/bash
# Run a Ray Tune hyperparameter search across 4 nodes: start a Ray head on
# the first node, Ray workers on the rest, then launch the tuning driver.
# Usage: sbatch <this script> <config.yaml> <run-name>

#SBATCH -t 168:00:00
#SBATCH -N 4
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100,sxm4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=64

# Job name
#SBATCH -J raytune

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

export TUNE_RESULT_DIR="/mnt/ceph/users/ewulff/ray_results/tune_result_dir"
# Allow one pending trial per node so the scheduler keeps all nodes busy.
export TUNE_MAX_PENDING_TRIALS_PG=${SLURM_NNODES}

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

################ DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ################
# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password
echo "Redis password: ${redis_password}"

nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
nodes_array=($nodes)

node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # making redis-address
port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "STARTING HEAD at $node_1"
srun --nodes=1 --ntasks=1 -w "$node_1" mlpf/flatiron/start-head.sh "$ip" "$SLURM_JOB_ID" "$2" &
sleep 30

worker_num=$((SLURM_JOB_NUM_NODES - 1)) # number of nodes other than the head node
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "STARTING WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" mlpf/flatiron/start-worker.sh "$ip_head" "$SLURM_JOB_ID" "$i" "$2" &
    sleep 5
done
##############################################################################################

#### call your code below
python3 mlpf/pipeline.py raytune -c "$1" -n "$2" --cpus "${SLURM_CPUS_PER_TASK}" --gpus "${SLURM_GPUS_PER_TASK}"
exit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#!/bin/bash
# Start the Ray head node and log per-second GPU utilization alongside it.
# Arguments: $1 = head node IP, $2 = SLURM job id, $3 = run name.

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

echo "starting ray head node"
# Launch the head node; GPU telemetry is sampled every second in the
# background for post-hoc utilization analysis.
mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/${3}_${2}"
nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/${3}_${2}/head.csv" &
ray start --head --node-ip-address="$1" --port=6379
# Keep the srun task alive so the Ray head is not torn down.
sleep infinity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/bin/bash
# Start a Ray worker node and log per-second GPU utilization alongside it.
# Arguments: $1 = head address (ip:port), $2 = SLURM job id,
#            $3 = worker index, $4 = run name.

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

echo "starting ray worker node"
mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/${4}_${2}"
nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/${4}_${2}/worker_${3}.csv" &
ray start --address "$1"
# Keep the srun task alive so the Ray worker is not torn down.
sleep infinity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/bin/bash
# Train the base GNN model via the legacy launcher on one 4-GPU A100 node,
# working from node-local scratch, then sync results to shared storage.
# Usage: sbatch <this script> <model-spec>
# Note: bash (not sh) is required for the rsync excludes below.

# Walltime limit
#SBATCH -t 72:00:00
#SBATCH -N 1
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 4
#SBATCH --constraint=a100

# Job name
#SBATCH -J train

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat "$0"
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# Copy the working tree to node-local scratch; .git and old experiments are
# excluded to keep the copy small.
mkdir "$TMPDIR/particleflow"
rsync -ar --exclude=".git" --exclude="experiments" . "$TMPDIR/particleflow"
if cd "$TMPDIR/particleflow"; then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi

python3 tf_list_gpus.py

echo 'Starting training.'
# Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action train --model-spec "$1"
echo 'Training done.'
ls -l experiments

# Persist results from node-local scratch to shared storage.
rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/
Oops, something went wrong.