Merge pull request #77 from erwulff/raytune
Raytune

Former-commit-id: 30237f5
jpata authored Sep 3, 2021
2 parents c3178e7 + 4cdafc7 commit ddd605b
Showing 24 changed files with 1,448 additions and 44 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -18,7 +18,7 @@ jobs:
sudo apt install python3 python3-pip wget
sudo python3 -m pip install --upgrade pip
sudo python3 -m pip install --upgrade setuptools
- sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click
+ sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]'
- name: Run delphes TF model
run: ./scripts/local_test_delphes_pipeline.sh

@@ -31,7 +31,7 @@ jobs:
sudo apt install python3 python3-pip wget
sudo python3 -m pip install --upgrade pip
sudo python3 -m pip install --upgrade setuptools
- sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click
+ sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]'
- name: Run CMS TF model using the pipeline
run: ./scripts/local_test_cms_pipeline.sh

45 changes: 45 additions & 0 deletions mlpf/flatiron/find_lr_4GPUs.slurm
@@ -0,0 +1,45 @@
#!/bin/sh

#SBATCH -t 1:00:00
#SBATCH -N 1
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 4
#SBATCH --constraint=a100

# Job name
#SBATCH -J find_lr

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

mkdir $TMPDIR/particleflow
rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow
cd $TMPDIR/particleflow
if [ $? -eq 0 ]
then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi

python3 tf_list_gpus.py

CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py find-lr -c $1

cp lr_finder.jpg $SLURM_SUBMIT_DIR/
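
A hypothetical submission of this script from the repository root (the config path below is a placeholder; the script reads the config file from its first positional argument $1, and logs_slurm/ must exist because the #SBATCH -o/-e paths point into it):

# hypothetical example invocation, run from the repository root
mkdir -p logs_slurm
sbatch mlpf/flatiron/find_lr_4GPUs.slurm parameters/delphes.yaml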
73 changes: 73 additions & 0 deletions mlpf/flatiron/hypertune.slurm
@@ -0,0 +1,73 @@
#!/bin/sh

#SBATCH -t 168:00:00
#SBATCH -N 4
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100,sxm4
#SBATCH --gpus-per-task=4

# Job name
#SBATCH -J hypertune

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1

source ~/miniconda3/bin/activate tf2
which python3
python3 --version


# Getting the node hostnames
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

echo $nodes

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
head_node_ip=${ADDR[1]}
else
head_node_ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} &
sleep 5

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    tunerID="tuner$i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} &
    sleep 1
done
wait # keep this wait; without it the batch job exits while the chief and tuners are still running
echo "Done."
36 changes: 36 additions & 0 deletions mlpf/flatiron/pipeline_evaluate_1GPU.slurm
@@ -0,0 +1,36 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 03:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 1
#SBATCH --constraint=a100

# Job name
#SBATCH -J pipeeval

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

python3 tf_list_gpus.py

echo 'Starting evaluation.'
CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2
echo 'Evaluation done.'
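
A hypothetical invocation: -c takes the config file and -t the directory of a completed training; both paths below are placeholders.

sbatch mlpf/flatiron/pipeline_evaluate_1GPU.slurm parameters/delphes.yaml experiments/my_train_dir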
51 changes: 51 additions & 0 deletions mlpf/flatiron/pipeline_train_4GPUs.slurm
@@ -0,0 +1,51 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 168:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=4
#SBATCH --constraint=a100,sxm4

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

mkdir $TMPDIR/particleflow
rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow
cd $TMPDIR/particleflow
if [ $? -eq 0 ]
then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi
mkdir experiments

python3 tf_list_gpus.py

echo 'Starting training.'
# Train the base GNN model, here using 4 GPUs in data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2
echo 'Training done.'

rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/
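
A hypothetical invocation: judging from the pipeline call, $1 is the config file (-c) and $2 is forwarded to -p; that -p names a prefix for the experiment directory is an assumption, and both values below are placeholders.

sbatch mlpf/flatiron/pipeline_train_4GPUs.slurm parameters/cms.yaml myprefix_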
68 changes: 68 additions & 0 deletions mlpf/flatiron/raytune.sh
@@ -0,0 +1,68 @@
#!/bin/bash

#SBATCH -t 168:00:00
#SBATCH -N 4
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100,sxm4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=64

# Job name
#SBATCH -J raytune

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

export TUNE_RESULT_DIR="/mnt/ceph/users/ewulff/ray_results/tune_result_dir"
export TUNE_MAX_PENDING_TRIALS_PG=${SLURM_NNODES}

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version


################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password
echo "Redis password: ${redis_password}"

nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
nodes_array=( $nodes )

node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # making redis-address
port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "STARTING HEAD at $node_1"
srun --nodes=1 --ntasks=1 -w $node_1 mlpf/flatiron/start-head.sh $ip $SLURM_JOB_ID $2 &
sleep 30

worker_num=$(($SLURM_JOB_NUM_NODES - 1)) # number of nodes other than the head node
for (( i=1; i<=$worker_num; i++ ))
do
    node_i=${nodes_array[$i]}
    echo "STARTING WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w $node_i mlpf/flatiron/start-worker.sh $ip_head $SLURM_JOB_ID $i $2 &
    sleep 5
done
##############################################################################################

#### call your code below
python3 mlpf/pipeline.py raytune -c $1 -n $2 --cpus "${SLURM_CPUS_PER_TASK}" --gpus "${SLURM_GPUS_PER_TASK}"
exit
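
A hypothetical submission: $1 is the config file passed to -c, and $2 is a name for the tuning run, forwarded to -n and into the log directories used by start-head.sh and start-worker.sh. Both values below are placeholders.

sbatch mlpf/flatiron/raytune.sh parameters/cms.yaml my_raytune_run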
11 changes: 11 additions & 0 deletions mlpf/flatiron/start-head.sh
@@ -0,0 +1,11 @@
#!/bin/bash

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

echo "starting ray head node"
# Launch the head node
mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2"
nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2/head.csv" &
ray start --head --node-ip-address=$1 --port=6379
sleep infinity
10 changes: 10 additions & 0 deletions mlpf/flatiron/start-worker.sh
@@ -0,0 +1,10 @@
#!/bin/bash

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

echo "starting ray worker node"
mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2"
nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2/worker_$3.csv" &
ray start --address $1
sleep infinity
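
Both start scripts also log per-GPU utilization to CSV once per second via nvidia-smi. One way to inspect such a log after the run (the path below is a placeholder for an actual run directory):

# pretty-print the comma-separated GPU utilization log
column -s, -t < /mnt/ceph/users/ewulff/nvidia_smi_logs/myrun_12345/worker_1.csv | less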
50 changes: 50 additions & 0 deletions mlpf/flatiron/train_4GPUs.slurm
@@ -0,0 +1,50 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 72:00:00
#SBATCH -N 1
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 4
#SBATCH --constraint=a100

# Job name
#SBATCH -J train

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

mkdir $TMPDIR/particleflow
rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow
cd $TMPDIR/particleflow
if [ $? -eq 0 ]
then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi

python3 tf_list_gpus.py

echo 'Starting training.'
# Train the base GNN model, here using 4 GPUs in data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action train --model-spec $1
echo 'Training done.'
ls -l experiments

rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/
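
A hypothetical submission: this script drives the launcher.py entry point instead of pipeline.py, taking a model spec as its first positional argument (the file name below is a placeholder).

sbatch mlpf/flatiron/train_4GPUs.slurm parameters/cms-gnn-dense.yaml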