Merge pull request #77 from erwulff/raytune
Raytune

Former-commit-id: 30237f5
jpata authored Sep 3, 2021
2 parents c3178e7 + 4cdafc7 commit ddd605b
Showing 24 changed files with 1,448 additions and 44 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -18,7 +18,7 @@ jobs:
sudo apt install python3 python3-pip wget
sudo python3 -m pip install --upgrade pip
sudo python3 -m pip install --upgrade setuptools
- sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click
+ sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]'
- name: Run delphes TF model
run: ./scripts/local_test_delphes_pipeline.sh

@@ -31,7 +31,7 @@ jobs:
sudo apt install python3 python3-pip wget
sudo python3 -m pip install --upgrade pip
sudo python3 -m pip install --upgrade setuptools
- sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click
+ sudo python3 -m pip install tensorflow==2.4 setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 keras-tuner networkx tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 tqdm click 'ray[default]' 'ray[tune]'
- name: Run CMS TF model using the pipeline
run: ./scripts/local_test_cms_pipeline.sh

45 changes: 45 additions & 0 deletions mlpf/flatiron/find_lr_4GPUs.slurm
@@ -0,0 +1,45 @@
#!/bin/sh

#SBATCH -t 1:00:00
#SBATCH -N 1
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 4
#SBATCH --constraint=a100

# Job name
#SBATCH -J find_lr

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

mkdir $TMPDIR/particleflow
rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow
cd $TMPDIR/particleflow
if [ $? -eq 0 ]
then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi

python3 tf_list_gpus.py

CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py find-lr -c $1

cp lr_finder.jpg $SLURM_SUBMIT_DIR/
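
A hypothetical submission of this script from the repository root (the config path below is a placeholder; the script reads the config file from its first positional argument $1, and logs_slurm/ must exist because the #SBATCH -o/-e paths point into it):

# hypothetical example invocation, run from the repository root
mkdir -p logs_slurm
sbatch mlpf/flatiron/find_lr_4GPUs.slurm parameters/delphes.yaml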
73 changes: 73 additions & 0 deletions mlpf/flatiron/hypertune.slurm
@@ -0,0 +1,73 @@
#!/bin/sh

#SBATCH -t 168:00:00
#SBATCH -N 4
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100,sxm4
#SBATCH --gpus-per-task=4

# Job name
#SBATCH -J hypertune

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1

source ~/miniconda3/bin/activate tf2
which python3
python3 --version


# Getting the node hostnames
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

echo $nodes

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
head_node_ip=${ADDR[1]}
else
head_node_ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" mlpf/hypertune_scripts/run_chief.sh "chief" $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/chief_${SLURM_JOB_ID} &
sleep 5

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    tunerID="tuner$i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        mlpf/hypertune_scripts/run_tuner.sh $tunerID $head_node_ip $port $1 "/mnt/ceph/users/ewulff/hypertunes/hypertune_out${2}_${SLURM_JOB_ID}" &> logs_slurm/tuner_${SLURM_JOB_ID}_${i} &
    sleep 1
done
wait # keep this wait; without it the batch job exits while the chief and tuners are still running
echo "Done."
36 changes: 36 additions & 0 deletions mlpf/flatiron/pipeline_evaluate_1GPU.slurm
@@ -0,0 +1,36 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 03:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 1
#SBATCH --constraint=a100

# Job name
#SBATCH -J pipeeval

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

python3 tf_list_gpus.py

echo 'Starting evaluation.'
CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2
echo 'Evaluation done.'
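
A hypothetical invocation: -c takes the config file and -t the directory of a completed training; both paths below are placeholders.

sbatch mlpf/flatiron/pipeline_evaluate_1GPU.slurm parameters/delphes.yaml experiments/my_train_dir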
51 changes: 51 additions & 0 deletions mlpf/flatiron/pipeline_train_4GPUs.slurm
@@ -0,0 +1,51 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 168:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=4
#SBATCH --constraint=a100,sxm4

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

mkdir $TMPDIR/particleflow
rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow
cd $TMPDIR/particleflow
if [ $? -eq 0 ]
then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi
mkdir experiments

python3 tf_list_gpus.py

echo 'Starting training.'
# Train the base GNN model, here using 4 GPUs in data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2
echo 'Training done.'

rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/
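
A hypothetical invocation: judging from the pipeline call, $1 is the config file (-c) and $2 is forwarded to -p; that -p names a prefix for the experiment directory is an assumption, and both values below are placeholders.

sbatch mlpf/flatiron/pipeline_train_4GPUs.slurm parameters/cms.yaml myprefix_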
68 changes: 68 additions & 0 deletions mlpf/flatiron/raytune.sh
@@ -0,0 +1,68 @@
#!/bin/bash

#SBATCH -t 168:00:00
#SBATCH -N 4
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100,sxm4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=64

# Job name
#SBATCH -J raytune

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

export TUNE_RESULT_DIR="/mnt/ceph/users/ewulff/ray_results/tune_result_dir"
export TUNE_MAX_PENDING_TRIALS_PG=${SLURM_NNODES}

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version


################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password
echo "Redis password: ${redis_password}"

nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
nodes_array=( $nodes )

node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # making redis-address
port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "STARTING HEAD at $node_1"
srun --nodes=1 --ntasks=1 -w $node_1 mlpf/flatiron/start-head.sh $ip $SLURM_JOB_ID $2 &
sleep 30

worker_num=$(($SLURM_JOB_NUM_NODES - 1)) # number of nodes other than the head node
for (( i=1; i<=$worker_num; i++ ))
do
    node_i=${nodes_array[$i]}
    echo "STARTING WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w $node_i mlpf/flatiron/start-worker.sh $ip_head $SLURM_JOB_ID $i $2 &
    sleep 5
done
##############################################################################################

#### call your code below
python3 mlpf/pipeline.py raytune -c $1 -n $2 --cpus "${SLURM_CPUS_PER_TASK}" --gpus "${SLURM_GPUS_PER_TASK}"
exit
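
A hypothetical submission: $1 is the config file passed to -c, and $2 is a name for the tuning run, forwarded to -n and into the log directories used by start-head.sh and start-worker.sh. Both values below are placeholders.

sbatch mlpf/flatiron/raytune.sh parameters/cms.yaml my_raytune_run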
11 changes: 11 additions & 0 deletions mlpf/flatiron/start-head.sh
@@ -0,0 +1,11 @@
#!/bin/bash

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

echo "starting ray head node"
# Launch the head node
mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2"
nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$3_$2/head.csv" &
ray start --head --node-ip-address=$1 --port=6379
sleep infinity
10 changes: 10 additions & 0 deletions mlpf/flatiron/start-worker.sh
@@ -0,0 +1,10 @@
#!/bin/bash

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

echo "starting ray worker node"
mkdir -p "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2"
nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f "/mnt/ceph/users/ewulff/nvidia_smi_logs/$4_$2/worker_$3.csv" &
ray start --address $1
sleep infinity
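
Both start scripts also log per-GPU utilization to CSV once per second via nvidia-smi. One way to inspect such a log after the run (the path below is a placeholder for an actual run directory):

# pretty-print the comma-separated GPU utilization log
column -s, -t < /mnt/ceph/users/ewulff/nvidia_smi_logs/myrun_12345/worker_1.csv | less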
50 changes: 50 additions & 0 deletions mlpf/flatiron/train_4GPUs.slurm
@@ -0,0 +1,50 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 72:00:00
#SBATCH -N 1
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 4
#SBATCH --constraint=a100

# Job name
#SBATCH -J train

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module purge
module load slurm gcc cuda/11.1.0_455.23.05 cudnn/v8.0.4-cuda-11.1
nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

mkdir $TMPDIR/particleflow
rsync -ar --exclude={".git","experiments"} . $TMPDIR/particleflow
cd $TMPDIR/particleflow
if [ $? -eq 0 ]
then
    echo "Successfully changed directory"
else
    echo "Could not change directory" >&2
    exit 1
fi

python3 tf_list_gpus.py

echo 'Starting training.'
# Train the base GNN model, here using 4 GPUs in data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/launcher.py --action train --model-spec $1
echo 'Training done.'
ls -l experiments

rsync -a experiments/ /mnt/ceph/users/ewulff/experiments/
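
A hypothetical submission: this script drives the launcher.py entry point instead of pipeline.py, taking a model spec as its first positional argument (the file name below is a placeholder).

sbatch mlpf/flatiron/train_4GPUs.slurm parameters/cms-gnn-dense.yaml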