TF perf tuning, CLIC benchmarks, flatiron scripts (#185)
* fix: error in raytune search space

* enable best practice settings for optimal model performance

* only max out NVIDIA L2 cache if GPUs are found

* enable benchmarking callback for clic dataset schema

* feat: configure tensorboard profiling from config file

* Update eval script on flatiron

* Update training batch script on flatiron

* Update raytune batch script on flatiron

* Add batch scripts for 8 GPU training on flatiron

* Update raytune search space file

* Pin numpy==1.23.5 in requirements.txt; later versions are incompatible with tf2onnx 1.14.0 (the latest at the time of this commit)
erwulff authored Jul 28, 2023
1 parent 0523950 commit 35fc5d8
Showing 10 changed files with 209 additions and 33 deletions.
24 changes: 18 additions & 6 deletions mlpf/flatiron/pipeline_evaluate_1GPU.slurm
@@ -1,13 +1,13 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 03:00:00
#SBATCH -t 1-00:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus 1
#SBATCH --constraint=a100
#SBATCH --gpus-per-task=1
#SBATCH --constraint=a100-80gb,ib

# Job name
#SBATCH -J pipeeval
@@ -21,14 +21,26 @@ echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

module --force purge; module load modules/1.49-20211101
module load slurm gcc nccl cuda/11.3.1 cudnn/8.2.0.53-11.3 openmpi/4.0.6
module --force purge; module load modules/2.1.1-20230405
module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7

nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# make tensorflow find cupti (needed for profiling)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/

train_dir="experiments/hits_bs16_clic-hits_20230508_064411_129925_RESUMED2_clic-hits_20230522_170633_350485.workergpu064"

echo 'Starting evaluation.'
CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2
CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate \
--train-dir $train_dir
echo 'Evaluation done.'

echo 'Starting plotting.'
CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py plots \
--train-dir $train_dir
echo 'Plotting done.'
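
Note on the CUPTI export above: the Nix store hash in LD_LIBRARY_PATH is deployment-specific, and TensorFlow's profiler needs libcupti at runtime. A minimal sketch (assuming the `tf2` conda environment from this script) to verify the profiler starts cleanly once the path is set:

import tensorflow as tf

# If libcupti is missing from LD_LIBRARY_PATH, startup logs CUPTI errors
# and the resulting trace contains no GPU kernel timelines.
tf.profiler.experimental.start("/tmp/tb_profile")
_ = tf.constant([1.0]) + tf.constant([2.0])  # any op, so there is something to trace
tf.profiler.experimental.stop()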
21 changes: 14 additions & 7 deletions mlpf/flatiron/pipeline_train_4GPUs.slurm
@@ -1,13 +1,13 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 168:00:00
#SBATCH -t 7-0:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=4
#SBATCH --constraint=a100-80gb
#SBATCH --constraint=a100-80gb,ib

# Job name
#SBATCH -J pipetrain
@@ -21,19 +21,26 @@ echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

# module --force purge; module load modules/2.1-alpha1
# module load slurm gcc/11.3.0 nccl cuda/11.8.0 cudnn/8.4.0.27-11.6
module --force purge; module load modules/2.0-20220630
module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
# module --force purge; module load modules/2.0-20220630
# module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7

module --force purge; module load modules/2.1.1-20230405
module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7

nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version

# make tensorflow find cupti (needed for profiling)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/

export TF_GPU_THREAD_MODE=gpu_private
export TF_GPU_THREAD_COUNT=2

echo 'Starting training.'
# Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2 \
--seeds --comet-exp-name particleflow-tf-gen
--seeds --comet-exp-name particleflow-tf-clic
echo 'Training done.'
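
Note: TF_GPU_THREAD_MODE=gpu_private with TF_GPU_THREAD_COUNT=2 reserves dedicated host threads per GPU for kernel scheduling, following TensorFlow's GPU performance guide. The same setting can be applied in Python (as the pipeline.py change below does), but only before TensorFlow initializes its GPU runtime; a minimal sketch:

import os

# Must be set before the first GPU context is created, i.e. before any
# TensorFlow GPU work (safest: before the tensorflow import).
os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
os.environ["TF_GPU_THREAD_COUNT"] = "2"

import tensorflow as tf

print(tf.config.list_physical_devices("GPU"))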
41 changes: 41 additions & 0 deletions mlpf/flatiron/pipeline_train_8GPUs.slurm
@@ -0,0 +1,41 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 2-00:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=8
#SBATCH --constraint=h100,ib

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

# module --force purge; module load modules/2.0-20220630
# module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7

module --force purge; module load modules/2.1.1-20230405
module load slurm gcc cmake nccl cuda/12.0.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7

nvidia-smi

source ~/miniconda3/bin/activate tf2
which python3
python3 --version


echo 'Starting training.'
# Run the training of the base GNN model using e.g. 8 GPUs in a data-parallel mode
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 mlpf/pipeline.py train -c $1 -p $2 \
--seeds --comet-exp-name particleflow-tf-clic
echo 'Training done.'
42 changes: 42 additions & 0 deletions mlpf/flatiron/pipeline_train_8GPUs_singularity.slurm
@@ -0,0 +1,42 @@
#!/bin/sh

# Walltime limit
#SBATCH -t 0-04:00:00
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-task=8
#SBATCH --constraint=h100,ib
# #SBATCH --mem 256G

# Job name
#SBATCH -J pipetrain

# Output and error logs
#SBATCH -o logs_slurm/log_%x_%j.out
#SBATCH -e logs_slurm/log_%x_%j.err

# Add jobscript to job output
echo "#################### Job submission script. #############################"
cat $0
echo "################# End of job submission script. #########################"

export MODULEPATH=/mnt/home/gkrawezik/modules/rocky8:$MODULEPATH
module load cuda/12 cudnn/cuda12 nccl/cuda12 singularity # these names are specific to gkrawezik's modules
nvidia-smi

# ensure CPU is keeping private threads for scheduling operations on the GPUs
# https://www.tensorflow.org/guide/gpu_performance_analysis#2_gpu_host_thread_contention
export TF_GPU_THREAD_MODE=gpu_private
export TF_GPU_THREAD_COUNT=2

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
nvidia-smi

echo 'Starting training.'
singularity run --nv -B /mnt/ceph/users/ewulff/tensorflow_datasets,/mnt/ceph/users/ewulff/particleflow \
tensorflow_23.05-tf2-py3.sif \
python3 $PWD/mlpf/pipeline.py train -c $1 -p $2 \
--seeds --comet-exp-name particleflow-tf-clic --benchmark_dir exp_dir
echo 'Training done.'
4 changes: 2 additions & 2 deletions mlpf/flatiron/raytune.sh
@@ -1,7 +1,7 @@
#!/bin/bash

#SBATCH -t 168:00:00
#SBATCH -N 12
#SBATCH -N 8
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --constraint=a100-80gb,ib
@@ -78,5 +78,5 @@ echo All Ray workers started.

#### call your code below
python3 mlpf/pipeline.py raytune -c $1 -n $2 --cpus "${SLURM_CPUS_PER_TASK}" \
--gpus $num_gpus --seeds --comet-exp-name particleflow-raytune
--gpus $num_gpus --seeds --comet-exp-name particleflow-raytune # --comet-online
exit
27 changes: 25 additions & 2 deletions mlpf/pipeline.py
@@ -17,6 +17,7 @@
from datetime import datetime
from functools import partial
from pathlib import Path
import ctypes

import boost_histogram as bh
import click
@@ -168,6 +169,15 @@ def train(

# tf.debugging.enable_check_numerics()

# Configure GPU threads according to TensorFlow's best practices for optimal model performance
os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
os.environ["TF_GPU_THREAD_COUNT"] = "2"

# According to TensorFlow's best practices for optimal model performance, set GPU memory growth to True
physical_devices = tf.config.list_physical_devices("GPU")
for pd in physical_devices:
tf.config.experimental.set_memory_growth(pd, True)

if seeds:
random.seed(1234)
np.random.seed(1234)
@@ -261,6 +271,17 @@ def train(
with strategy.scope():
model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights)

if num_gpus > 0:
# According to TensorFlow's best practices for optimal model performance,
# max out the L2 fetch granularity to 128 bytes when using NVIDIA GPUs
_libcudart = ctypes.CDLL("libcudart.so")
# Set device limit on the current device
# cudaLimitMaxL2FetchGranularity = 0x05
pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
_libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
_libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
assert pValue.contents.value == 128

with strategy.scope():
callbacks = prepare_callbacks(
config,
@@ -304,8 +325,8 @@ def train(
model.normalizer.variance = tf.convert_to_tensor(cache["variance"])

model.fit(
ds_train.tensorflow_dataset.repeat(),
validation_data=ds_test.tensorflow_dataset.repeat(),
ds_train.tensorflow_dataset.repeat().prefetch(tf.data.AUTOTUNE),
validation_data=ds_test.tensorflow_dataset.repeat().prefetch(tf.data.AUTOTUNE),
epochs=config["setup"]["num_epochs"],
callbacks=callbacks,
steps_per_epoch=ds_train.num_steps(),
@@ -705,6 +726,8 @@ def raytune(
from raytune.search_space import raytune_num_samples, search_space
from raytune.utils import get_raytune_schedule, get_raytune_search_alg

os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1" # don't crash if a metric is missing

if seeds:
# Set seeds for reproducibility
random.seed(1234)
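
Note: the repeat().prefetch(tf.data.AUTOTUNE) change overlaps host-side batch preparation with device compute, and memory growth stops TensorFlow from reserving all GPU memory up front. A toy-dataset sketch of the combined pattern (shapes are illustrative, not the MLPF schema):

import tensorflow as tf

# Allocate GPU memory on demand instead of claiming it all at startup.
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)

# Parallel map + prefetch let tf.data build the next batches while the
# current one trains on the GPU.
ds = (
    tf.data.Dataset.range(1024)
    .map(
        lambda x: (
            tf.reshape(tf.cast(x, tf.float32), [1]),
            tf.reshape(2.0 * tf.cast(x, tf.float32), [1]),
        ),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    .batch(32)
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse")
model.fit(ds, steps_per_epoch=32, epochs=1)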
68 changes: 58 additions & 10 deletions mlpf/raytune/search_space.py
@@ -1,7 +1,7 @@
from ray.tune import grid_search # choice, loguniform, quniform
from ray.tune import choice # grid_search, choice, loguniform, quniform

raytune_num_samples = 1 # Number of random samples to draw from search space. Set to 1 for grid search.
samp = grid_search
raytune_num_samples = 300 # Number of random samples to draw from search space. Set to 1 for grid search.
samp = choice
# search_space = {
# Optimizer parameters
# "lr": samp([1e-4, 1e-3]),
@@ -73,14 +73,48 @@
# # "mask_reg_cls0": choice([False, True]),
# }

# onecycle scan
# # onecycle scan
# search_space = {
# # "lr": samp([1e-4, 1e-3, 1e-2]),
# # "batch_size_physical": samp([24, 40]),
# "batch_multiplier": samp([1, 5, 10]),
# # "model": samp(["gnn_dense", "transformer"]),
# # "lr_schedule": samp(["none", "cosinedecay", "onecycle"]),
# # "optimizer": samp(["pcgrad_adam", "adam", "sgd"]),
# }

# transformer scan
# search_space = {
# # optimizer parameters
# "lr": samp([1e-5, 1e-4, 1e-3]),
# "batch_multiplier": samp([10, 20, 40]),
# # model arch parameters
# "num_layers_encoder": samp([1, 2, 3, 4]), # default is 1
# "num_layers_decoder_reg": samp([1, 2, 3, 4]), # default is 1
# "num_layers_decoder_cls": samp([1, 2, 3, 4]), # default is 1
# "hidden_dim": samp([32, 64, 128]), # default is 64
# "num_heads": samp([8, 16, 32, 64]), # default is 16
# "num_random_features": samp([16, 32, 64, 128]), # default is 32
# # output_decoding parameters
# "out_hidden_dim": samp([128, 256, 512]), # default is ~256
# "out_num_layers": samp([1, 2, 3, 4]), # default is ~2
# }

# gnn scan
search_space = {
# "lr": samp([1e-4, 1e-3, 1e-2]),
# "batch_size_physical": samp([24, 40]),
"batch_multiplier": samp([1, 5, 10]),
# "model": samp(["gnn_dense", "transformer"]),
# "lr_schedule": samp(["none", "cosinedecay", "onecycle"]),
# "optimizer": samp(["pcgrad_adam", "adam", "sgd"]),
# optimizer parameters
"lr": samp([1e-4, 1e-3, 1e-2]),
# "batch_multiplier": samp([10, 20, 40]),
# model arch parameters
"num_graph_layers_id": samp([1, 2, 3, 4, 5, 6]),
"num_graph_layers_reg": samp([1, 2, 3, 4, 5, 6]),
"bin_size": samp([16, 32, 64, 128, 256]),
"output_dim": samp([64, 128, 256]),
"ffn_dist_hidden_dim": samp([64, 128, 256]),
"ffn_dist_num_layers": samp([1, 2, 3, 4, 5]),
# output_decoding parameters
"out_hidden_dim": samp([64, 128, 256, 512]),
"out_num_layers": samp([1, 2, 3, 4, 5]),
}


@@ -217,4 +251,18 @@ def set_raytune_search_parameters(search_space, config):
config["parameters"]["output_decoding"]["phi_num_layers"] = search_space["out_num_layers"]
config["parameters"]["output_decoding"]["energy_num_layers"] = search_space["out_num_layers"]

# transformer specific parameters
if "num_layers_encoder" in search_space.keys():
config["parameters"]["num_layers_encoder"] = search_space["num_layers_encoder"]
if "num_layers_decoder_reg" in search_space.keys():
config["parameters"]["num_layers_decoder_reg"] = search_space["num_layers_decoder_reg"]
if "num_layers_decoder_cls" in search_space.keys():
config["parameters"]["num_layers_decoder_cls"] = search_space["num_layers_decoder_cls"]
if "hidden_dim" in search_space.keys():
config["parameters"]["hidden_dim"] = search_space["hidden_dim"]
if "num_heads" in search_space.keys():
config["parameters"]["num_heads"] = search_space["num_heads"]
if "num_random_features" in search_space.keys():
config["parameters"]["num_random_features"] = search_space["num_random_features"]

return config
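
Note: the switch from grid_search to choice goes together with raytune_num_samples = 300. With choice, Ray Tune draws num_samples random configurations from the space (the full grid above would be 486,000 combinations); with grid_search, num_samples=1 runs every grid point once. A minimal sketch with a stand-in objective (Ray 2.x-era API; the real trainable is the MLPF training run):

from ray import tune

search_space = {
    "lr": tune.choice([1e-4, 1e-3, 1e-2]),
    "bin_size": tune.choice([16, 32, 64, 128, 256]),
}

def objective(config):
    # Stand-in objective; returning a dict reports it as the final result.
    return {"loss": config["lr"] * config["bin_size"]}

tune.run(objective, config=search_space, num_samples=300)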
10 changes: 6 additions & 4 deletions mlpf/tfmodel/model_setup.py
@@ -166,7 +166,7 @@ def prepare_callbacks(
benchmark_dir = outdir
if config["dataset"]["schema"] == "delphes":
bmk_bs = config["train_test_datasets"]["delphes"]["batch_per_gpu"]
elif config["dataset"]["schema"] == "cms":
elif (config["dataset"]["schema"] == "cms") or (config["dataset"]["schema"] == "clic"):
assert (
len(config["train_test_datasets"]) == 1
), "Expected exactly 1 key, physical OR delphes, \
@@ -176,8 +176,8 @@
bmk_bs = config["train_test_datasets"]["physical"]["batch_per_gpu"]
else:
raise ValueError(
"Benchmark callback only supports delphes or \
cms dataset schema. {}".format(
"Benchmark callback only supports delphes \
cms or clic dataset schema. {}".format(
config["dataset"]["schema"]
)
)
@@ -232,7 +232,9 @@ def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, h
write_graph=False,
write_images=False,
update_freq="batch",
# profile_batch=(50,100),
profile_batch=config["callbacks"]["tensorboard"]["profile_batch"]
if "profile_batch" in config["callbacks"]["tensorboard"].keys()
else 0,
dump_history=config["callbacks"]["tensorboard"]["dump_history"],
)
# Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it
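
Note: profile_batch=0 disables the TensorBoard profiler, an integer profiles that single batch, and a (start, stop) pair profiles a batch range. The config lookup above can also be written with dict.get; a sketch against the plain Keras callback (config shape illustrative):

import tensorflow as tf

callbacks_cfg = {"tensorboard": {"dump_history": True, "profile_batch": (50, 100)}}

tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir="logs",
    update_freq="batch",
    # falls back to 0 (profiling off) when the config file omits the key
    profile_batch=callbacks_cfg["tensorboard"].get("profile_batch", 0),
)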
4 changes: 2 additions & 2 deletions mlpf/tfmodel/utils.py
@@ -221,7 +221,7 @@ def get_strategy(num_cpus=None):
num_batches_multiplier = 1
if num_gpus > 1:
num_batches_multiplier = num_gpus
logging.info("Multiple GPUs detected, num_batces_multiplier={}".format(num_batches_multiplier))
logging.info("Multiple GPUs detected, num_batches_multiplier={}".format(num_batches_multiplier))

return strategy, num_gpus, num_batches_multiplier

Expand Down Expand Up @@ -368,7 +368,7 @@ def load_and_interleave(
):
datasets = [mlpf_dataset_from_config(ds_name, config, split, max_events) for ds_name in dataset_names]
ds = interleave_datasets(joint_dataset_name, split, datasets)
tensorflow_dataset = ds.tensorflow_dataset.map(get_map_to_supervised(config))
tensorflow_dataset = ds.tensorflow_dataset.map(get_map_to_supervised(config), num_parallel_calls=tf.data.AUTOTUNE)

# use dynamic batching depending on the sequence length
if config["batching"]["bucket_by_sequence_length"]: