diff --git a/mlpf/flatiron/pipeline_evaluate_1GPU.slurm b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm
index 165690a7a..2ec8fe882 100644
--- a/mlpf/flatiron/pipeline_evaluate_1GPU.slurm
+++ b/mlpf/flatiron/pipeline_evaluate_1GPU.slurm
@@ -1,13 +1,13 @@
 #!/bin/sh

 # Walltime limit
-#SBATCH -t 03:00:00
+#SBATCH -t 1-00:00:00
 #SBATCH -N 1
 #SBATCH --exclusive
 #SBATCH --tasks-per-node=1
 #SBATCH -p gpu
-#SBATCH --gpus 1
-#SBATCH --constraint=a100
+#SBATCH --gpus-per-task=1
+#SBATCH --constraint=a100-80gb,ib

 # Job name
 #SBATCH -J pipeeval
@@ -21,14 +21,26 @@ echo "#################### Job submission script. #############################"
 cat $0
 echo "################# End of job submission script. #########################"

-module --force purge; module load modules/1.49-20211101
-module load slurm gcc nccl cuda/11.3.1 cudnn/8.2.0.53-11.3 openmpi/4.0.6
+module --force purge; module load modules/2.1.1-20230405
+module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7
+
 nvidia-smi

 source ~/miniconda3/bin/activate tf2
 which python3
 python3 --version

+# make tensorflow find cupti (needed for profiling)
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/
+
+train_dir="experiments/hits_bs16_clic-hits_20230508_064411_129925_RESUMED2_clic-hits_20230522_170633_350485.workergpu064"
+
 echo 'Starting evaluation.'
-CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate -c $1 -t $2
+CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py evaluate \
+    --train-dir $train_dir
 echo 'Evaluation done.'
+
+echo 'Starting plotting.'
+CUDA_VISIBLE_DEVICES=0 python3 mlpf/pipeline.py plots \
+    --train-dir $train_dir
+echo 'Plotting done.'
diff --git a/mlpf/flatiron/pipeline_train_4GPUs.slurm b/mlpf/flatiron/pipeline_train_4GPUs.slurm
index c8298c7ad..88ea5f711 100644
--- a/mlpf/flatiron/pipeline_train_4GPUs.slurm
+++ b/mlpf/flatiron/pipeline_train_4GPUs.slurm
@@ -1,13 +1,13 @@
 #!/bin/sh

 # Walltime limit
-#SBATCH -t 168:00:00
+#SBATCH -t 7-0:00:00
 #SBATCH -N 1
 #SBATCH --exclusive
 #SBATCH --tasks-per-node=1
 #SBATCH -p gpu
 #SBATCH --gpus-per-task=4
-#SBATCH --constraint=a100-80gb
+#SBATCH --constraint=a100-80gb,ib

 # Job name
 #SBATCH -J pipetrain
@@ -21,19 +21,26 @@ echo "#################### Job submission script. #############################"
 cat $0
 echo "################# End of job submission script. #########################"

-# module --force purge; module load modules/2.1-alpha1
-# module load slurm gcc/11.3.0 nccl cuda/11.8.0 cudnn/8.4.0.27-11.6
-module --force purge; module load modules/2.0-20220630
-module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
+# module --force purge; module load modules/2.0-20220630
+# module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
+
+module --force purge; module load modules/2.1.1-20230405
+module load slurm gcc cmake nccl cuda/11.8.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7
+
 nvidia-smi

 source ~/miniconda3/bin/activate tf2
 which python3
 python3 --version

+# make tensorflow find cupti (needed for profiling)
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/mnt/sw/nix/store/3xpm36w2kcri3j1m5j15hg025my1p4kx-cuda-11.8.0/extras/CUPTI/lib64/
+
+export TF_GPU_THREAD_MODE=gpu_private
+export TF_GPU_THREAD_COUNT=2

 echo 'Starting training.'
 # Run the training of the base GNN model using e.g. 4 GPUs in a data-parallel mode
 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c $1 -p $2 \
-    --seeds --comet-exp-name particleflow-tf-gen
+    --seeds --comet-exp-name particleflow-tf-clic
 echo 'Training done.'
diff --git a/mlpf/flatiron/pipeline_train_8GPUs.slurm b/mlpf/flatiron/pipeline_train_8GPUs.slurm
new file mode 100644
index 000000000..7474edba3
--- /dev/null
+++ b/mlpf/flatiron/pipeline_train_8GPUs.slurm
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+# Walltime limit
+#SBATCH -t 2-00:00:00
+#SBATCH -N 1
+#SBATCH --exclusive
+#SBATCH --tasks-per-node=1
+#SBATCH -p gpu
+#SBATCH --gpus-per-task=8
+#SBATCH --constraint=h100,ib
+
+# Job name
+#SBATCH -J pipetrain
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. #########################"
+
+# module --force purge; module load modules/2.0-20220630
+# module load slurm gcc cmake/3.22.3 nccl cuda/11.4.4 cudnn/8.2.4.15-11.4 openmpi/4.0.7
+
+module --force purge; module load modules/2.1.1-20230405
+module load slurm gcc cmake nccl cuda/12.0.0 cudnn/8.4.0.27-11.6 openmpi/4.0.7
+
+nvidia-smi
+
+source ~/miniconda3/bin/activate tf2
+which python3
+python3 --version
+
+
+echo 'Starting training.'
+# Run the training of the base GNN model using e.g. 8 GPUs in a data-parallel mode
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 mlpf/pipeline.py train -c $1 -p $2 \
+    --seeds --comet-exp-name particleflow-tf-clic
+echo 'Training done.'
diff --git a/mlpf/flatiron/pipeline_train_8GPUs_singularity.slurm b/mlpf/flatiron/pipeline_train_8GPUs_singularity.slurm
new file mode 100644
index 000000000..38aad26cd
--- /dev/null
+++ b/mlpf/flatiron/pipeline_train_8GPUs_singularity.slurm
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Walltime limit
+#SBATCH -t 0-04:00:00
+#SBATCH -N 1
+#SBATCH --exclusive
+#SBATCH --tasks-per-node=1
+#SBATCH -p gpu
+#SBATCH --gpus-per-task=8
+#SBATCH --constraint=h100,ib
+# #SBATCH --mem 256G
+
+# Job name
+#SBATCH -J pipetrain
+
+# Output and error logs
+#SBATCH -o logs_slurm/log_%x_%j.out
+#SBATCH -e logs_slurm/log_%x_%j.err
+
+# Add jobscript to job output
+echo "#################### Job submission script. #############################"
+cat $0
+echo "################# End of job submission script. #########################"
+
+export MODULEPATH=/mnt/home/gkrawezik/modules/rocky8:$MODULEPATH
+module load cuda/12 cudnn/cuda12 nccl/cuda12 singularity # these names are specific to gkrawezik's modules
+nvidia-smi
+
+# ensure CPU is keeping private threads for scheduling operations on the GPUs
+# https://www.tensorflow.org/guide/gpu_performance_analysis#2_gpu_host_thread_contention
+export TF_GPU_THREAD_MODE=gpu_private
+export TF_GPU_THREAD_COUNT=2
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+nvidia-smi
+
+echo 'Starting training.'
+singularity run --nv -B /mnt/ceph/users/ewulff/tensorflow_datasets,/mnt/ceph/users/ewulff/particleflow \
+    tensorflow_23.05-tf2-py3.sif \
+    python3 $PWD/mlpf/pipeline.py train -c $1 -p $2 \
+    --seeds --comet-exp-name particleflow-tf-clic --benchmark_dir exp_dir
+echo 'Training done.'
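Note on the TF_GPU_THREAD_MODE / TF_GPU_THREAD_COUNT exports above: they follow TensorFlow's GPU performance guide, which recommends reserving dedicated host threads for launching GPU kernels; the same settings are also applied inside mlpf/pipeline.py further below, together with on-demand GPU memory growth. The following is a minimal illustrative sketch of how the two settings combine, assuming TensorFlow 2.x (it is not code from this repository):

```python
import os

# Dedicated host threads for GPU kernel launches, per
# https://www.tensorflow.org/guide/gpu_performance_analysis#2_gpu_host_thread_contention
os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
os.environ["TF_GPU_THREAD_COUNT"] = "2"

import tensorflow as tf  # imported after the env vars so they take effect

# Allocate GPU memory on demand rather than reserving the whole device up front
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)
```

Exporting the variables in the job script, as the training scripts above do, guarantees they are in place before TensorFlow initializes the GPUs.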
diff --git a/mlpf/flatiron/raytune.sh b/mlpf/flatiron/raytune.sh
index 2dd709530..f8e1ad187 100755
--- a/mlpf/flatiron/raytune.sh
+++ b/mlpf/flatiron/raytune.sh
@@ -1,7 +1,7 @@
 #!/bin/bash

 #SBATCH -t 168:00:00
-#SBATCH -N 12
+#SBATCH -N 8
 #SBATCH --tasks-per-node=1
 #SBATCH -p gpu
 #SBATCH --constraint=a100-80gb,ib
@@ -78,5 +78,5 @@ echo All Ray workers started.

 #### call your code below
 python3 mlpf/pipeline.py raytune -c $1 -n $2 --cpus "${SLURM_CPUS_PER_TASK}" \
-    --gpus $num_gpus --seeds --comet-exp-name particleflow-raytune
+    --gpus $num_gpus --seeds --comet-exp-name particleflow-raytune # --comet-online
 exit
diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py
index 3ab0e2e32..239e984af 100644
--- a/mlpf/pipeline.py
+++ b/mlpf/pipeline.py
@@ -17,6 +17,7 @@ from datetime import datetime
 from functools import partial
 from pathlib import Path
+import ctypes

 import boost_histogram as bh
 import click
@@ -168,6 +169,15 @@ def train(
     # tf.debugging.enable_check_numerics()

+    # Configure GPU threads according to TensorFlow's best practices for optimal model performance
+    os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
+    os.environ["TF_GPU_THREAD_COUNT"] = "2"
+
+    # According to TensorFlow's best practices for optimal model performance, set GPU memory growth to True
+    physical_devices = tf.config.list_physical_devices("GPU")
+    for pd in physical_devices:
+        tf.config.experimental.set_memory_growth(pd, True)
+
     if seeds:
         random.seed(1234)
         np.random.seed(1234)
@@ -261,6 +271,17 @@ def train(
     with strategy.scope():
         model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights)

+    if num_gpus > 0:
+        # According to TensorFlow's best practices for optimal model performance,
+        # max out the L2 fetch granularity to 128 bytes when using NVIDIA GPUs
+        _libcudart = ctypes.CDLL("libcudart.so")
+        # Set device limit on the current device
+        # cudaLimitMaxL2FetchGranularity = 0x05
+        pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
+        _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
+        _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
+        assert pValue.contents.value == 128
+
     with strategy.scope():
         callbacks = prepare_callbacks(
             config,
@@ -304,8 +325,8 @@ def train(
             model.normalizer.variance = tf.convert_to_tensor(cache["variance"])

         model.fit(
-            ds_train.tensorflow_dataset.repeat(),
-            validation_data=ds_test.tensorflow_dataset.repeat(),
+            ds_train.tensorflow_dataset.repeat().prefetch(tf.data.AUTOTUNE),
+            validation_data=ds_test.tensorflow_dataset.repeat().prefetch(tf.data.AUTOTUNE),
             epochs=config["setup"]["num_epochs"],
             callbacks=callbacks,
             steps_per_epoch=ds_train.num_steps(),
@@ -705,6 +726,8 @@ def raytune(
     from raytune.search_space import raytune_num_samples, search_space
     from raytune.utils import get_raytune_schedule, get_raytune_search_alg

+    os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1" # don't crash if a metric is missing
+
     if seeds:
         # Set seeds for reproducibility
         random.seed(1234)
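For context on the ctypes block added to train() above: 0x05 is CUDA's cudaLimitMaxL2FetchGranularity, so the call asks the driver for a 128-byte L2 fetch granularity on the current device and then reads the limit back. Below is a standalone sketch of the same technique that also checks the CUDA return codes; it assumes libcudart.so is resolvable through the loaded CUDA module and is an illustration, not the repository's code:

```python
import ctypes

CUDA_SUCCESS = 0
CUDA_LIMIT_MAX_L2_FETCH_GRANULARITY = ctypes.c_int(0x05)


def set_l2_fetch_granularity(nbytes=128):
    # Request an L2 fetch granularity of `nbytes` on the current GPU and
    # return the value the driver actually reports back.
    libcudart = ctypes.CDLL("libcudart.so")
    err = libcudart.cudaDeviceSetLimit(CUDA_LIMIT_MAX_L2_FETCH_GRANULARITY, ctypes.c_size_t(nbytes))
    assert err == CUDA_SUCCESS, f"cudaDeviceSetLimit failed with error code {err}"
    value = ctypes.c_size_t()
    err = libcudart.cudaDeviceGetLimit(ctypes.byref(value), CUDA_LIMIT_MAX_L2_FETCH_GRANULARITY)
    assert err == CUDA_SUCCESS, f"cudaDeviceGetLimit failed with error code {err}"
    return value.value
```

Calling set_l2_fetch_granularity(128) on an NVIDIA GPU should return 128, mirroring the assert in the diff.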
diff --git a/mlpf/raytune/search_space.py b/mlpf/raytune/search_space.py
index c77cc88b4..8509ce9b2 100644
--- a/mlpf/raytune/search_space.py
+++ b/mlpf/raytune/search_space.py
@@ -1,7 +1,7 @@
-from ray.tune import grid_search # choice, loguniform, quniform
+from ray.tune import choice # grid_search, choice, loguniform, quniform

-raytune_num_samples = 1 # Number of random samples to draw from search space. Set to 1 for grid search.
-samp = grid_search
+raytune_num_samples = 300 # Number of random samples to draw from search space. Set to 1 for grid search.
+samp = choice

 # search_space = {
 #     # Optimizer parameters
 #     "lr": samp([1e-4, 1e-3]),
@@ -73,14 +73,48 @@
 #     # "mask_reg_cls0": choice([False, True]),
 # }

-# onecycle scan
+# # onecycle scan
+# search_space = {
+#     # "lr": samp([1e-4, 1e-3, 1e-2]),
+#     # "batch_size_physical": samp([24, 40]),
+#     "batch_multiplier": samp([1, 5, 10]),
+#     # "model": samp(["gnn_dense", "transformer"]),
+#     # "lr_schedule": samp(["none", "cosinedecay", "onecycle"]),
+#     # "optimizer": samp(["pcgrad_adam", "adam", "sgd"]),
+# }
+
+# transformer scan
+# search_space = {
+#     # optimizer parameters
+#     "lr": samp([1e-5, 1e-4, 1e-3]),
+#     "batch_multiplier": samp([10, 20, 40]),
+#     # model arch parameters
+#     "num_layers_encoder": samp([1, 2, 3, 4]), # default is 1
+#     "num_layers_decoder_reg": samp([1, 2, 3, 4]), # default is 1
+#     "num_layers_decoder_cls": samp([1, 2, 3, 4]), # default is 1
+#     "hidden_dim": samp([32, 64, 128]), # default is 64
+#     "num_heads": samp([8, 16, 32, 64]), # default is 16
+#     "num_random_features": samp([16, 32, 64, 128]), # default is 32
+#     # output_decoding parameters
+#     "out_hidden_dim": samp([128, 256, 512]), # default is ~256
+#     "out_num_layers": samp([1, 2, 3, 4]), # default is ~2
+# }
+
+# gnn scan
 search_space = {
-    # "lr": samp([1e-4, 1e-3, 1e-2]),
-    # "batch_size_physical": samp([24, 40]),
-    "batch_multiplier": samp([1, 5, 10]),
-    # "model": samp(["gnn_dense", "transformer"]),
-    # "lr_schedule": samp(["none", "cosinedecay", "onecycle"]),
-    # "optimizer": samp(["pcgrad_adam", "adam", "sgd"]),
+    # optimizer parameters
+    "lr": samp([1e-4, 1e-3, 1e-2]),
+    # "batch_multiplier": samp([10, 20, 40]),
+    # model arch parameters
+    "num_graph_layers_id": samp([1, 2, 3, 4, 5, 6]),
+    "num_graph_layers_reg": samp([1, 2, 3, 4, 5, 6]),
+    "bin_size": samp([16, 32, 64, 128, 256]),
+    "output_dim": samp([64, 128, 256]),
+    "ffn_dist_hidden_dim": samp([64, 128, 256]),
+    "ffn_dist_num_layers": samp([1, 2, 3, 4, 5]),
+    # output_decoding parameters
+    "out_hidden_dim": samp([64, 128, 256, 512]),
+    "out_num_layers": samp([1, 2, 3, 4, 5]),
 }
@@ -217,4 +251,18 @@ def set_raytune_search_parameters(search_space, config):
         config["parameters"]["output_decoding"]["phi_num_layers"] = search_space["out_num_layers"]
         config["parameters"]["output_decoding"]["energy_num_layers"] = search_space["out_num_layers"]

+    # transformer specific parameters
+    if "num_layers_encoder" in search_space.keys():
+        config["parameters"]["num_layers_encoder"] = search_space["num_layers_encoder"]
+    if "num_layers_decoder_reg" in search_space.keys():
+        config["parameters"]["num_layers_decoder_reg"] = search_space["num_layers_decoder_reg"]
+    if "num_layers_decoder_cls" in search_space.keys():
+        config["parameters"]["num_layers_decoder_cls"] = search_space["num_layers_decoder_cls"]
+    if "hidden_dim" in search_space.keys():
+        config["parameters"]["hidden_dim"] = search_space["hidden_dim"]
+    if "num_heads" in search_space.keys():
+        config["parameters"]["num_heads"] = search_space["num_heads"]
+    if "num_random_features" in search_space.keys():
+        config["parameters"]["num_random_features"] = search_space["num_random_features"]
+
     return config
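The search_space.py change above replaces the exhaustive grid_search sampler with choice, so Ray Tune now draws raytune_num_samples = 300 random configurations from the listed values instead of enumerating every grid point. A self-contained sketch of that sampling behaviour is shown below; the trainable is a stand-in objective, not the MLPF training pipeline, and the exact reporting API depends on the Ray version in use:

```python
from ray import tune

# every trial receives one random combination drawn from these lists
search_space = {
    "lr": tune.choice([1e-4, 1e-3, 1e-2]),
    "bin_size": tune.choice([16, 32, 64, 128, 256]),
}


def trainable(config):
    # stand-in objective; the real pipeline trains MLPF and reports validation metrics
    tune.report(val_loss=config["lr"] * config["bin_size"])


# with choice(), num_samples is the number of random draws (300 in this diff);
# with grid_search(), num_samples=1 enumerates each grid point exactly once
tune.run(trainable, config=search_space, num_samples=300)
```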
diff --git a/mlpf/tfmodel/model_setup.py b/mlpf/tfmodel/model_setup.py
index 95ddbc437..3b634fdbf 100644
--- a/mlpf/tfmodel/model_setup.py
+++ b/mlpf/tfmodel/model_setup.py
@@ -166,7 +166,7 @@ def prepare_callbacks(
         benchmark_dir = outdir
         if config["dataset"]["schema"] == "delphes":
             bmk_bs = config["train_test_datasets"]["delphes"]["batch_per_gpu"]
-        elif config["dataset"]["schema"] == "cms":
+        elif (config["dataset"]["schema"] == "cms") or (config["dataset"]["schema"] == "clic"):
             assert (
                 len(config["train_test_datasets"]) == 1
             ), "Expected exactly 1 key, physical OR delphes, \
@@ -176,8 +176,8 @@
             bmk_bs = config["train_test_datasets"]["physical"]["batch_per_gpu"]
         else:
             raise ValueError(
-                "Benchmark callback only supports delphes or \
-                cms dataset schema. {}".format(
+                "Benchmark callback only supports delphes, \
+                cms or clic dataset schema. {}".format(
                     config["dataset"]["schema"]
                 )
             )
@@ -232,7 +232,9 @@ def get_checkpoint_history_callback(outdir, config, dataset, comet_experiment, h
         write_graph=False,
         write_images=False,
         update_freq="batch",
-        # profile_batch=(50,100),
+        profile_batch=config["callbacks"]["tensorboard"]["profile_batch"]
+        if "profile_batch" in config["callbacks"]["tensorboard"].keys()
+        else 0,
         dump_history=config["callbacks"]["tensorboard"]["dump_history"],
     )
     # Change the class name of CustomTensorBoard TensorBoard to make keras_tuner recognise it
diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py
index aa382a7e1..dd95416cb 100644
--- a/mlpf/tfmodel/utils.py
+++ b/mlpf/tfmodel/utils.py
@@ -221,7 +221,7 @@ def get_strategy(num_cpus=None):
     num_batches_multiplier = 1
     if num_gpus > 1:
         num_batches_multiplier = num_gpus
-        logging.info("Multiple GPUs detected, num_batces_multiplier={}".format(num_batches_multiplier))
+        logging.info("Multiple GPUs detected, num_batches_multiplier={}".format(num_batches_multiplier))

     return strategy, num_gpus, num_batches_multiplier
@@ -368,7 +368,7 @@ def load_and_interleave(
 ):
     datasets = [mlpf_dataset_from_config(ds_name, config, split, max_events) for ds_name in dataset_names]
     ds = interleave_datasets(joint_dataset_name, split, datasets)
-    tensorflow_dataset = ds.tensorflow_dataset.map(get_map_to_supervised(config))
+    tensorflow_dataset = ds.tensorflow_dataset.map(get_map_to_supervised(config), num_parallel_calls=tf.data.AUTOTUNE)

     # use dynamic batching depending on the sequence length
     if config["batching"]["bucket_by_sequence_length"]:
diff --git a/requirements.txt b/requirements.txt
index b3d2f1257..217fec984 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,7 @@ networkx
 nevergrad
 notebook
 numba
+numpy==1.23.5 # later versions are incompatible with tf2onnx v1.14.0 (latest as of this commit)
 onnxruntime
 pandas
 papermill
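As a closing note, the num_parallel_calls=tf.data.AUTOTUNE added in utils.py and the .prefetch(tf.data.AUTOTUNE) added around model.fit in pipeline.py follow the standard tf.data input-pipeline recipe: parallelize the per-event mapping and keep the next batches ready while the GPUs work on the current one. A minimal sketch of that layout on a toy dataset (illustrative only, not the repository's dataset code):

```python
import tensorflow as tf

ds = tf.data.Dataset.range(1_000)
# parallelize the supervised mapping, as in load_and_interleave
ds = ds.map(lambda x: (x, 2 * x), num_parallel_calls=tf.data.AUTOTUNE)
# batch, repeat for multi-epoch training, and prefetch so host-side preparation
# overlaps with device compute, as in the model.fit call
ds = ds.batch(32).repeat().prefetch(tf.data.AUTOTUNE)
```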