From b1d13ec590abda8cedd697867e230ece6ce3261d Mon Sep 17 00:00:00 2001 From: Bartol Freskura Date: Fri, 7 Jun 2024 16:47:56 +0200 Subject: [PATCH 1/6] Add inference code --- src/inference.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 src/inference.py diff --git a/src/inference.py b/src/inference.py new file mode 100644 index 0000000..45432e4 --- /dev/null +++ b/src/inference.py @@ -0,0 +1,145 @@ +import argparse +from pprint import pprint + +import torch +import torch.utils.benchmark as benchmark + +ARCHITECTURES = { + "resnet50": "resnet50", + "convnext": "convnext_base", + "vgg16": "vgg16", + "efficient_net_v2": "efficientnet_v2_m", + "mobilenet_v3": "mobilenet_v3_large", + "resnext50": "resnext50_32x4d", + "swin": "swin_b", + "vit": "vit_b_16", + "ssd_vgg16": "ssd300_vgg16", + "fasterrcnn_resnet50_v2": "fasterrcnn_resnet50_fpn_v2", +} + + +def benchmark_inference( + stmt: str, + setup: str, + input: torch.Tensor, + n_runs=100, + num_threads: int = 1, +): + """ + Benchmark a model using torch.utils.benchmark. + + When evaluating model speed in MP/s only the video height, width and batch size are taken into + account. The number of channels and sequence length are ignored. Speed evaluation measures + how fast can we process an arbitrary input video so channels and sequence length don't + affect the model computation speed. + """ + + timer = benchmark.Timer( + stmt=stmt, + setup=setup, + num_threads=num_threads, + globals={"x": input}, + ) + + print( + f"Running benchmark on sample of {n_runs} runs with {num_threads} thread(s)..." 
+ ) + result = timer.timeit(n_runs) + + batch, height, width = input.size(0), input.size(-2), input.size(-1) + total_pixels = batch * width * height + + print(f"Batch size: {batch}") + print(f"Input resolution: {width}x{height} pixels\n") + + mean_per_batch = result.mean + median_per_batch = result.median + + mean_speed_mpx = (total_pixels / 1e6) / mean_per_batch + median_speed_mpx = (total_pixels / 1e6) / median_per_batch + + print(f"Mean time per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s") + print( + f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n" + ) + + print(f"Model mean speed in megapixels per second: {mean_speed_mpx:.3f} MP/s") + print(f"Model median speed in megapixels per second: {median_speed_mpx:.3f} MP/s\n") + + +def main(args): + args_dict = vars(args) + print("Arguments:") + pprint(args_dict) + + if args.model.lower() not in ARCHITECTURES: + raise ValueError("Architecture not supported.") + + stmt = """ \ + with torch.inference_mode(): + out = model(x) + out = out.clamp(0, 1).float().cpu() + """ + + arch = ARCHITECTURES[args.model.lower()] + setup = f"from torchvision.models import {arch}; model = {arch}(); model.eval()" + + input_shape = [3, args.height, args.width] + precision = torch.float16 if args.precision == "16" else torch.float32 + + x = torch.rand(*input_shape, dtype=precision) + x = x.cuda(0, non_blocking=True) + setup = f"{setup}; model.cuda(0)" + + if args.precision == "16": + setup = f"{setup}; model.half()" + + benchmark_inference( + stmt=stmt, + setup=setup, + input=x, + n_runs=args.n_iters, + num_threads=args.num_workers, + ) + + +if __name__ == "__main__": + if not torch.cuda.is_available(): + raise ValueError("CUDA device not found on this system.") + else: + print("CUDA Device Name:", torch.cuda.get_device_name(0)) + print("CUDNN version:", torch.backends.cudnn.version()) + print( + "CUDA Device Total Memory: " + + f"{(torch.cuda.get_device_properties(0).total_memory / 1e9):.2f} 
GB", + ) + + parser = argparse.ArgumentParser(description="Benchmark CV models training on GPU.") + + parser.add_argument("--batch-size", type=int, required=True) + parser.add_argument( + "--n-iters", + type=int, + default=100, + help="Number of training iterations to benchmark for. One iteration = one batch update", + ) + parser.add_argument("--precision", choices=["32", "16"], default="16") + parser.add_argument("--n-workers", type=int, default=1) + + parser.add_argument("--width", type=int, default=192, help="Input width") + parser.add_argument("--height", type=int, default=192, help="Input height") + + parser.add_argument( + "--model", + default="resnet50", + choices=list(ARCHITECTURES.keys()), + help="Architecture to benchmark.", + ) + parser.add_argument("--list-requirements", action="store_true") + + args = parser.parse_args() + + if args.n_iters <= 0: + raise ValueError("Number of iterations must be > 0") + + main(args=args) From f1aaafb4b46f350ba050d7629784d8bfaa1d73b5 Mon Sep 17 00:00:00 2001 From: Bartol Freskura Date: Fri, 7 Jun 2024 17:24:28 +0200 Subject: [PATCH 2/6] Fix bugs; explicit torch versions --- requirements.txt | 4 +++- src/inference.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index ecbf612..e92447c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ -lightning==2.1.1 \ No newline at end of file +lightning==2.1.1 +torch==2.3.1 +torchvision==0.18.1 diff --git a/src/inference.py b/src/inference.py index 45432e4..348507b 100644 --- a/src/inference.py +++ b/src/inference.py @@ -63,8 +63,12 @@ def benchmark_inference( f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n" ) - print(f"Model mean speed in megapixels per second: {mean_speed_mpx:.3f} MP/s") - print(f"Model median speed in megapixels per second: {median_speed_mpx:.3f} MP/s\n") + print( + f"Model mean throughoutput in megapixels per second: {mean_speed_mpx:.3f} MP/s" + 
) + print( + f"Model median throughoutput in megapixels per second: {median_speed_mpx:.3f} MP/s\n" + ) def main(args): @@ -84,7 +88,7 @@ def main(args): arch = ARCHITECTURES[args.model.lower()] setup = f"from torchvision.models import {arch}; model = {arch}(); model.eval()" - input_shape = [3, args.height, args.width] + input_shape = [args.batch_size, 3, args.height, args.width] precision = torch.float16 if args.precision == "16" else torch.float32 x = torch.rand(*input_shape, dtype=precision) @@ -99,7 +103,7 @@ def main(args): setup=setup, input=x, n_runs=args.n_iters, - num_threads=args.num_workers, + num_threads=args.n_workers, ) From 2bac7131330213e0f2a70019b0558172d0ab06f9 Mon Sep 17 00:00:00 2001 From: Bartol Freskura Date: Sat, 8 Jun 2024 18:18:05 +0200 Subject: [PATCH 3/6] Document inference; refactor; update packages --- README.md | 39 +++++++++++++-------- dockerfiles/cuda118/Dockerfile | 3 +- dockerfiles/cuda120/Dockerfile | 5 +-- requirements.txt | 6 ++-- src/inference.py | 62 ++++++++++++++++++++-------------- src/log.py | 11 ++++++ src/train.py | 10 ++---- 7 files changed, 82 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index e36ed59..d3dc561 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -

Tensorpix logo

@@ -38,15 +37,15 @@ You can use this benchmark repo to: Please open an issue if you need support for a new architecture. -* ResNet50 -* ConvNext (base) -* VGG16 -* Efficient Net v2 -* MobileNet V3 -* ResNeXt50 -* SWIN -* VIT -* UNet with ResNet50 backbone +- ResNet50 +- ConvNext (base) +- VGG16 +- Efficient Net v2 +- MobileNet V3 +- ResNeXt50 +- SWIN +- VIT +- UNet with ResNet50 backbone ## 📖 How to benchmark @@ -58,19 +57,31 @@ In order to run benchmark docker containers you must have the following installe - NVIDIA drivers. See [Versions](#versions) when choosing the docker image. - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - required in order to use CUDA inside docker containers +### Training vs Inference + +By default, the container will benchmark model training. If you want to benchmark model inference, append the `src.inference` to the docker run command. See examples below for more details. + ### Examples **Minimal** -`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models --batch-size 32` +`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32` **Advanced** -`docker run --rm --ipc=host --ulimit memlock=-1 --gpus '"device=0,1"' -v ./benchmarks:/workdir/benchmarks ghcr.io/tensorpix/benchmarking-cv-models --batch-size 32 --n-iters 1000 --warmup-steps 100 --model resnext50 --precision 16-mixed --width 320 --height 320` +`docker run --rm --ipc=host --ulimit memlock=-1 --gpus '"device=0,1"' -v ./benchmarks:/workdir/benchmarks ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32 --n-iters 1000 --warmup-steps 100 --model resnext50 --precision 16-mixed --width 320 --height 320` + +**Benchmark Inference** + +`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.inference --batch-size 32 --n-iters 1000 --model 
resnext50 --precision 16 --width 256 --height 256` + +**List all train options:** + +`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.train --help` -**List all options:** +**List all inference options:** -`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models --help` +`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.inference --help` ### How to select particular GPUs diff --git a/dockerfiles/cuda118/Dockerfile b/dockerfiles/cuda118/Dockerfile index c15bd9a..f7af36d 100644 --- a/dockerfiles/cuda118/Dockerfile +++ b/dockerfiles/cuda118/Dockerfile @@ -13,4 +13,5 @@ RUN pip3 install --no-cache-dir -r /tmp/requirements.txt --extra-index-url https COPY ./src /workdir/src WORKDIR /workdir -ENTRYPOINT [ "python3", "-m", "src.train" ] +ENTRYPOINT [ "python3", "-m" ] +CMD [ "src.train" ] diff --git a/dockerfiles/cuda120/Dockerfile b/dockerfiles/cuda120/Dockerfile index 89f0e3d..d008540 100644 --- a/dockerfiles/cuda120/Dockerfile +++ b/dockerfiles/cuda120/Dockerfile @@ -8,9 +8,10 @@ RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install rm -rf /var/lib/apt/lists/* COPY requirements.txt /tmp/requirements.txt -RUN pip3 install --no-cache-dir -r /tmp/requirements.txt +RUN pip3 install --no-cache-dir -r /tmp/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121 COPY ./src /workdir/src WORKDIR /workdir -ENTRYPOINT [ "python3", "-m", "src.train" ] +ENTRYPOINT [ "python3", "-m" ] +CMD [ "src.train" ] diff --git a/requirements.txt b/requirements.txt index 0417f66..ec0233d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -lightning==2.1.4 +lightning==2.2.5 protobuf==3.20.* segmentation-models-pytorch==0.3.3 six==1.16.0 -torch==2.1.2 -torchvision==0.16.2 +torch==2.3.1 +torchvision==0.18.1 diff --git a/src/inference.py b/src/inference.py index 348507b..224202d 100644 --- a/src/inference.py +++ b/src/inference.py @@ -1,9 +1,13 @@ import argparse -from pprint import pprint import torch import 
torch.utils.benchmark as benchmark +from src import log +from src.log import print_requirements + +logger = log.logger + ARCHITECTURES = { "resnet50": "resnet50", "convnext": "convnext_base", @@ -41,7 +45,7 @@ def benchmark_inference( globals={"x": input}, ) - print( + logger.info( f"Running benchmark on sample of {n_runs} runs with {num_threads} thread(s)..." ) result = timer.timeit(n_runs) @@ -49,8 +53,8 @@ def benchmark_inference( batch, height, width = input.size(0), input.size(-2), input.size(-1) total_pixels = batch * width * height - print(f"Batch size: {batch}") - print(f"Input resolution: {width}x{height} pixels\n") + logger.info(f"Batch size: {batch}") + logger.info(f"Input resolution: {width}x{height} pixels\n") mean_per_batch = result.mean median_per_batch = result.median @@ -58,23 +62,24 @@ def benchmark_inference( mean_speed_mpx = (total_pixels / 1e6) / mean_per_batch median_speed_mpx = (total_pixels / 1e6) / median_per_batch - print(f"Mean time per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s") - print( - f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n" + logger.info( + f"Mean throughoutput per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s" + ) + logger.info( + f"Median throughoutput per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n" ) - print( + logger.info( f"Model mean throughoutput in megapixels per second: {mean_speed_mpx:.3f} MP/s" ) - print( + logger.info( f"Model median throughoutput in megapixels per second: {median_speed_mpx:.3f} MP/s\n" ) def main(args): - args_dict = vars(args) - print("Arguments:") - pprint(args_dict) + if args.list_requirements: + print_requirements() if args.model.lower() not in ARCHITECTURES: raise ValueError("Architecture not supported.") @@ -92,8 +97,8 @@ def main(args): precision = torch.float16 if args.precision == "16" else torch.float32 x = torch.rand(*input_shape, dtype=precision) - x = x.cuda(0, non_blocking=True) - setup = f"{setup}; 
model.cuda(0)" + x = x.cuda(args.gpu_device_index, non_blocking=True) + setup = f"{setup}; model.cuda({args.gpu_device_index})" if args.precision == "16": setup = f"{setup}; model.half()" @@ -108,16 +113,6 @@ def main(args): if __name__ == "__main__": - if not torch.cuda.is_available(): - raise ValueError("CUDA device not found on this system.") - else: - print("CUDA Device Name:", torch.cuda.get_device_name(0)) - print("CUDNN version:", torch.backends.cudnn.version()) - print( - "CUDA Device Total Memory: " - + f"{(torch.cuda.get_device_properties(0).total_memory / 1e9):.2f} GB", - ) - parser = argparse.ArgumentParser(description="Benchmark CV models training on GPU.") parser.add_argument("--batch-size", type=int, required=True) @@ -129,9 +124,10 @@ def main(args): ) parser.add_argument("--precision", choices=["32", "16"], default="16") parser.add_argument("--n-workers", type=int, default=1) + parser.add_argument("--gpu-device-index", type=int, default=0) - parser.add_argument("--width", type=int, default=192, help="Input width") - parser.add_argument("--height", type=int, default=192, help="Input height") + parser.add_argument("--width", type=int, default=224, help="Input width") + parser.add_argument("--height", type=int, default=224, help="Input height") parser.add_argument( "--model", @@ -146,4 +142,18 @@ def main(args): if args.n_iters <= 0: raise ValueError("Number of iterations must be > 0") + logger.info("########## STARTING NEW INFERENCE BENCHMARK RUN ###########") + + if not torch.cuda.is_available(): + raise ValueError("CUDA device not found on this system.") + else: + logger.info( + f"CUDA Device Name: {torch.cuda.get_device_name(args.gpu_device_index)}" + ) + logger.info(f"CUDNN version: {torch.backends.cudnn.version()}") + logger.info( + "CUDA Device Total Memory: " + + f"{(torch.cuda.get_device_properties(args.gpu_device_index).total_memory / 1e9):.2f} GB" + ) + main(args=args) diff --git a/src/log.py b/src/log.py index 011552e..2259d22 100644 --- 
a/src/log.py +++ b/src/log.py @@ -1,5 +1,7 @@ import logging +from pip._internal.operations import freeze + def setup_custom_logger(name: str = "benchmark"): logger = logging.getLogger(name) @@ -14,3 +16,12 @@ def setup_custom_logger(name: str = "benchmark"): logger.setLevel(level=logging.DEBUG) return logger + + +def print_requirements(): + pkgs = freeze.freeze() + for pkg in pkgs: + logger.info(pkg) + + +logger = setup_custom_logger() diff --git a/src/train.py b/src/train.py index eaa23c8..03438a8 100644 --- a/src/train.py +++ b/src/train.py @@ -3,7 +3,6 @@ import segmentation_models_pytorch as smp import torch from lightning import Trainer -from pip._internal.operations import freeze from torch.utils.data import DataLoader from torchvision.models import ( convnext_base, @@ -19,9 +18,10 @@ from src import log from src.callbacks import BenchmarkCallback from src.data.in_memory_dataset import InMemoryDataset +from src.log import print_requirements from src.models.lightning_modules import LitClassification -logger = log.setup_custom_logger() +logger = log.logger ARCHITECTURES = { "resnet50": resnet50, @@ -38,12 +38,6 @@ } -def print_requirements(): - pkgs = freeze.freeze() - for pkg in pkgs: - logger.info(pkg) - - def main(args): if args.list_requirements: print_requirements() From 878243dccf1995cebf2ee0d0fc24379dceed1e70 Mon Sep 17 00:00:00 2001 From: Bartol Freskura Date: Mon, 10 Jun 2024 16:32:40 +0200 Subject: [PATCH 4/6] Clarify docs --- README.md | 2 +- dockerfiles/cuda118/Dockerfile | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index d3dc561..188b340 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ In order to run benchmark docker containers you must have the following installe ### Training vs Inference -By default, the container will benchmark model training. If you want to benchmark model inference, append the `src.inference` to the docker run command. See examples below for more details. 
+To benchmark model training, append `src.train` to the docker run command. To benchmark model inference, append `src.inference` instead. See the examples below for more details. ### Examples diff --git a/dockerfiles/cuda118/Dockerfile b/dockerfiles/cuda118/Dockerfile index f7af36d..e8f676d 100644 --- a/dockerfiles/cuda118/Dockerfile +++ b/dockerfiles/cuda118/Dockerfile @@ -14,4 +14,3 @@ COPY ./src /workdir/src WORKDIR /workdir ENTRYPOINT [ "python3", "-m" ] -CMD [ "src.train" ] From 751431fefacdf44d6fc23b9464b2821c3d037ee0 Mon Sep 17 00:00:00 2001 From: Bartol Freskura Date: Tue, 11 Jun 2024 10:15:48 +0200 Subject: [PATCH 5/6] Clarify docs; Remove clamp; default batch size --- src/inference.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/inference.py b/src/inference.py index 224202d..538e196 100644 --- a/src/inference.py +++ b/src/inference.py @@ -26,16 +26,16 @@ def benchmark_inference( stmt: str, setup: str, input: torch.Tensor, - n_runs=100, + n_runs: int = 100, num_threads: int = 1, ): """ Benchmark a model using torch.utils.benchmark. - When evaluating model speed in MP/s only the video height, width and batch size are taken into - account. The number of channels and sequence length are ignored. Speed evaluation measures - how fast can we process an arbitrary input video so channels and sequence length don't - affect the model computation speed. + When evaluating model throughput in MP/s, only the image height, width and batch size are taken into + account. The number of channels is ignored as it is fixed to 3 channels in most cases (RGB images). + Speed evaluation measures how fast we can process an arbitrary input image, so channels + don't affect the model computation speed. 
""" timer = benchmark.Timer( @@ -87,7 +87,7 @@ def main(args): stmt = """ \ with torch.inference_mode(): out = model(x) - out = out.clamp(0, 1).float().cpu() + out = out.float().cpu() """ arch = ARCHITECTURES[args.model.lower()] @@ -115,7 +115,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Benchmark CV models training on GPU.") - parser.add_argument("--batch-size", type=int, required=True) + parser.add_argument("--batch-size", type=int, required=True, default=1) parser.add_argument( "--n-iters", type=int, From 4a98dae0d2c4d2fc344b9c98d6f4eb2119069fc9 Mon Sep 17 00:00:00 2001 From: Bartol Freskura Date: Tue, 11 Jun 2024 17:41:16 +0200 Subject: [PATCH 6/6] Remove cmd from cuda 120 dockerfile --- dockerfiles/cuda120/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/dockerfiles/cuda120/Dockerfile b/dockerfiles/cuda120/Dockerfile index d008540..2b112e1 100644 --- a/dockerfiles/cuda120/Dockerfile +++ b/dockerfiles/cuda120/Dockerfile @@ -14,4 +14,3 @@ COPY ./src /workdir/src WORKDIR /workdir ENTRYPOINT [ "python3", "-m" ] -CMD [ "src.train" ]