From b1d13ec590abda8cedd697867e230ece6ce3261d Mon Sep 17 00:00:00 2001
From: Bartol Freskura <freskura.bartol@gmail.com>
Date: Fri, 7 Jun 2024 16:47:56 +0200
Subject: [PATCH 1/6] Add inference code

---
 src/inference.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 src/inference.py

diff --git a/src/inference.py b/src/inference.py
new file mode 100644
index 0000000..45432e4
--- /dev/null
+++ b/src/inference.py
@@ -0,0 +1,145 @@
+import argparse
+from pprint import pprint
+
+import torch
+import torch.utils.benchmark as benchmark
+
+ARCHITECTURES = {
+    "resnet50": "resnet50",
+    "convnext": "convnext_base",
+    "vgg16": "vgg16",
+    "efficient_net_v2": "efficientnet_v2_m",
+    "mobilenet_v3": "mobilenet_v3_large",
+    "resnext50": "resnext50_32x4d",
+    "swin": "swin_b",
+    "vit": "vit_b_16",
+    "ssd_vgg16": "ssd300_vgg16",
+    "fasterrcnn_resnet50_v2": "fasterrcnn_resnet50_fpn_v2",
+}
+
+
+def benchmark_inference(
+    stmt: str,
+    setup: str,
+    input: torch.Tensor,
+    n_runs=100,
+    num_threads: int = 1,
+):
+    """
+    Benchmark a model using torch.utils.benchmark.
+
+    When evaluating model speed in MP/s only the video height, width and batch size are taken into
+    account. The number of channels and sequence length are ignored. Speed evaluation measures
+    how fast can we process an arbitrary input video so channels and sequence length don't
+    affect the model computation speed.
+    """
+
+    timer = benchmark.Timer(
+        stmt=stmt,
+        setup=setup,
+        num_threads=num_threads,
+        globals={"x": input},
+    )
+
+    print(
+        f"Running benchmark on sample of {n_runs} runs with {num_threads} thread(s)..."
+    )
+    result = timer.timeit(n_runs)
+
+    batch, height, width = input.size(0), input.size(-2), input.size(-1)
+    total_pixels = batch * width * height
+
+    print(f"Batch size: {batch}")
+    print(f"Input resolution: {width}x{height} pixels\n")
+
+    mean_per_batch = result.mean
+    median_per_batch = result.median
+
+    mean_speed_mpx = (total_pixels / 1e6) / mean_per_batch
+    median_speed_mpx = (total_pixels / 1e6) / median_per_batch
+
+    print(f"Mean time per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s")
+    print(
+        f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n"
+    )
+
+    print(f"Model mean speed in megapixels per second: {mean_speed_mpx:.3f} MP/s")
+    print(f"Model median speed in megapixels per second: {median_speed_mpx:.3f} MP/s\n")
+
+
+def main(args):
+    args_dict = vars(args)
+    print("Arguments:")
+    pprint(args_dict)
+
+    if args.model.lower() not in ARCHITECTURES:
+        raise ValueError("Architecture not supported.")
+
+    stmt = """ \
+    with torch.inference_mode():
+        out = model(x)
+        out = out.clamp(0, 1).float().cpu()
+    """
+
+    arch = ARCHITECTURES[args.model.lower()]
+    setup = f"from torchvision.models import {arch}; model = {arch}(); model.eval()"
+
+    input_shape = [3, args.height, args.width]
+    precision = torch.float16 if args.precision == "16" else torch.float32
+
+    x = torch.rand(*input_shape, dtype=precision)
+    x = x.cuda(0, non_blocking=True)
+    setup = f"{setup}; model.cuda(0)"
+
+    if args.precision == "16":
+        setup = f"{setup}; model.half()"
+
+    benchmark_inference(
+        stmt=stmt,
+        setup=setup,
+        input=x,
+        n_runs=args.n_iters,
+        num_threads=args.num_workers,
+    )
+
+
+if __name__ == "__main__":
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA device not found on this system.")
+    else:
+        print("CUDA Device Name:", torch.cuda.get_device_name(0))
+        print("CUDNN version:", torch.backends.cudnn.version())
+        print(
+            "CUDA Device Total Memory: "
+            + f"{(torch.cuda.get_device_properties(0).total_memory / 1e9):.2f} GB",
+        )
+
+    parser = argparse.ArgumentParser(description="Benchmark CV models training on GPU.")
+
+    parser.add_argument("--batch-size", type=int, required=True)
+    parser.add_argument(
+        "--n-iters",
+        type=int,
+        default=100,
+        help="Number of inference iterations to benchmark for. One iteration = one batch forward pass",
+    )
+    parser.add_argument("--precision", choices=["32", "16"], default="16")
+    parser.add_argument("--n-workers", type=int, default=1)
+
+    parser.add_argument("--width", type=int, default=192, help="Input width")
+    parser.add_argument("--height", type=int, default=192, help="Input height")
+
+    parser.add_argument(
+        "--model",
+        default="resnet50",
+        choices=list(ARCHITECTURES.keys()),
+        help="Architecture to benchmark.",
+    )
+    parser.add_argument("--list-requirements", action="store_true")
+
+    args = parser.parse_args()
+
+    if args.n_iters <= 0:
+        raise ValueError("Number of iterations must be > 0")
+
+    main(args=args)

From f1aaafb4b46f350ba050d7629784d8bfaa1d73b5 Mon Sep 17 00:00:00 2001
From: Bartol Freskura <freskura.bartol@gmail.com>
Date: Fri, 7 Jun 2024 17:24:28 +0200
Subject: [PATCH 2/6] Fix bugs; explicit torch versions

---
 requirements.txt |  4 +++-
 src/inference.py | 12 ++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ecbf612..e92447c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
-lightning==2.1.1
\ No newline at end of file
+lightning==2.1.1
+torch==2.3.1
+torchvision==0.18.1
diff --git a/src/inference.py b/src/inference.py
index 45432e4..348507b 100644
--- a/src/inference.py
+++ b/src/inference.py
@@ -63,8 +63,12 @@ def benchmark_inference(
         f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n"
     )
 
-    print(f"Model mean speed in megapixels per second: {mean_speed_mpx:.3f} MP/s")
-    print(f"Model median speed in megapixels per second: {median_speed_mpx:.3f} MP/s\n")
+    print(
+        f"Model mean throughoutput in megapixels per second: {mean_speed_mpx:.3f} MP/s"
+    )
+    print(
+        f"Model median throughoutput in megapixels per second: {median_speed_mpx:.3f} MP/s\n"
+    )
 
 
 def main(args):
@@ -84,7 +88,7 @@ def main(args):
     arch = ARCHITECTURES[args.model.lower()]
     setup = f"from torchvision.models import {arch}; model = {arch}(); model.eval()"
 
-    input_shape = [3, args.height, args.width]
+    input_shape = [args.batch_size, 3, args.height, args.width]
     precision = torch.float16 if args.precision == "16" else torch.float32
 
     x = torch.rand(*input_shape, dtype=precision)
@@ -99,7 +103,7 @@ def main(args):
         setup=setup,
         input=x,
         n_runs=args.n_iters,
-        num_threads=args.num_workers,
+        num_threads=args.n_workers,
     )
 
 

From 2bac7131330213e0f2a70019b0558172d0ab06f9 Mon Sep 17 00:00:00 2001
From: Bartol Freskura <freskura.bartol@gmail.com>
Date: Sat, 8 Jun 2024 18:18:05 +0200
Subject: [PATCH 3/6] Document inference; refactor; update packages

---
 README.md                      | 39 +++++++++++++--------
 dockerfiles/cuda118/Dockerfile |  3 +-
 dockerfiles/cuda120/Dockerfile |  5 +--
 requirements.txt               |  6 ++--
 src/inference.py               | 62 ++++++++++++++++++++--------------
 src/log.py                     | 11 ++++++
 src/train.py                   | 10 ++----
 7 files changed, 82 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index e36ed59..d3dc561 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
-
 <p align="center" >
   <img width="400" src="https://cdn.tensorpix.ai/TensorPix-Logo-color.svg" alt="Tensorpix logo"/>
 </p>
@@ -38,15 +37,15 @@ You can use this benchmark repo to:
 
 Please open an issue if you need support for a new architecture.
 
-* ResNet50
-* ConvNext (base)
-* VGG16
-* Efficient Net v2
-* MobileNet V3
-* ResNeXt50
-* SWIN
-* VIT
-* UNet with ResNet50 backbone
+- ResNet50
+- ConvNext (base)
+- VGG16
+- Efficient Net v2
+- MobileNet V3
+- ResNeXt50
+- SWIN
+- VIT
+- UNet with ResNet50 backbone
 
 ## 📖 How to benchmark
 
@@ -58,19 +57,31 @@ In order to run benchmark docker containers you must have the following installe
 - NVIDIA drivers. See [Versions](#versions) when choosing the docker image.
 - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - required in order to use CUDA inside docker containers
 
+### Training vs Inference
+
+By default, the container will benchmark model training. If you want to benchmark model inference, append the `src.inference` to the docker run command. See examples below for more details.
+
 ### Examples
 
 **Minimal**
 
-`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models --batch-size 32`
+`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32`
 
 **Advanced**
 
-`docker run --rm --ipc=host --ulimit memlock=-1 --gpus '"device=0,1"' -v ./benchmarks:/workdir/benchmarks ghcr.io/tensorpix/benchmarking-cv-models --batch-size 32 --n-iters 1000 --warmup-steps 100 --model resnext50 --precision 16-mixed --width 320 --height 320`
+`docker run --rm --ipc=host --ulimit memlock=-1 --gpus '"device=0,1"' -v ./benchmarks:/workdir/benchmarks ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32 --n-iters 1000 --warmup-steps 100 --model resnext50 --precision 16-mixed --width 320 --height 320`
+
+**Benchmark Inference**
+
+`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.inference --batch-size 32 --n-iters 1000 --model resnext50 --precision 16 --width 256 --height 256`
+
+**List all train options:**
+
+`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.train --help`
 
-**List all options:**
+**List all inference options:**
 
-`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models --help`
+`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.inference --help`
 
 ### How to select particular GPUs
 
diff --git a/dockerfiles/cuda118/Dockerfile b/dockerfiles/cuda118/Dockerfile
index c15bd9a..f7af36d 100644
--- a/dockerfiles/cuda118/Dockerfile
+++ b/dockerfiles/cuda118/Dockerfile
@@ -13,4 +13,5 @@ RUN pip3 install --no-cache-dir -r /tmp/requirements.txt --extra-index-url https
 COPY ./src /workdir/src
 WORKDIR /workdir
 
-ENTRYPOINT [ "python3", "-m", "src.train" ]
+ENTRYPOINT [ "python3", "-m" ]
+CMD [ "src.train" ]
diff --git a/dockerfiles/cuda120/Dockerfile b/dockerfiles/cuda120/Dockerfile
index 89f0e3d..d008540 100644
--- a/dockerfiles/cuda120/Dockerfile
+++ b/dockerfiles/cuda120/Dockerfile
@@ -8,9 +8,10 @@ RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install
 	rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt /tmp/requirements.txt
-RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121
 
 COPY ./src /workdir/src
 WORKDIR /workdir
 
-ENTRYPOINT [ "python3", "-m", "src.train" ]
+ENTRYPOINT [ "python3", "-m" ]
+CMD [ "src.train" ]
diff --git a/requirements.txt b/requirements.txt
index 0417f66..ec0233d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-lightning==2.1.4
+lightning==2.2.5
 protobuf==3.20.*
 segmentation-models-pytorch==0.3.3
 six==1.16.0
-torch==2.1.2
-torchvision==0.16.2
+torch==2.3.1
+torchvision==0.18.1
diff --git a/src/inference.py b/src/inference.py
index 348507b..224202d 100644
--- a/src/inference.py
+++ b/src/inference.py
@@ -1,9 +1,13 @@
 import argparse
-from pprint import pprint
 
 import torch
 import torch.utils.benchmark as benchmark
 
+from src import log
+from src.log import print_requirements
+
+logger = log.logger
+
 ARCHITECTURES = {
     "resnet50": "resnet50",
     "convnext": "convnext_base",
@@ -41,7 +45,7 @@ def benchmark_inference(
         globals={"x": input},
     )
 
-    print(
+    logger.info(
         f"Running benchmark on sample of {n_runs} runs with {num_threads} thread(s)..."
     )
     result = timer.timeit(n_runs)
@@ -49,8 +53,8 @@ def benchmark_inference(
     batch, height, width = input.size(0), input.size(-2), input.size(-1)
     total_pixels = batch * width * height
 
-    print(f"Batch size: {batch}")
-    print(f"Input resolution: {width}x{height} pixels\n")
+    logger.info(f"Batch size: {batch}")
+    logger.info(f"Input resolution: {width}x{height} pixels\n")
 
     mean_per_batch = result.mean
     median_per_batch = result.median
@@ -58,23 +62,24 @@ def benchmark_inference(
     mean_speed_mpx = (total_pixels / 1e6) / mean_per_batch
     median_speed_mpx = (total_pixels / 1e6) / median_per_batch
 
-    print(f"Mean time per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s")
-    print(
-        f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n"
+    logger.info(
+        f"Mean time per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s"
+    )
+    logger.info(
+        f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n"
     )
 
-    print(
+    logger.info(
         f"Model mean throughoutput in megapixels per second: {mean_speed_mpx:.3f} MP/s"
     )
-    print(
+    logger.info(
         f"Model median throughoutput in megapixels per second: {median_speed_mpx:.3f} MP/s\n"
     )
 
 
 def main(args):
-    args_dict = vars(args)
-    print("Arguments:")
-    pprint(args_dict)
+    if args.list_requirements:
+        print_requirements()
 
     if args.model.lower() not in ARCHITECTURES:
         raise ValueError("Architecture not supported.")
@@ -92,8 +97,8 @@ def main(args):
     precision = torch.float16 if args.precision == "16" else torch.float32
 
     x = torch.rand(*input_shape, dtype=precision)
-    x = x.cuda(0, non_blocking=True)
-    setup = f"{setup}; model.cuda(0)"
+    x = x.cuda(args.gpu_device_index, non_blocking=True)
+    setup = f"{setup}; model.cuda({args.gpu_device_index})"
 
     if args.precision == "16":
         setup = f"{setup}; model.half()"
@@ -108,16 +113,6 @@ def main(args):
 
 
 if __name__ == "__main__":
-    if not torch.cuda.is_available():
-        raise ValueError("CUDA device not found on this system.")
-    else:
-        print("CUDA Device Name:", torch.cuda.get_device_name(0))
-        print("CUDNN version:", torch.backends.cudnn.version())
-        print(
-            "CUDA Device Total Memory: "
-            + f"{(torch.cuda.get_device_properties(0).total_memory / 1e9):.2f} GB",
-        )
-
     parser = argparse.ArgumentParser(description="Benchmark CV models training on GPU.")
 
     parser.add_argument("--batch-size", type=int, required=True)
@@ -129,9 +124,10 @@ def main(args):
     )
     parser.add_argument("--precision", choices=["32", "16"], default="16")
     parser.add_argument("--n-workers", type=int, default=1)
+    parser.add_argument("--gpu-device-index", type=int, default=0)
 
-    parser.add_argument("--width", type=int, default=192, help="Input width")
-    parser.add_argument("--height", type=int, default=192, help="Input height")
+    parser.add_argument("--width", type=int, default=224, help="Input width")
+    parser.add_argument("--height", type=int, default=224, help="Input height")
 
     parser.add_argument(
         "--model",
@@ -146,4 +142,18 @@ def main(args):
     if args.n_iters <= 0:
         raise ValueError("Number of iterations must be > 0")
 
+    logger.info("########## STARTING NEW INFERENCE BENCHMARK RUN ###########")
+
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA device not found on this system.")
+    else:
+        logger.info(
+            f"CUDA Device Name: {torch.cuda.get_device_name(args.gpu_device_index)}"
+        )
+        logger.info(f"CUDNN version: {torch.backends.cudnn.version()}")
+        logger.info(
+            "CUDA Device Total Memory: "
+            + f"{(torch.cuda.get_device_properties(args.gpu_device_index).total_memory / 1e9):.2f} GB"
+        )
+
     main(args=args)
diff --git a/src/log.py b/src/log.py
index 011552e..2259d22 100644
--- a/src/log.py
+++ b/src/log.py
@@ -1,5 +1,7 @@
 import logging
 
+from pip._internal.operations import freeze
+
 
 def setup_custom_logger(name: str = "benchmark"):
     logger = logging.getLogger(name)
@@ -14,3 +16,12 @@ def setup_custom_logger(name: str = "benchmark"):
     logger.setLevel(level=logging.DEBUG)
 
     return logger
+
+
+def print_requirements():
+    pkgs = freeze.freeze()
+    for pkg in pkgs:
+        logger.info(pkg)
+
+
+logger = setup_custom_logger()
diff --git a/src/train.py b/src/train.py
index eaa23c8..03438a8 100644
--- a/src/train.py
+++ b/src/train.py
@@ -3,7 +3,6 @@
 import segmentation_models_pytorch as smp
 import torch
 from lightning import Trainer
-from pip._internal.operations import freeze
 from torch.utils.data import DataLoader
 from torchvision.models import (
     convnext_base,
@@ -19,9 +18,10 @@
 from src import log
 from src.callbacks import BenchmarkCallback
 from src.data.in_memory_dataset import InMemoryDataset
+from src.log import print_requirements
 from src.models.lightning_modules import LitClassification
 
-logger = log.setup_custom_logger()
+logger = log.logger
 
 ARCHITECTURES = {
     "resnet50": resnet50,
@@ -38,12 +38,6 @@
 }
 
 
-def print_requirements():
-    pkgs = freeze.freeze()
-    for pkg in pkgs:
-        logger.info(pkg)
-
-
 def main(args):
     if args.list_requirements:
         print_requirements()

From 878243dccf1995cebf2ee0d0fc24379dceed1e70 Mon Sep 17 00:00:00 2001
From: Bartol Freskura <freskura.bartol@gmail.com>
Date: Mon, 10 Jun 2024 16:32:40 +0200
Subject: [PATCH 4/6] Clarify docs

---
 README.md                      | 2 +-
 dockerfiles/cuda118/Dockerfile | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d3dc561..188b340 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ In order to run benchmark docker containers you must have the following installe
 
 ### Training vs Inference
 
-By default, the container will benchmark model training. If you want to benchmark model inference, append the `src.inference` to the docker run command. See examples below for more details.
+To benchmark model training, append `src.train` to the `docker run` command. To benchmark model inference, append `src.inference` instead. See the examples below for more details.
 
 ### Examples
 
diff --git a/dockerfiles/cuda118/Dockerfile b/dockerfiles/cuda118/Dockerfile
index f7af36d..e8f676d 100644
--- a/dockerfiles/cuda118/Dockerfile
+++ b/dockerfiles/cuda118/Dockerfile
@@ -14,4 +14,3 @@ COPY ./src /workdir/src
 WORKDIR /workdir
 
 ENTRYPOINT [ "python3", "-m" ]
-CMD [ "src.train" ]

From 751431fefacdf44d6fc23b9464b2821c3d037ee0 Mon Sep 17 00:00:00 2001
From: Bartol Freskura <freskura.bartol@gmail.com>
Date: Tue, 11 Jun 2024 10:15:48 +0200
Subject: [PATCH 5/6] Clarify docs; Remove clamp; default batch size

---
 src/inference.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/inference.py b/src/inference.py
index 224202d..538e196 100644
--- a/src/inference.py
+++ b/src/inference.py
@@ -26,16 +26,16 @@ def benchmark_inference(
     stmt: str,
     setup: str,
     input: torch.Tensor,
-    n_runs=100,
+    n_runs: int = 100,
     num_threads: int = 1,
 ):
     """
     Benchmark a model using torch.utils.benchmark.
 
-    When evaluating model speed in MP/s only the video height, width and batch size are taken into
-    account. The number of channels and sequence length are ignored. Speed evaluation measures
-    how fast can we process an arbitrary input video so channels and sequence length don't
-    affect the model computation speed.
+    When evaluating model throughput in MP/s only the image height, width and batch size are taken into
+    account. The number of channels is ignored as it is fixed to 3 channels in most cases (RGB images).
+    Speed evaluation measures how fast we can process an arbitrary input image so channels
+    don't affect the model computation speed.
     """
 
     timer = benchmark.Timer(
@@ -87,7 +87,7 @@ def main(args):
     stmt = """ \
     with torch.inference_mode():
         out = model(x)
-        out = out.clamp(0, 1).float().cpu()
+        out = out.float().cpu()
     """
 
     arch = ARCHITECTURES[args.model.lower()]
@@ -115,7 +115,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Benchmark CV models training on GPU.")
 
-    parser.add_argument("--batch-size", type=int, required=True)
+    parser.add_argument("--batch-size", type=int, default=1, help="Input batch size")
     parser.add_argument(
         "--n-iters",
         type=int,

From 4a98dae0d2c4d2fc344b9c98d6f4eb2119069fc9 Mon Sep 17 00:00:00 2001
From: Bartol Freskura <freskura.bartol@gmail.com>
Date: Tue, 11 Jun 2024 17:41:16 +0200
Subject: [PATCH 6/6] Remove cmd from cuda 120 dockerfile

---
 dockerfiles/cuda120/Dockerfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dockerfiles/cuda120/Dockerfile b/dockerfiles/cuda120/Dockerfile
index d008540..2b112e1 100644
--- a/dockerfiles/cuda120/Dockerfile
+++ b/dockerfiles/cuda120/Dockerfile
@@ -14,4 +14,3 @@ COPY ./src /workdir/src
 WORKDIR /workdir
 
 ENTRYPOINT [ "python3", "-m" ]
-CMD [ "src.train" ]