Document inference; refactor; update packages

tensorpix · Jun 8, 2024 · 2bac713 · 2bac713
1 parent 4367742
commit 2bac713
Show file tree

Hide file tree

Showing 7 changed files with 82 additions and 54 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,3 @@
-
 <p align="center" >
   <img width="400" src="https://cdn.tensorpix.ai/TensorPix-Logo-color.svg" alt="Tensorpix logo"/>
 </p>
@@ -38,15 +37,15 @@ You can use this benchmark repo to:
 
 Please open an issue if you need support for a new architecture.
 
-* ResNet50
-* ConvNext (base)
-* VGG16
-* Efficient Net v2
-* MobileNet V3
-* ResNeXt50
-* SWIN
-* VIT
-* UNet with ResNet50 backbone
+- ResNet50
+- ConvNext (base)
+- VGG16
+- Efficient Net v2
+- MobileNet V3
+- ResNeXt50
+- SWIN
+- VIT
+- UNet with ResNet50 backbone
 
 ## 📖 How to benchmark
 
@@ -58,19 +57,31 @@ In order to run benchmark docker containers you must have the following installe
 - NVIDIA drivers. See [Versions](#versions) when choosing the docker image.
 - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - required in order to use CUDA inside docker containers
 
+### Training vs Inference
+
+By default, the container benchmarks model training. To benchmark model inference instead, pass `src.inference` as the first argument after the image name in the `docker run` command. See the examples below for more details.
+
 ### Examples
 
 **Minimal**
 
-`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models --batch-size 32`
+`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32`
 
 **Advanced**
 
-`docker run --rm --ipc=host --ulimit memlock=-1 --gpus '"device=0,1"' -v ./benchmarks:/workdir/benchmarks ghcr.io/tensorpix/benchmarking-cv-models --batch-size 32 --n-iters 1000 --warmup-steps 100 --model resnext50 --precision 16-mixed --width 320 --height 320`
+`docker run --rm --ipc=host --ulimit memlock=-1 --gpus '"device=0,1"' -v ./benchmarks:/workdir/benchmarks ghcr.io/tensorpix/benchmarking-cv-models src.train --batch-size 32 --n-iters 1000 --warmup-steps 100 --model resnext50 --precision 16-mixed --width 320 --height 320`
+
+**Benchmark Inference**
+
+`docker run --rm --ipc=host --ulimit memlock=-1 --gpus all ghcr.io/tensorpix/benchmarking-cv-models src.inference --batch-size 32 --n-iters 1000 --model resnext50 --precision 16 --width 256 --height 256`
+
+**List all train options:**
+
+`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.train --help`
 
-**List all options:**
+**List all inference options:**
 
-`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models --help`
+`docker run --rm ghcr.io/tensorpix/benchmarking-cv-models src.inference --help`
 
 ### How to select particular GPUs
 

diff --git a/dockerfiles/cuda118/Dockerfile b/dockerfiles/cuda118/Dockerfile
@@ -13,4 +13,5 @@ RUN pip3 install --no-cache-dir -r /tmp/requirements.txt --extra-index-url https
 COPY ./src /workdir/src
 WORKDIR /workdir
 
-ENTRYPOINT [ "python3", "-m", "src.train" ]
+ENTRYPOINT [ "python3", "-m" ]
+CMD [ "src.train" ]
diff --git a/dockerfiles/cuda120/Dockerfile b/dockerfiles/cuda120/Dockerfile
@@ -8,9 +8,10 @@ RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install
 	rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt /tmp/requirements.txt
-RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121
 
 COPY ./src /workdir/src
 WORKDIR /workdir
 
-ENTRYPOINT [ "python3", "-m", "src.train" ]
+ENTRYPOINT [ "python3", "-m" ]
+CMD [ "src.train" ]
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
-lightning==2.1.4
+lightning==2.2.5
 protobuf==3.20.*
 segmentation-models-pytorch==0.3.3
 six==1.16.0
-torch==2.1.2
-torchvision==0.16.2
+torch==2.3.1
+torchvision==0.18.1
diff --git a/src/inference.py b/src/inference.py
@@ -1,9 +1,13 @@
 import argparse
-from pprint import pprint
 
 import torch
 import torch.utils.benchmark as benchmark
 
+from src import log
+from src.log import print_requirements
+
+logger = log.logger
+
 ARCHITECTURES = {
     "resnet50": "resnet50",
     "convnext": "convnext_base",
@@ -41,40 +45,41 @@ def benchmark_inference(
         globals={"x": input},
     )
 
-    print(
+    logger.info(
         f"Running benchmark on sample of {n_runs} runs with {num_threads} thread(s)..."
     )
     result = timer.timeit(n_runs)
 
     batch, height, width = input.size(0), input.size(-2), input.size(-1)
     total_pixels = batch * width * height
 
-    print(f"Batch size: {batch}")
-    print(f"Input resolution: {width}x{height} pixels\n")
+    logger.info(f"Batch size: {batch}")
+    logger.info(f"Input resolution: {width}x{height} pixels\n")
 
     mean_per_batch = result.mean
     median_per_batch = result.median
 
     mean_speed_mpx = (total_pixels / 1e6) / mean_per_batch
     median_speed_mpx = (total_pixels / 1e6) / median_per_batch
 
-    print(f"Mean time per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s")
-    print(
-        f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n"
+    logger.info(
+        f"Mean time per {batch} {width}x{height} px frames: {mean_per_batch:.4f} s"
+    )
+    logger.info(
+        f"Median time per {batch} {width}x{height} px frames: {median_per_batch:.4f} s\n"
     )
 
-    print(
+    logger.info(
         f"Model mean throughoutput in megapixels per second: {mean_speed_mpx:.3f} MP/s"
     )
-    print(
+    logger.info(
         f"Model median throughoutput in megapixels per second: {median_speed_mpx:.3f} MP/s\n"
     )
 
 
 def main(args):
-    args_dict = vars(args)
-    print("Arguments:")
-    pprint(args_dict)
+    if args.list_requirements:
+        print_requirements()
 
     if args.model.lower() not in ARCHITECTURES:
         raise ValueError("Architecture not supported.")
@@ -92,8 +97,8 @@ def main(args):
     precision = torch.float16 if args.precision == "16" else torch.float32
 
     x = torch.rand(*input_shape, dtype=precision)
-    x = x.cuda(0, non_blocking=True)
-    setup = f"{setup}; model.cuda(0)"
+    x = x.cuda(args.gpu_device_index, non_blocking=True)
+    setup = f"{setup}; model.cuda({args.gpu_device_index})"
 
     if args.precision == "16":
         setup = f"{setup}; model.half()"
@@ -108,16 +113,6 @@ def main(args):
 
 
 if __name__ == "__main__":
-    if not torch.cuda.is_available():
-        raise ValueError("CUDA device not found on this system.")
-    else:
-        print("CUDA Device Name:", torch.cuda.get_device_name(0))
-        print("CUDNN version:", torch.backends.cudnn.version())
-        print(
-            "CUDA Device Total Memory: "
-            + f"{(torch.cuda.get_device_properties(0).total_memory / 1e9):.2f} GB",
-        )
-
     parser = argparse.ArgumentParser(description="Benchmark CV models training on GPU.")
 
     parser.add_argument("--batch-size", type=int, required=True)
@@ -129,9 +124,10 @@ def main(args):
     )
     parser.add_argument("--precision", choices=["32", "16"], default="16")
     parser.add_argument("--n-workers", type=int, default=1)
+    parser.add_argument("--gpu-device-index", type=int, default=0)
 
-    parser.add_argument("--width", type=int, default=192, help="Input width")
-    parser.add_argument("--height", type=int, default=192, help="Input height")
+    parser.add_argument("--width", type=int, default=224, help="Input width")
+    parser.add_argument("--height", type=int, default=224, help="Input height")
 
     parser.add_argument(
         "--model",
@@ -146,4 +142,18 @@ def main(args):
     if args.n_iters <= 0:
         raise ValueError("Number of iterations must be > 0")
 
+    logger.info("########## STARTING NEW INFERENCE BENCHMARK RUN ###########")
+
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA device not found on this system.")
+    else:
+        logger.info(
+            f"CUDA Device Name: {torch.cuda.get_device_name(args.gpu_device_index)}"
+        )
+        logger.info(f"CUDNN version: {torch.backends.cudnn.version()}")
+        logger.info(
+            "CUDA Device Total Memory: "
+            + f"{(torch.cuda.get_device_properties(args.gpu_device_index).total_memory / 1e9):.2f} GB"
+        )
+
     main(args=args)
diff --git a/src/log.py b/src/log.py
@@ -1,5 +1,7 @@
 import logging
 
+from pip._internal.operations import freeze
+
 
 def setup_custom_logger(name: str = "benchmark"):
     logger = logging.getLogger(name)
@@ -14,3 +16,12 @@ def setup_custom_logger(name: str = "benchmark"):
     logger.setLevel(level=logging.DEBUG)
 
     return logger
+
+
+def print_requirements():
+    pkgs = freeze.freeze()
+    for pkg in pkgs:
+        logger.info(pkg)
+
+
+logger = setup_custom_logger()
diff --git a/src/train.py b/src/train.py
@@ -3,7 +3,6 @@
 import segmentation_models_pytorch as smp
 import torch
 from lightning import Trainer
-from pip._internal.operations import freeze
 from torch.utils.data import DataLoader
 from torchvision.models import (
     convnext_base,
@@ -19,9 +18,10 @@
 from src import log
 from src.callbacks import BenchmarkCallback
 from src.data.in_memory_dataset import InMemoryDataset
+from src.log import print_requirements
 from src.models.lightning_modules import LitClassification
 
-logger = log.setup_custom_logger()
+logger = log.logger
 
 ARCHITECTURES = {
     "resnet50": resnet50,
@@ -38,12 +38,6 @@
 }
 
 
-def print_requirements():
-    pkgs = freeze.freeze()
-    for pkg in pkgs:
-        logger.info(pkg)
-
-
 def main(args):
     if args.list_requirements:
         print_requirements()