GPU jpeg decoder: add batch support and hardware decoding #8496

Merged: 41 commits, Aug 7, 2024
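In short, this PR lets torchvision.io.decode_jpeg and torchvision.io.encode_jpeg accept lists of tensors, with batched inputs decoded or encoded on the GPU via nvJPEG. A minimal usage sketch based on the tests added below (assumes a CUDA-enabled torchvision build with nvJPEG support; the file names are placeholders):

import torch
from torchvision.io import read_file, decode_jpeg, encode_jpeg

# Hypothetical file names; any JPEGs on disk work here.
encoded = [read_file(p) for p in ["img0.jpg", "img1.jpg", "img2.jpg"]]

# New in this PR: pass a list of encoded CPU uint8 tensors and decode the
# whole batch on the GPU in one call; the result is a list of CHW uint8
# CUDA tensors.
decoded = decode_jpeg(encoded, device="cuda")

# encode_jpeg likewise accepts a list; CUDA inputs are encoded on the GPU
# and the encoded tensors come back on the same device.
reencoded = encode_jpeg(decoded)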
Commits:

133d7c1  Adding GPU acceleration to encode_jpeg (deekay42, Apr 12, 2024)
4cc30cb  fix test cases (deekay42, Apr 23, 2024)
2db02f0  fix lints (deekay42, Apr 23, 2024)
6acef83  fix lints2 (deekay42, Apr 23, 2024)
ae0450d  latest round of updates (deekay42, Apr 29, 2024)
a799c53  fix lints (deekay42, Apr 30, 2024)
c5810ff  Ignore mypy (NicolasHug, May 31, 2024)
ff40253  Add comment (NicolasHug, May 31, 2024)
0972863  minor test refactor (NicolasHug, May 31, 2024)
4ce658d  Merge branch 'main' of github.com:pytorch/vision into add_gpu_encode (NicolasHug, May 31, 2024)
65372a3  Merge branch 'pytorch:main' into add_gpu_encode (deekay42, Jun 4, 2024)
62e072a  Caching nvjpeg vars across calls (deekay42, Jun 5, 2024)
b3d06cb  Update if nvjpeg not found (deekay42, Jun 5, 2024)
fcf8a78  Adding gpu decode (deekay42, Jun 5, 2024)
f190d99  Update if nvjpeg not found (deekay42, Jun 5, 2024)
c471db8  merge (deekay42, Jun 5, 2024)
b5eaa89  Merge branch 'main' of github.com:pytorch/vision into add_gpu_encode (NicolasHug, Jun 10, 2024)
5051050  Revert "Ignore mypy" (NicolasHug, Jun 10, 2024)
136f790  Add comment (NicolasHug, Jun 10, 2024)
0a88d27  minor changes to address ahmad's comments (deekay42, Jun 11, 2024)
df60183  Merge branch 'add_gpu_encode' of https://github.com/deekay42/vision i… (deekay42, Jun 11, 2024)
f3c8a72  add dtor log messages (deekay42, Jun 12, 2024)
117d1f1  Skip CUDA cleanup altogether (deekay42, Jun 12, 2024)
21eca4c  Merge branch 'main' into add_gpu_encode (NicolasHug, Jun 13, 2024)
64f2cf9  Merge branch 'add_gpu_encode' into add_gpu_decode (deekay42, Jun 17, 2024)
156e250  disable cleanup (deekay42, Jun 17, 2024)
3efb658  Merge branch 'add_gpu_decode' (deekay42, Jun 17, 2024)
5f77eea  disable cleanup (deekay42, Jun 17, 2024)
ac8edd2  merge (deekay42, Jun 17, 2024)
cebe75f  Merge branch 'add_gpu_encode' into add_gpu_decode (deekay42, Jun 17, 2024)
2e60784  Merge branch 'deekay42-add_gpu_decode' (deekay42, Jun 17, 2024)
01a5621  merge (deekay42, Jun 17, 2024)
ccdafd4  ahmad's comments (deekay42, Jun 26, 2024)
c44599d  Merge branch 'main' of github.com:pytorch/vision into add_gpu_decode (NicolasHug, Aug 5, 2024)
25ca905  Fix syntax (NicolasHug, Aug 5, 2024)
43b317b  self address a few comments / nits (NicolasHug, Aug 5, 2024)
223f8a0  lint (NicolasHug, Aug 5, 2024)
863cf76  ahmads comments 2 (deekay42, Aug 6, 2024)
fc28c60  lint (NicolasHug, Aug 7, 2024)
dcd1c07  lint (NicolasHug, Aug 7, 2024)
efa746d  Merge branch 'main' into add_gpu_decode (NicolasHug, Aug 7, 2024)
67 changes: 0 additions & 67 deletions benchmarks/encoding.py

This file was deleted.

99 changes: 99 additions & 0 deletions benchmarks/encoding_decoding.py
@@ -0,0 +1,99 @@
import os
import platform
import statistics

import torch
import torch.utils.benchmark as benchmark
import torchvision


def print_machine_specs():
    print("Processor:", platform.processor())
    print("Platform:", platform.platform())
    print("Logical CPUs:", os.cpu_count())
    print(f"\nCUDA device: {torch.cuda.get_device_name()}")
    print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


def get_data():
    transform = torchvision.transforms.Compose(
        [
            torchvision.transforms.PILToTensor(),
        ]
    )
    path = os.path.join(os.getcwd(), "data")
    testset = torchvision.datasets.Places365(
        root="./data", download=not os.path.exists(path), transform=transform, split="val"
    )
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] for r in batch]
    )
    return next(iter(testloader))


def run_encoding_benchmark(decoded_images):
    results = []
    for device in ["cpu", "cuda"]:
        decoded_images_device = [t.to(device=device) for t in decoded_images]
        for size in [1, 100, 1000]:
            for num_threads in [1, 12, 24]:
                for stmt, strat in zip(
                    [
                        "[torchvision.io.encode_jpeg(img) for img in decoded_images_device_trunc]",
                        "torchvision.io.encode_jpeg(decoded_images_device_trunc)",
                    ],
                    ["unfused", "fused"],
[Review comment, Contributor] I could be wrong, but "batched" seems like a better term than "fused", since it appears to be batching images, not necessarily fusing kernels.

[Reply, Contributor Author] If the images are batched, it uses a fused kernel.
                ):
                    decoded_images_device_trunc = decoded_images_device[:size]
                    t = benchmark.Timer(
                        stmt=stmt,
                        setup="import torchvision",
                        globals={"decoded_images_device_trunc": decoded_images_device_trunc},
                        label="Image Encoding",
                        sub_label=f"{device.upper()} ({strat}): {stmt}",
                        description=f"{size} images",
                        num_threads=num_threads,
                    )
                    results.append(t.blocked_autorange())
    compare = benchmark.Compare(results)
    compare.print()


def run_decoding_benchmark(encoded_images):
    results = []
    for device in ["cpu", "cuda"]:
        for size in [1, 100, 1000]:
            for num_threads in [1, 12, 24]:
                for stmt, strat in zip(
                    [
                        f"[torchvision.io.decode_jpeg(img, device='{device}') for img in encoded_images_trunc]",
                        f"torchvision.io.decode_jpeg(encoded_images_trunc, device='{device}')",
                    ],
                    ["unfused", "fused"],
                ):
                    encoded_images_trunc = encoded_images[:size]
                    t = benchmark.Timer(
                        stmt=stmt,
                        setup="import torchvision",
                        globals={"encoded_images_trunc": encoded_images_trunc},
                        label="Image Decoding",
                        sub_label=f"{device.upper()} ({strat}): {stmt}",
                        description=f"{size} images",
                        num_threads=num_threads,
                    )
                    results.append(t.blocked_autorange())
    compare = benchmark.Compare(results)
    compare.print()


if __name__ == "__main__":
    print_machine_specs()
    decoded_images = get_data()
    mean_h, mean_w = statistics.mean(t.shape[-2] for t in decoded_images), statistics.mean(
        t.shape[-1] for t in decoded_images
    )
    print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}")
    run_encoding_benchmark(decoded_images)
    encoded_images_cuda = torchvision.io.encode_jpeg([img.cuda() for img in decoded_images])
    encoded_images_cpu = [img.cpu() for img in encoded_images_cuda]
    run_decoding_benchmark(encoded_images_cpu)
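For quick experimentation without the Places365 download, a minimal variant of the encoding benchmark above (an illustrative sketch, assuming a CUDA-enabled build with nvJPEG; random tensors stand in for real images, so absolute numbers will differ from the dataset run):

import torch
import torch.utils.benchmark as benchmark

# Random CHW uint8 images as a stand-in for the dataset.
imgs = [torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8, device="cuda") for _ in range(100)]

results = []
for stmt, strat in [
    ("[torchvision.io.encode_jpeg(img) for img in imgs]", "unfused"),
    ("torchvision.io.encode_jpeg(imgs)", "fused"),
]:
    t = benchmark.Timer(
        stmt=stmt,
        setup="import torchvision",
        globals={"imgs": imgs},
        label="Image Encoding (smoke test)",
        sub_label=strat,
        description="100 random images",
    )
    results.append(t.blocked_autorange())
benchmark.Compare(results).print()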
121 changes: 99 additions & 22 deletions test/test_image.py
@@ -413,23 +413,32 @@ def test_read_interlaced_png():
 
 
 @needs_cuda
-@pytest.mark.parametrize(
-    "img_path",
-    [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(IMAGE_ROOT, ".jpg")],
-)
 @pytest.mark.parametrize("mode", [ImageReadMode.UNCHANGED, ImageReadMode.GRAY, ImageReadMode.RGB])
 @pytest.mark.parametrize("scripted", (False, True))
-def test_decode_jpeg_cuda(mode, img_path, scripted):
-    if "cmyk" in img_path:
-        pytest.xfail("Decoding a CMYK jpeg isn't supported")
-
-    data = read_file(img_path)
-    img = decode_image(data, mode=mode)
-    f = torch.jit.script(decode_jpeg) if scripted else decode_jpeg
-    img_nvjpeg = f(data, mode=mode, device="cuda")
-
-    # Some difference expected between jpeg implementations
-    assert (img.float() - img_nvjpeg.cpu().float()).abs().mean() < 2
+def test_decode_jpegs_cuda(mode, scripted):
+    encoded_images = []
+    for jpeg_path in get_images(IMAGE_ROOT, ".jpg"):
+        if "cmyk" in jpeg_path:
+            continue
+        encoded_image = read_file(jpeg_path)
+        encoded_images.append(encoded_image)
+    decoded_images_cpu = decode_jpeg(encoded_images, mode=mode)
+    decode_fn = torch.jit.script(decode_jpeg) if scripted else decode_jpeg
+
+    # test multithreaded decoding
+    # in the current version we prevent this by using a lock but we still want to test it
+    num_workers = 10
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+        futures = [executor.submit(decode_fn, encoded_images, mode, "cuda") for _ in range(num_workers)]
+        decoded_images_threaded = [future.result() for future in futures]
+    assert len(decoded_images_threaded) == num_workers
+    for decoded_images in decoded_images_threaded:
+        assert len(decoded_images) == len(encoded_images)
+        for decoded_image_cuda, decoded_image_cpu in zip(decoded_images, decoded_images_cpu):
+            assert decoded_image_cuda.shape == decoded_image_cpu.shape
+            assert decoded_image_cuda.dtype == decoded_image_cpu.dtype == torch.uint8
+            assert (decoded_image_cuda.cpu().float() - decoded_image_cpu.cpu().float()).abs().mean() < 2
 
 
 @needs_cuda
@@ -440,25 +449,95 @@ def test_decode_image_cuda_raises():
 
 
 @needs_cuda
-@pytest.mark.parametrize("cuda_device", ("cuda", "cuda:0", torch.device("cuda")))
-def test_decode_jpeg_cuda_device_param(cuda_device):
-    """Make sure we can pass a string or a torch.device as device param"""
+def test_decode_jpeg_cuda_device_param():
     path = next(path for path in get_images(IMAGE_ROOT, ".jpg") if "cmyk" not in path)
     data = read_file(path)
-    decode_jpeg(data, device=cuda_device)
+    current_device = torch.cuda.current_device()
+    current_stream = torch.cuda.current_stream()
+    num_devices = torch.cuda.device_count()
+    devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)]
+    results = []
+    for device in devices:
+        results.append(decode_jpeg(data, device=device))
+    assert len(results) == len(devices)
+    for result in results:
+        assert torch.all(result.cpu() == results[0].cpu())
+    assert current_device == torch.cuda.current_device()
+    assert current_stream == torch.cuda.current_stream()


 @needs_cuda
 def test_decode_jpeg_cuda_errors():
     data = read_file(next(get_images(IMAGE_ROOT, ".jpg")))
     with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"):
         decode_jpeg(data.reshape(-1, 1), device="cuda")
-    with pytest.raises(RuntimeError, match="input tensor must be on CPU"):
+    with pytest.raises(ValueError, match="must be tensors"):
+        decode_jpeg([1, 2, 3])
+    with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"):
         decode_jpeg(data.to("cuda"), device="cuda")
     with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"):
         decode_jpeg(data.to(torch.float), device="cuda")
-    with pytest.raises(RuntimeError, match="Expected a cuda device"):
-        torch.ops.image.decode_jpeg_cuda(data, ImageReadMode.UNCHANGED.value, "cpu")
+    with pytest.raises(RuntimeError, match="Expected the device parameter to be a cuda device"):
+        torch.ops.image.decode_jpegs_cuda([data], ImageReadMode.UNCHANGED.value, "cpu")
+    with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"):
+        decode_jpeg(
+            torch.empty((100,), dtype=torch.uint8, device="cuda"),
+        )
+    with pytest.raises(ValueError, match="Input list must contain tensors on CPU"):
+        decode_jpeg(
+            [
+                torch.empty((100,), dtype=torch.uint8, device="cuda"),
+                torch.empty((100,), dtype=torch.uint8, device="cuda"),
+            ]
+        )
+
+    with pytest.raises(ValueError, match="Input list must contain tensors on CPU"):
+        decode_jpeg(
+            [
+                torch.empty((100,), dtype=torch.uint8, device="cuda"),
+                torch.empty((100,), dtype=torch.uint8, device="cuda"),
+            ],
+            device="cuda",
+        )
+
+    with pytest.raises(ValueError, match="Input list must contain tensors on CPU"):
+        decode_jpeg(
+            [
+                torch.empty((100,), dtype=torch.uint8, device="cpu"),
+                torch.empty((100,), dtype=torch.uint8, device="cuda"),
+            ],
+            device="cuda",
+        )
+
+    with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"):
+        decode_jpeg(
+            [
+                torch.empty((100,), dtype=torch.uint8),
+                torch.empty((100,), dtype=torch.float32),
+            ],
+            device="cuda",
+        )
+
+    with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"):
+        decode_jpeg(
+            [
+                torch.empty((100,), dtype=torch.uint8),
+                torch.empty((1, 100), dtype=torch.uint8),
+            ],
+            device="cuda",
+        )
+
+    with pytest.raises(RuntimeError, match="Error while decoding JPEG images"):
+        decode_jpeg(
+            [
+                torch.empty((100,), dtype=torch.uint8),
+                torch.empty((100,), dtype=torch.uint8),
+            ],
+            device="cuda",
+        )
+
+    with pytest.raises(ValueError, match="Input list must contain at least one element"):
+        decode_jpeg([], device="cuda")
 
 
 def test_encode_jpeg_errors():
@@ -515,12 +594,10 @@ def test_encode_jpeg_cuda_device_param():
     devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)]
     results = []
     for device in devices:
-        print(f"python: device: {device}")
         results.append(encode_jpeg(data.to(device=device)))
     assert len(results) == len(devices)
     for result in results:
         assert torch.all(result.cpu() == results[0].cpu())
-
     assert current_device == torch.cuda.current_device()
     assert current_stream == torch.cuda.current_stream()