From 025747f88c254ca28554eade375b4bad608cb54a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 27 Aug 2024 14:12:10 +0100 Subject: [PATCH] Add transparency support to webp decoder --- test/test_image.py | 22 +++++++- .../csrc/io/image/cpu/decode_image.cpp | 2 +- torchvision/csrc/io/image/cpu/decode_webp.cpp | 50 ++++++++++++++++--- torchvision/csrc/io/image/cpu/decode_webp.h | 5 +- torchvision/csrc/io/image/image.cpp | 2 +- torchvision/io/image.py | 16 ++++-- 6 files changed, 82 insertions(+), 15 deletions(-) diff --git a/test/test_image.py b/test/test_image.py index 5b0da3481ab..0e5390da497 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -875,7 +875,7 @@ def test_decode_gif_webp_errors(decode_fun): if decode_fun is decode_gif: expected_match = re.escape("DGifOpenFileName() failed - 103") elif decode_fun is decode_webp: - expected_match = "WebPDecodeRGB failed." + expected_match = "WebPGetFeatures failed." with pytest.raises(RuntimeError, match=expected_match): decode_fun(encoded_data) @@ -891,6 +891,26 @@ def test_decode_webp(decode_fun, scripted): assert img[None].is_contiguous(memory_format=torch.channels_last) +# TODO: explain this test and why it's skipped +@pytest.mark.skip(reason="Need to download test images first") +@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image)) +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize( + "mode, pil_mode", ((ImageReadMode.RGB, "RGB"), (ImageReadMode.RGB_ALPHA, "RGBA"), (ImageReadMode.UNCHANGED, None)) +) +@pytest.mark.parametrize("filename", Path("/home/nicolashug/webp_samples").glob("*.webp")) +def test_decode_webp_against_pil(decode_fun, scripted, mode, pil_mode, filename): + encoded_bytes = read_file(filename) + if scripted: + decode_fun = torch.jit.script(decode_fun) + img = decode_fun(encoded_bytes, mode=mode) + assert img[None].is_contiguous(memory_format=torch.channels_last) + + pil_img = Image.open(filename).convert(pil_mode) + from_pil = F.pil_to_tensor(pil_img) + assert_equal(img, from_pil) + + @pytest.mark.xfail(reason="AVIF support not enabled yet.") @pytest.mark.parametrize("decode_fun", (_decode_avif, decode_image)) @pytest.mark.parametrize("scripted", (False, True)) diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index 75c7e06195a..0bc9d4396a5 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -67,7 +67,7 @@ torch::Tensor decode_image( TORCH_CHECK(data.numel() >= 15, err_msg); if ((memcmp(webp_signature_begin, datap, 4) == 0) && (memcmp(webp_signature_end, datap + 8, 7) == 0)) { - return decode_webp(data); + return decode_webp(data, mode); } TORCH_CHECK(false, err_msg); diff --git a/torchvision/csrc/io/image/cpu/decode_webp.cpp b/torchvision/csrc/io/image/cpu/decode_webp.cpp index 844ce61a3e3..174498db047 100644 --- a/torchvision/csrc/io/image/cpu/decode_webp.cpp +++ b/torchvision/csrc/io/image/cpu/decode_webp.cpp @@ -8,13 +8,17 @@ namespace vision { namespace image { #if !WEBP_FOUND -torch::Tensor decode_webp(const torch::Tensor& data) { +torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode) { TORCH_CHECK( false, "decode_webp: torchvision not compiled with libwebp support"); } #else -torch::Tensor decode_webp(const torch::Tensor& encoded_data) { +torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode) { TORCH_CHECK(encoded_data.is_contiguous(), "Input tensor must be contiguous."); TORCH_CHECK( encoded_data.dtype() == torch::kU8, @@ -26,13 +30,45 @@ torch::Tensor decode_webp(const torch::Tensor& encoded_data) { encoded_data.dim(), " dims."); + auto encoded_data_p = encoded_data.data_ptr(); + auto encoded_data_size = encoded_data.numel(); + + WebPBitstreamFeatures features; + auto res = WebPGetFeatures(encoded_data_p, encoded_data_size, &features); + TORCH_CHECK( + res == VP8_STATUS_OK, "WebPGetFeatures failed with error code ", res); + TORCH_CHECK( + !features.has_animation, "Animated webp files are not supported."); + + auto decoding_func = WebPDecodeRGB; + int num_channels = 0; + if (mode == IMAGE_READ_MODE_RGB) { + decoding_func = WebPDecodeRGB; + num_channels = 3; + } else if (mode == IMAGE_READ_MODE_RGB_ALPHA) { + decoding_func = WebPDecodeRGBA; + num_channels = 4; + } else { + // Assume mode is "unchanged" + decoding_func = features.has_alpha ? WebPDecodeRGBA : WebPDecodeRGB; + num_channels = features.has_alpha ? 4 : 3; + } + int width = 0; int height = 0; - auto decoded_data = WebPDecodeRGB( - encoded_data.data_ptr(), encoded_data.numel(), &width, &height); - TORCH_CHECK(decoded_data != nullptr, "WebPDecodeRGB failed."); - auto out = torch::from_blob(decoded_data, {height, width, 3}, torch::kUInt8); - return out.permute({2, 0, 1}); // return CHW, channels-last + + auto decoded_data = + decoding_func(encoded_data_p, encoded_data_size, &width, &height); + TORCH_CHECK(decoded_data != nullptr, "WebPDecodeRGB[A] failed."); + + auto out = torch::from_blob( + decoded_data, {height, width, num_channels}, torch::kUInt8); + + if (features.has_alpha && mode == IMAGE_READ_MODE_RGB) { + namespace idx = torch::indexing; + out = out.index({idx::Slice(), idx::Slice(), idx::Slice(idx::None, 3)}); + } + return out.permute({2, 0, 1}); } #endif // WEBP_FOUND diff --git a/torchvision/csrc/io/image/cpu/decode_webp.h b/torchvision/csrc/io/image/cpu/decode_webp.h index 00a0c3362f7..5632ea56ff9 100644 --- a/torchvision/csrc/io/image/cpu/decode_webp.h +++ b/torchvision/csrc/io/image/cpu/decode_webp.h @@ -1,11 +1,14 @@ #pragma once #include +#include "../image_read_mode.h" namespace vision { namespace image { -C10_EXPORT torch::Tensor decode_webp(const torch::Tensor& data); +C10_EXPORT torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index 43e8ecbe4a2..4ed9b03e79e 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -21,7 +21,7 @@ static auto registry = .op("image::encode_png", &encode_png) .op("image::decode_jpeg(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor", &decode_jpeg) - .op("image::decode_webp", &decode_webp) + .op("image::decode_webp(Tensor encoded_data, int mode) -> Tensor", &decode_webp) .op("image::decode_avif", &decode_avif) .op("image::encode_jpeg", &encode_jpeg) .op("image::read_file", &read_file) diff --git a/torchvision/io/image.py b/torchvision/io/image.py index 6d4613f703b..df3e44ab713 100644 --- a/torchvision/io/image.py +++ b/torchvision/io/image.py @@ -28,6 +28,11 @@ class ImageReadMode(Enum): ``ImageReadMode.GRAY_ALPHA`` for grayscale with transparency, ``ImageReadMode.RGB`` for RGB and ``ImageReadMode.RGB_ALPHA`` for RGB with transparency. + + .. note:: + + Some decoders won't support all possible values, e.g. a decoder may only + support "RGB" and "RGBA" mode. """ UNCHANGED = 0 @@ -365,23 +370,26 @@ def decode_gif(input: torch.Tensor) -> torch.Tensor: def decode_webp( input: torch.Tensor, + mode: ImageReadMode = ImageReadMode.UNCHANGED, ) -> torch.Tensor: """ - Decode a WEBP image into a 3 dimensional RGB Tensor. + Decode a WEBP image into a 3 dimensional RGB[A] Tensor. - The values of the output tensor are uint8 between 0 and 255. If the input - image is RGBA, the transparency is ignored. + The values of the output tensor are uint8 between 0 and 255. Args: input (Tensor[1]): a one dimensional contiguous uint8 tensor containing the raw bytes of the WEBP image. + mode (ImageReadMode): The read mode used for optionally + converting the image color space. Default: ``ImageReadMode.UNCHANGED``. + Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``. Returns: Decoded image (Tensor[image_channels, image_height, image_width]) """ if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(decode_webp) - return torch.ops.image.decode_webp(input) + return torch.ops.image.decode_webp(input, mode.value) def _decode_avif(