From 2d8a288f78b576f25150533c3e3b6534f5fce5c4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 1 Oct 2024 13:51:05 +0100 Subject: [PATCH] [Cherry-Pick for 0.20] Revamp decoding docs (#8633) (#8666) --- docs/source/io.rst | 107 +++++++++++++++++++-------------- torchvision/io/image.py | 80 ++++++++++++------------ torchvision/io/video.py | 24 ++++++++ torchvision/io/video_reader.py | 8 +++ 4 files changed, 131 insertions(+), 88 deletions(-) diff --git a/docs/source/io.rst b/docs/source/io.rst index d372091cc6a..6a76f95e897 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -3,33 +3,46 @@ Decoding / Encoding images and videos .. currentmodule:: torchvision.io -The :mod:`torchvision.io` package provides functions for performing IO -operations. They are currently specific to reading and writing images and -videos. +The :mod:`torchvision.io` module provides utilities for decoding and encoding +images and videos. -Images ------- +Image Decoding +-------------- Torchvision currently supports decoding JPEG, PNG, WEBP and GIF images. JPEG decoding can also be done on CUDA GPUs. -For encoding, JPEG (cpu and CUDA) and PNG are supported. +The main entry point is the :func:`~torchvision.io.decode_image` function, which +you can use as an alternative to ``PIL.Image.open()``. It will decode images +straight into image Tensors, thus saving you the conversion and allowing you to +run transforms/preproc natively on tensors. + +.. code:: + + from torchvision.io import decode_image + + img = decode_image("path_to_image", mode="RGB") + img.dtype # torch.uint8 + + # Or + raw_encoded_bytes = ... # read encoded bytes from your file system + img = decode_image(raw_encoded_bytes, mode="RGB") + + +:func:`~torchvision.io.decode_image` will automatically detect the image format, +and call the corresponding decoder. You can also use the lower-level +format-specific decoders which can be more powerful, e.g. if you want to +encode/decode JPEGs on CUDA. .. 
autosummary:: :toctree: generated/ :template: function.rst decode_image - encode_jpeg decode_jpeg - write_jpeg + decode_png decode_gif decode_webp - encode_png - decode_png - write_png - read_file - write_file .. autosummary:: :toctree: generated/ @@ -41,14 +54,47 @@ Obsolete decoding function: .. autosummary:: :toctree: generated/ - :template: class.rst + :template: function.rst read_image +Image Encoding +-------------- + +For encoding, JPEG (cpu and CUDA) and PNG are supported. + + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + encode_jpeg + write_jpeg + encode_png + write_png + +IO operations +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + read_file + write_file Video ----- +.. warning:: + + Torchvision supports video decoding through different APIs listed below, + some of which are still in BETA stage. In the near future, we intend to + centralize PyTorch's video decoding capabilities within the `torchcodec + <https://github.com/pytorch/torchcodec>`_ project. We encourage you to try + it out and share your feedback, as the torchvision video decoders will + eventually be deprecated. + .. autosummary:: :toctree: generated/ :template: function.rst @@ -58,45 +104,14 @@ Video write_video -Fine-grained video API -^^^^^^^^^^^^^^^^^^^^^^ +**Fine-grained video API** In addition to the :mod:`read_video` function, we provide a high-performance lower-level API for more fine-grained control compared to the :mod:`read_video` function. It does all this whilst fully supporting torchscript. -.. betastatus:: fine-grained video API - .. autosummary:: :toctree: generated/ :template: class.rst VideoReader - - -Example of inspecting a video: - -.. code:: python - - import torchvision - video_path = "path to a test video" - # Constructor allocates memory and a threaded decoder - # instance per video. At the moment it takes two arguments: - # path to the video file, and a wanted stream. 
- reader = torchvision.io.VideoReader(video_path, "video") - - # The information about the video can be retrieved using the - # `get_metadata()` method. It returns a dictionary for every stream, with - # duration and other relevant metadata (often frame rate) - reader_md = reader.get_metadata() - - # metadata is structured as a dict of dicts with following structure - # {"stream_type": {"attribute": [attribute per stream]}} - # - # following would print out the list of frame rates for every present video stream - print(reader_md["video"]["fps"]) - - # we explicitly select the stream we would like to operate on. In - # the constructor we select a default video stream, but - # in practice, we can set whichever stream we would like - video.set_current_stream("video:0") diff --git a/torchvision/io/image.py b/torchvision/io/image.py index 8a2281946a9..cb48d0e6816 100644 --- a/torchvision/io/image.py +++ b/torchvision/io/image.py @@ -20,19 +20,25 @@ class ImageReadMode(Enum): - """ - Support for various modes while reading images. + """Allow automatic conversion to RGB, RGBA, etc while decoding. + + .. note:: + + You don't need to use this struct, you can just pass strings to all + ``mode`` parameters, e.g. ``mode="RGB"``. - Use ``ImageReadMode.UNCHANGED`` for loading the image as-is, - ``ImageReadMode.GRAY`` for converting to grayscale, - ``ImageReadMode.GRAY_ALPHA`` for grayscale with transparency, - ``ImageReadMode.RGB`` for RGB and ``ImageReadMode.RGB_ALPHA`` for - RGB with transparency. + The different available modes are the following. + + - UNCHANGED: loads the image as-is + - RGB: converts to RGB + - RGBA: converts to RGB with transparency (also aliased as RGB_ALPHA) + - GRAY: converts to grayscale + - GRAY_ALPHA: converts to grayscale with transparency .. note:: - Some decoders won't support all possible values, e.g. a decoder may only - support "RGB" and "RGBA" mode. + Some decoders won't support all possible values, e.g. 
GRAY and + GRAY_ALPHA are only supported for PNG and JPEG images. """ UNCHANGED = 0 @@ -45,8 +51,7 @@ class ImageReadMode(Enum): def read_file(path: str) -> torch.Tensor: """ - Reads and outputs the bytes contents of a file as a uint8 Tensor - with one dimension. + Return the bytes contents of a file as a uint8 1D Tensor. Args: path (str or ``pathlib.Path``): the path to the file to be read @@ -62,8 +67,7 @@ def read_file(path: str) -> torch.Tensor: def write_file(filename: str, data: torch.Tensor) -> None: """ - Writes the contents of an uint8 tensor with one dimension to a - file. + Write the content of an uint8 1D tensor to a file. Args: filename (str or ``pathlib.Path``): the path to the file to be written @@ -93,10 +97,9 @@ def decode_png( Args: input (Tensor[1]): a one dimensional uint8 tensor containing the raw bytes of the PNG image. - mode (str or ImageReadMode): the read mode used for optionally - converting the image. Default: ``ImageReadMode.UNCHANGED``. - See `ImageReadMode` class for more information on various - available modes. + mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB". + Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode` + for available modes. apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor. Default: False. @@ -156,8 +159,7 @@ def decode_jpeg( device: Union[str, torch.device] = "cpu", apply_exif_orientation: bool = False, ) -> Union[torch.Tensor, List[torch.Tensor]]: - """ - Decode JPEG image(s) into 3 dimensional RGB or grayscale Tensor(s). + """Decode JPEG image(s) into 3D RGB or grayscale Tensor(s), on CPU or CUDA. The values of the output tensor are uint8 between 0 and 255. @@ -171,12 +173,9 @@ def decode_jpeg( input (Tensor[1] or list[Tensor[1]]): a (list of) one dimensional uint8 tensor(s) containing the raw bytes of the JPEG image. The tensor(s) must be on CPU, regardless of the ``device`` parameter. 
- mode (str or ImageReadMode): the read mode used for optionally - converting the image(s). The supported modes are: ``ImageReadMode.UNCHANGED``, - ``ImageReadMode.GRAY`` and ``ImageReadMode.RGB`` - Default: ``ImageReadMode.UNCHANGED``. - See ``ImageReadMode`` class for more information on various - available modes. + mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB". + Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode` + for available modes. device (str or torch.device): The device on which the decoded image will be stored. If a cuda device is specified, the image will be decoded with `nvjpeg <https://developer.nvidia.com/nvjpeg>`_. This is only @@ -228,9 +227,7 @@ def decode_jpeg( def encode_jpeg( input: Union[torch.Tensor, List[torch.Tensor]], quality: int = 75 ) -> Union[torch.Tensor, List[torch.Tensor]]: - """ - Takes a (list of) input tensor(s) in CHW layout and returns a (list of) buffer(s) with the contents - of the corresponding JPEG file(s). + """Encode RGB tensor(s) into raw encoded jpeg bytes, on CPU or CUDA. .. note:: Passing a list of CUDA tensors is more efficient than repeated individual calls to ``encode_jpeg``. @@ -286,7 +283,7 @@ def decode_image( mode: ImageReadMode = ImageReadMode.UNCHANGED, apply_exif_orientation: bool = False, ) -> torch.Tensor: - """Decode an image into a tensor. + """Decode an image into a uint8 tensor, from a path or from raw encoded bytes. Currently supported image formats are jpeg, png, gif and webp. @@ -303,10 +300,9 @@ def decode_image( input (Tensor or str or ``pathlib.Path``): The image to decode. If a tensor is passed, it must be one dimensional uint8 tensor containing the raw bytes of the image. Otherwise, this must be a path to the image file. - mode (str or ImageReadMode): the read mode used for optionally converting the image. - Default: ``ImageReadMode.UNCHANGED``. - See ``ImageReadMode`` class for more information on various - available modes. Only applies to JPEG and PNG images. 
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB". + Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode` + for available modes. apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor. Only applies to JPEG and PNG images. Default: False. @@ -367,9 +363,9 @@ def decode_webp( Args: input (Tensor[1]): a one dimensional contiguous uint8 tensor containing the raw bytes of the WEBP image. - mode (str or ImageReadMode): The read mode used for optionally - converting the image color space. Default: ``ImageReadMode.UNCHANGED``. - Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``. + mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB". + Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode` + for available modes. Returns: Decoded image (Tensor[image_channels, image_height, image_width]) @@ -398,9 +394,9 @@ def _decode_avif( Args: input (Tensor[1]): a one dimensional contiguous uint8 tensor containing the raw bytes of the AVIF image. - mode (str or ImageReadMode): The read mode used for optionally - converting the image color space. Default: ``ImageReadMode.UNCHANGED``. - Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``. + mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB". + Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode` + for available modes. Returns: Decoded image (Tensor[image_channels, image_height, image_width]) @@ -426,9 +422,9 @@ def _decode_heic(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHAN Args: input (Tensor[1]): a one dimensional contiguous uint8 tensor containing the raw bytes of the HEIC image. - mode (str or ImageReadMode): The read mode used for optionally - converting the image color space. Default: ``ImageReadMode.UNCHANGED``. - Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``. 
+ mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB". + Default is "UNCHANGED". See :class:`~torchvision.io.ImageReadMode` + for available modes. Returns: Decoded image (Tensor[image_channels, image_height, image_width]) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index c8f7d2ebde2..73c97f37e29 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -64,6 +64,14 @@ def write_video( """ Writes a 4d tensor in [T, H, W, C] format in a video file + .. warning:: + + In the near future, we intend to centralize PyTorch's video decoding + capabilities within the `torchcodec + <https://github.com/pytorch/torchcodec>`_ project. We encourage you to + try it out and share your feedback, as the torchvision video decoders + will eventually be deprecated. + Args: filename (str): path where the video will be saved video_array (Tensor[T, H, W, C]): tensor containing the individual frames, @@ -243,6 +251,14 @@ def read_video( """ Reads a video from a file, returning both the video frames and the audio frames + .. warning:: + + In the near future, we intend to centralize PyTorch's video decoding + capabilities within the `torchcodec + <https://github.com/pytorch/torchcodec>`_ project. We encourage you to + try it out and share your feedback, as the torchvision video decoders + will eventually be deprecated. + Args: filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts. start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional): @@ -367,6 +383,14 @@ def read_video_timestamps(filename: str, pts_unit: str = "pts") -> Tuple[List[in """ List the video frames timestamps. + .. warning:: + + In the near future, we intend to centralize PyTorch's video decoding + capabilities within the `torchcodec + <https://github.com/pytorch/torchcodec>`_ project. We encourage you to + try it out and share your feedback, as the torchvision video decoders + will eventually be deprecated. + Note that the function decodes the whole video frame-by-frame. 
Args: diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py index 505909fd984..cf319fe288e 100644 --- a/torchvision/io/video_reader.py +++ b/torchvision/io/video_reader.py @@ -52,6 +52,14 @@ class VideoReader: backends: video_reader, pyav, and cuda. Backends can be set via `torchvision.set_video_backend` function. + .. warning:: + + In the near future, we intend to centralize PyTorch's video decoding + capabilities within the `torchcodec + <https://github.com/pytorch/torchcodec>`_ project. We encourage you to + try it out and share your feedback, as the torchvision video decoders + will eventually be deprecated. + .. betastatus:: VideoReader class Example: