
Commit

Merge branch 'main' into laefjnlajnflajenf
NicolasHug authored Aug 23, 2023
2 parents 2334e8d + 6f72b76 commit 156228e
Showing 9 changed files with 285 additions and 161 deletions.
1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -83,6 +83,7 @@
"backreferences_dir": "gen_modules/backreferences",
"doc_module": ("torchvision",),
"remove_config_comments": True,
"ignore_pattern": "helpers.py",
}

napoleon_use_ivar = True
18 changes: 12 additions & 6 deletions docs/source/transforms.rst
@@ -45,13 +45,17 @@ tasks (image classification, detection, segmentation, video classification).
Transforms are typically passed as the ``transform`` or ``transforms`` argument
to the :ref:`Datasets <datasets>`.
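
For instance, a v2 pipeline can be passed to a built-in dataset via its
``transform`` argument (a minimal sketch; the dataset and the exact transforms
below are only placeholders)::

    from torchvision import datasets
    from torchvision.transforms import v2

    transform = v2.Compose([
        v2.RandomResizedCrop(size=(224, 224), antialias=True),
        v2.RandomHorizontalFlip(p=0.5),
        v2.PILToTensor(),
    ])
    # The dataset applies the pipeline to every sample it returns.
    dataset = datasets.CIFAR10(root="data", download=True, transform=transform)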

.. TODO: Reader guide, i.e. what to read depending on what you're looking for
.. TODO: add link to getting started guide here.
.. _conventions:

Supported input types and conventions
-------------------------------------

Most transformations accept both `PIL <https://pillow.readthedocs.io>`_ images
and tensor images. The result of both backends (PIL or Tensors) should be very
and tensor inputs. Both CPU and CUDA tensors are supported.
The result of both backends (PIL or Tensors) should be very
close. In general, we recommend relying on the tensor backend :ref:`for
performance <transforms_perf>`. The :ref:`conversion transforms
<conversion_transforms>` may be used to convert to and from PIL images, or for
@@ -152,13 +156,15 @@ The above should give you the best performance in a typical training environment
that relies on the :class:`torch.utils.data.DataLoader` with ``num_workers >
0``.
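
For instance (a sketch; ``dataset`` stands for any dataset that applies your
transforms)::

    from torch.utils.data import DataLoader

    # Worker processes run the transforms in parallel with the training loop.
    loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)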

Transforms tend to be sensitive to the input strides / memory layout. Some
Transforms tend to be sensitive to the input strides / memory format. Some
transforms will be faster with channels-first images while others prefer
channels-last. You may want to experiment a bit if you're chasing the very
best performance. Using :func:`torch.compile` on individual transforms may
also help factoring out the memory layout variable (e.g. on
channels-last. Like ``torch`` operators, most transforms will preserve the
memory format of the input, but this may not always be respected due to
implementation details. You may want to experiment a bit if you're chasing the
very best performance. Using :func:`torch.compile` on individual transforms may
also help factoring out the memory format variable (e.g. on
:class:`~torchvision.transforms.v2.Normalize`). Note that we're talking about
**memory layout**, not tensor shape.
**memory format**, not :ref:`tensor shape <conventions>`.
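
A rough sketch of this kind of experiment (the mean/std values and shapes below
are placeholders)::

    import torch
    from torchvision.transforms import v2

    normalize = v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    compiled_normalize = torch.compile(normalize)

    batch = torch.rand(8, 3, 224, 224)
    # Same data, channels-last memory format; the shape stays (N, C, H, W).
    batch_cl = batch.to(memory_format=torch.channels_last)

    out = compiled_normalize(batch)
    out_cl = compiled_normalize(batch_cl)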

Note that resize transforms like :class:`~torchvision.transforms.v2.Resize`
and :class:`~torchvision.transforms.v2.RandomResizedCrop` typically prefer
33 changes: 33 additions & 0 deletions gallery/v2_transforms/helpers.py
@@ -0,0 +1,33 @@
import matplotlib.pyplot as plt
from torchvision.utils import draw_bounding_boxes


def plot(imgs):
    if not isinstance(imgs[0], list):
        # Make a 2d grid even if there's just 1 row
        imgs = [imgs]

    num_rows = len(imgs)
    num_cols = len(imgs[0])
    _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
    for row_idx, row in enumerate(imgs):
        for col_idx, img in enumerate(row):
            bboxes = None
            if isinstance(img, tuple):
                bboxes = img[1]
                img = img[0]
                if isinstance(bboxes, dict):
                    bboxes = bboxes['bboxes']
            if img.dtype.is_floating_point and img.min() < 0:
                # Poor man's re-normalization for the colors to be OK-ish. This
                # is useful for images coming out of Normalize()
                img -= img.min()
                img /= img.max()

            if bboxes is not None:
                img = draw_bounding_boxes(img, bboxes, colors="yellow", width=3)
            ax = axs[row_idx, col_idx]
            ax.imshow(img.permute(1, 2, 0).numpy())
            ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

    plt.tight_layout()
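
A rough usage sketch for this helper (``bboxes`` below is a hypothetical
``BoundingBoxes`` instance and the image path is a placeholder): each entry can
be a plain image tensor or an ``(image, boxes)`` tuple, and nesting lists
produces a grid with one row per inner list.

    from torchvision.io import read_image
    from helpers import plot

    img = read_image("example.jpg")      # placeholder path
    plot([img, (img, bboxes)])           # one row: raw image, image with boxes drawn
    plot([[img], [(img, bboxes)]])       # two rows, one image each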
74 changes: 0 additions & 74 deletions gallery/v2_transforms/plot_datapoints.py
@@ -28,7 +28,6 @@

import torch
from torchvision import datapoints
from torchvision.transforms.v2 import functional as F


# %%
@@ -119,83 +118,10 @@
assert isinstance(new_bboxes, datapoints.BoundingBoxes)
assert new_bboxes.canvas_size == bboxes.canvas_size


# %%
# The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass
# it as a parameter to override it.
#
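# For instance (a sketch; the format and canvas size below are arbitrary),
# constructing a new ``BoundingBoxes`` lets you set the metadata explicitly:

new_bboxes = datapoints.BoundingBoxes(
    new_bboxes,
    format=datapoints.BoundingBoxFormat.XYXY,
    canvas_size=(224, 224),
)
assert new_bboxes.canvas_size == (224, 224)

# %%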
# Do I have to wrap the output of the datasets myself?
# ----------------------------------------------------
#
# TODO: Move this in another guide - this is user-facing, not dev-facing.
#
# Only if you are using custom datasets. For the built-in ones, you can use
# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2`. Note that this function also supports subclasses of the
# built-in datasets. This means that if your custom dataset subclasses a built-in one and the output type is the same,
# you don't have to wrap it manually either.
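#
# For a built-in dataset this is a one-liner (a sketch; the paths below are
# placeholders for a COCO-style dataset on disk):

from torchvision import datasets

coco = datasets.CocoDetection("path/to/images", "path/to/annotations.json")
coco = datasets.wrap_dataset_for_transforms_v2(coco)

# %%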
#
# If you have a custom dataset, for example the ``PennFudanDataset`` from
# `this tutorial <https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html>`_, you have two options:
#
# 1. Perform the wrapping inside ``__getitem__``:

class PennFudanDataset(torch.utils.data.Dataset):
    ...

    def __getitem__(self, item):
        ...

        target["bboxes"] = datapoints.BoundingBoxes(
            bboxes,
            format=datapoints.BoundingBoxFormat.XYXY,
            canvas_size=F.get_size(img),
        )
        target["labels"] = labels
        target["masks"] = datapoints.Mask(masks)

        ...

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        ...

# %%
# 2. Perform the wrapping inside a custom transformation at the beginning of your pipeline:


class WrapPennFudanDataset:
    def __call__(self, img, target):
        target["boxes"] = datapoints.BoundingBoxes(
            target["boxes"],
            format=datapoints.BoundingBoxFormat.XYXY,
            canvas_size=F.get_size(img),
        )
        target["masks"] = datapoints.Mask(target["masks"])
        return img, target


...


def get_transform(train):
    transforms = []
    transforms.append(WrapPennFudanDataset())
    transforms.append(T.PILToTensor())
    ...

# %%
# .. note::
#
#    If both :class:`~torchvision.datapoints.BoundingBoxes` and :class:`~torchvision.datapoints.Mask` objects are
#    included in the sample, ``torchvision.transforms.v2`` will transform them both. This means that if you don't need
#    both, dropping or at least not wrapping the unneeded parts can lead to a significant performance boost.
#
#    For example, if you are using the ``PennFudanDataset`` for object detection, not wrapping the masks avoids
#    transforming them over and over again in the pipeline just to ultimately ignore them. In general, it would be
#    even better not to load the masks at all, but this is not possible in this example, since the bounding boxes are
#    generated from the masks.
#
# .. _datapoint_unwrapping_behaviour:
#
# I had a Datapoint but now I have a Tensor. Help!

