Merge branch 'main' into aeojnlajnef
pmeier authored Aug 2, 2023
2 parents 4b02136 + cab9fba commit a7383a6
Showing 48 changed files with 3,478 additions and 1,577 deletions.
.github/scripts/run-clang-format.py (2 changes: 1 addition & 1 deletion)

@@ -48,7 +48,7 @@
 DEVNULL = open(os.devnull, "wb")


-DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu"
+DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu,mm"


 class ExitStatus:
CMakeLists.txt (9 changes: 9 additions & 0 deletions)

@@ -4,6 +4,7 @@ set(CMAKE_CXX_STANDARD 17)
 file(STRINGS version.txt TORCHVISION_VERSION)

 option(WITH_CUDA "Enable CUDA support" OFF)
+option(WITH_MPS "Enable MPS support" OFF)
 option(WITH_PNG "Enable features requiring LibPNG." ON)
 option(WITH_JPEG "Enable features requiring LibJPEG." ON)
 option(USE_PYTHON "Link to Python when building" OFF)
@@ -15,6 +16,11 @@ if(WITH_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 endif()

+if(WITH_MPS)
+  enable_language(OBJC OBJCXX)
+  add_definitions(-DWITH_MPS)
+endif()
+
 find_package(Torch REQUIRED)

 if (WITH_PNG)
@@ -79,6 +85,9 @@ list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCP
 if(WITH_CUDA)
   list(APPEND ALLOW_LISTED ${TVCPP}/ops/cuda ${TVCPP}/ops/autocast)
 endif()
+if(WITH_MPS)
+  list(APPEND ALLOW_LISTED ${TVCPP}/ops/mps)
+endif()

 FOREACH(DIR ${ALLOW_LISTED})
   file(GLOB ALL_SOURCES ${ALL_SOURCES} ${DIR}/*.*)
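With the new WITH_MPS option, torchvision's C++ ops under ops/mps can be dispatched to Apple-silicon GPUs. A minimal sketch of exercising an op on the MPS backend from Python, assuming a build configured with -DWITH_MPS=ON on an MPS-capable machine (nms here is purely illustrative, not a claim about which kernels this commit adds):

import torch
from torchvision.ops import nms

# Prefer the MPS backend when available; otherwise fall back to CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

boxes = torch.tensor(
    [[0.0, 0.0, 10.0, 10.0],
     [1.0, 1.0, 11.0, 11.0],
     [20.0, 20.0, 30.0, 30.0]],
    device=device,
)
scores = torch.tensor([0.9, 0.8, 0.7], device=device)

keep = nms(boxes, scores, iou_threshold=0.5)
print(keep)  # indices of the boxes that survive suppression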
gallery/plot_cutmix_mixup.py (10 changes: 5 additions & 5 deletions)

@@ -4,8 +4,8 @@
 How to use CutMix and MixUp
 ===========================
-:class:`~torchvision.transforms.v2.Cutmix` and
-:class:`~torchvision.transforms.v2.Mixup` are popular augmentation strategies
+:class:`~torchvision.transforms.v2.CutMix` and
+:class:`~torchvision.transforms.v2.MixUp` are popular augmentation strategies
 that can improve classification accuracy.
 These transforms are slightly different from the rest of the Torchvision
@@ -79,8 +79,8 @@
 dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

-cutmix = v2.Cutmix(num_classes=NUM_CLASSES)
-mixup = v2.Mixup(num_classes=NUM_CLASSES)
+cutmix = v2.CutMix(num_classes=NUM_CLASSES)
+mixup = v2.MixUp(num_classes=NUM_CLASSES)
 cutmix_or_mixup = v2.RandomChoice([cutmix, mixup])

 for images, labels in dataloader:
@@ -148,5 +148,5 @@ def labels_getter(batch):
     return batch["target"]["classes"]


-out = v2.Cutmix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch)
+out = v2.CutMix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch)
 print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }")
gallery/plot_datapoints.py (18 changes: 9 additions & 9 deletions)

@@ -23,7 +23,7 @@
 from torchvision.transforms.v2 import functional as F


-########################################################################################################################
+# %%
 # What are datapoints?
 # --------------------
 #
@@ -36,7 +36,7 @@
 assert image.data_ptr() == tensor.data_ptr()


-########################################################################################################################
+# %%
 # Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
 # for the input data.
 #
@@ -59,22 +59,22 @@
 print(image)


-########################################################################################################################
+# %%
 # Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
 # parameters.

 float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True)
 print(float_image)


-########################################################################################################################
+# %%
 # In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a
 # :class:`PIL.Image.Image` directly:

 image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg"))
 print(image.shape, image.dtype)

-########################################################################################################################
+# %%
 # In general, the datapoints can also store additional metadata that complements the underlying tensor. For example,
 # :class:`~torchvision.datapoints.BoundingBoxes` stores the coordinate format as well as the spatial size of the
 # corresponding image alongside the actual values:
@@ -85,7 +85,7 @@
 print(bounding_box)


-########################################################################################################################
+# %%
 # Do I have to wrap the output of the datasets myself?
 # ----------------------------------------------------
 #
@@ -120,7 +120,7 @@ def __getitem__(self, item):

         ...

-########################################################################################################################
+# %%
 # 2. Perform the wrapping inside a custom transformation at the beginning of your pipeline:


@@ -144,7 +144,7 @@ def get_transform(train):
     transforms.append(T.PILToTensor())
     ...

-########################################################################################################################
+# %%
 # .. note::
 #
 #    If both :class:`~torchvision.datapoints.BoundingBoxes`'es and :class:`~torchvision.datapoints.Mask`'s are included in
@@ -171,7 +171,7 @@ def get_transform(train):

 assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image)

-########################################################################################################################
+# %%
 # .. note::
 #
 #    This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you
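As background for the cells touched above, the datapoint wrappers behave like tensors. A minimal sketch of the two constructions the diff context shows, assuming the 0.16-era torchvision.datapoints namespace (toy values are illustrative):

import torch
from torchvision import datapoints

# Wrapping a tensor is zero-copy: the Image shares the tensor's storage.
tensor = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)
image = datapoints.Image(tensor)
assert image.data_ptr() == tensor.data_ptr()

# Like torch creation ops, the constructor accepts dtype/device/requires_grad.
float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True)
print(float_image.shape, float_image.dtype)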
gallery/plot_optical_flow.py (16 changes: 8 additions & 8 deletions)

@@ -42,7 +42,7 @@ def plot(imgs, **imshow_kwargs):

     plt.tight_layout()

-###################################
+# %%
 # Reading Videos Using Torchvision
 # --------------------------------
 # We will first read a video using :func:`~torchvision.io.read_video`.
@@ -62,7 +62,7 @@ def plot(imgs, **imshow_kwargs):
 video_path = Path(tempfile.mkdtemp()) / "basketball.mp4"
 _ = urlretrieve(video_url, video_path)

-#########################
+# %%
 # :func:`~torchvision.io.read_video` returns the video frames, audio frames and
 # the metadata associated with the video. In our case, we only need the video
 # frames.
@@ -79,7 +79,7 @@ def plot(imgs, **imshow_kwargs):

 plot(img1_batch)

-#########################
+# %%
 # The RAFT model accepts RGB images. We first get the frames from
 # :func:`~torchvision.io.read_video` and resize them to ensure their dimensions
 # are divisible by 8. Note that we explicitly use ``antialias=False``, because
@@ -104,7 +104,7 @@ def preprocess(img1_batch, img2_batch):
 print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}")


-####################################
+# %%
 # Estimating Optical flow using RAFT
 # ----------------------------------
 # We will use our RAFT implementation from
@@ -125,7 +125,7 @@ def preprocess(img1_batch, img2_batch):
 print(f"type = {type(list_of_flows)}")
 print(f"length = {len(list_of_flows)} = number of iterations of the model")

-####################################
+# %%
 # The RAFT model outputs lists of predicted flows where each entry is a
 # (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration"
 # in the model. For more details on the iterative nature of the model, please
@@ -144,7 +144,7 @@ def preprocess(img1_batch, img2_batch):
 print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}")

-####################################
+# %%
 # Visualizing predicted flows
 # ---------------------------
 # Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to
@@ -166,7 +166,7 @@ def preprocess(img1_batch, img2_batch):
 grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)]
 plot(grid)

-####################################
+# %%
 # Bonus: Creating GIFs of predicted flows
 # ---------------------------------------
 # In the example above we have only shown the predicted flows of 2 pairs of
@@ -187,7 +187,7 @@
 #     output_folder = "/tmp/"  # Update this to the folder of your choice
 #     write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg")

-####################################
+# %%
 # Once the .jpg flow images are saved, you can convert them into a video or a
 # GIF using ffmpeg with e.g.:
 #
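For context on the RAFT cells above, a condensed inference sketch, assuming torchvision's optical-flow models are available and using random frames in place of real video (RAFT requires height and width divisible by 8):

import torch
from torchvision.models.optical_flow import Raft_Large_Weights, raft_large

weights = Raft_Large_Weights.DEFAULT
model = raft_large(weights=weights).eval()

# Two fake batches of consecutive frames; sizes must be divisible by 8.
img1_batch = torch.rand(2, 3, 224, 224)
img2_batch = torch.rand(2, 3, 224, 224)
img1_batch, img2_batch = weights.transforms()(img1_batch, img2_batch)

with torch.no_grad():
    list_of_flows = model(img1_batch, img2_batch)  # one (N, 2, H, W) tensor per iteration

predicted_flows = list_of_flows[-1]  # the final iteration is the most accurate estimate
print(predicted_flows.shape)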
gallery/plot_repurposing_annotations.py (20 changes: 10 additions & 10 deletions)

@@ -36,7 +36,7 @@ def show(imgs):
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])


-####################################
+# %%
 # Masks
 # -----
 # In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package,
@@ -53,7 +53,7 @@ def show(imgs):
 # A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object
 # localization tasks.

-####################################
+# %%
 # Converting Masks to Bounding Boxes
 # -----------------------------------------------
 # For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to
@@ -70,7 +70,7 @@ def show(imgs):
 mask = read_image(mask_path)


-#########################
+# %%
 # Here the masks are represented as a PNG Image, with floating point values.
 # Each pixel is encoded as different colors, with 0 being background.
 # Notice that the spatial dimensions of image and mask match.
@@ -79,7 +79,7 @@ def show(imgs):
 print(img.size())
 print(mask)

-############################
+# %%

 # We get the unique colors, as these would be the object ids.
 obj_ids = torch.unique(mask)
@@ -91,7 +91,7 @@ def show(imgs):
 # Note that this snippet would work as well if the masks were float values instead of ints.
 masks = mask == obj_ids[:, None, None]

-########################
+# %%
 # Now the masks are a boolean tensor.
 # The first dimension in this case is 3 and denotes the number of instances: there are 3 people in the image.
 # The other two dimensions are height and width, which are equal to the dimensions of the image.
@@ -101,7 +101,7 @@ def show(imgs):
 print(masks.size())
 print(masks)

-####################################
+# %%
 # Let us visualize an image and plot its corresponding segmentation masks.
 # We will use the :func:`~torchvision.utils.draw_segmentation_masks` to draw the segmentation masks.

@@ -113,7 +113,7 @@ def show(imgs):

 show(drawn_masks)

-####################################
+# %%
 # To convert the boolean masks into bounding boxes, we will use the
 # :func:`~torchvision.ops.masks_to_boxes` function from the torchvision.ops module.
 # It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format.
@@ -124,7 +124,7 @@ def show(imgs):
 print(boxes.size())
 print(boxes)

-####################################
+# %%
 # As the shape denotes, there are 3 boxes, in ``(xmin, ymin, xmax, ymax)`` format.
 # These can be visualized very easily with the :func:`~torchvision.utils.draw_bounding_boxes` utility
 # provided in :ref:`torchvision.utils <utils>`.
@@ -134,7 +134,7 @@ def show(imgs):
 drawn_boxes = draw_bounding_boxes(img, boxes, colors="red")
 show(drawn_boxes)

-###################################
+# %%
 # These boxes can now directly be used by detection models in torchvision.
 # Here is a demo with a Faster R-CNN model loaded from
 # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
@@ -153,7 +153,7 @@ def show(imgs):
 detection_outputs = model(img.unsqueeze(0), [target])


-####################################
+# %%
 # Converting Segmentation Dataset to Detection Dataset
 # ----------------------------------------------------
 #
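A condensed sketch of the masks-to-boxes conversion these cells demonstrate, assuming boolean instance masks of shape (N, H, W) (the toy masks below are illustrative):

import torch
from torchvision.ops import masks_to_boxes

# Two fake boolean instance masks on an 8x8 canvas.
masks = torch.zeros(2, 8, 8, dtype=torch.bool)
masks[0, 1:4, 1:4] = True
masks[1, 5:8, 2:7] = True

boxes = masks_to_boxes(masks)
print(boxes)  # one (xmin, ymin, xmax, ymax) box per instance mask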
gallery/plot_scripted_tensor_transforms.py (12 changes: 6 additions & 6 deletions)

@@ -45,15 +45,15 @@ def show(imgs):
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])


-####################################
+# %%
 # The :func:`~torchvision.io.read_image` function lets you read an image and
 # directly load it as a tensor

 dog1 = read_image(str(Path('assets') / 'dog1.jpg'))
 dog2 = read_image(str(Path('assets') / 'dog2.jpg'))
 show([dog1, dog2])

-####################################
+# %%
 # Transforming images on GPU
 # --------------------------
 # Most transforms natively support tensors on top of PIL images (to visualize
@@ -76,7 +76,7 @@ def show(imgs):
 transformed_dog2 = transforms(dog2)
 show([transformed_dog1, transformed_dog2])

-####################################
+# %%
 # Scriptable transforms for easier deployment via torchscript
 # -----------------------------------------------------------
 # We now show how to combine image transformations and a model forward pass,
@@ -103,7 +103,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return y_pred.argmax(dim=1)


-####################################
+# %%
 # Now, let's define scripted and non-scripted instances of ``Predictor`` and
 # apply them to multiple tensor images of the same size

@@ -115,7 +115,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 res = predictor(batch)
 res_scripted = scripted_predictor(batch)

-####################################
+# %%
 # We can verify that the predictions of the scripted and non-scripted models are
 # the same:

@@ -128,7 +128,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
     assert pred == pred_scripted
     print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}")

-####################################
+# %%
 # Since the model is scripted, it can be easily dumped on disk and re-used

 import tempfile
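A minimal sketch of the scripting-and-reloading workflow these cells cover, assuming any nn.Module whose forward is TorchScript-compatible (the tiny module below is a stand-in for the tutorial's Predictor):

import tempfile
from pathlib import Path

import torch
import torch.nn as nn


class Doubler(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2.0


scripted = torch.jit.script(Doubler())

# A scripted module serializes to disk and reloads without the Python class.
path = Path(tempfile.mkdtemp()) / "doubler.pt"
scripted.save(str(path))
reloaded = torch.jit.load(str(path))
assert torch.equal(reloaded(torch.ones(2)), torch.tensor([2.0, 2.0]))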