diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index f4c8055b0e8..196d806907d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -118,5 +118,6 @@ jobs: git config user.name 'pytorchbot' git config user.email 'soumith+bot@pytorch.org' + git config http.postBuffer 524288000 git commit -m "auto-generating sphinx docs" || true git push diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7c73e8f9266..57df2c27410 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -30,30 +30,49 @@ clear and has sufficient instructions to be able to reproduce the issue. ## Development installation -### Install PyTorch Nightly + +### Dependencies + +Start by installing the **nightly** build of PyTorch following the [official +instructions](https://pytorch.org/get-started/locally/). + +**Optionally**, install `libpng` and `libjpeg-turbo` if you want to enable +support for +native encoding / decoding of PNG and JPEG formats in +[torchvision.io](https://pytorch.org/vision/stable/io.html#image): ```bash -conda install pytorch -c pytorch-nightly -# or with pip (see https://pytorch.org/get-started/locally/) -# pip install numpy -# pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html +conda install libpng libjpeg-turbo -c pytorch ``` -### Install Torchvision +Note: you can use the `TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY` +environment variables to tell the build system where to find those libraries if +they are in specific locations. Take a look at +[setup.py](https://github.com/pytorch/vision/blob/main/setup.py) for more +details. + +### Clone and install torchvision ```bash git clone https://github.com/pytorch/vision.git cd vision -python setup.py develop +python setup.py develop # use install instead of develop if you don't care about development. # or, for OSX # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py develop -# for C++ debugging, please use DEBUG=1 +# for C++ debugging, use DEBUG=1 # DEBUG=1 python setup.py develop -pip install flake8 typing mypy pytest pytest-mock scipy ``` -You may also have to install `libpng-dev` and `libjpeg-turbo8-dev` libraries: -```bash -conda install libpng jpeg + +By default, GPU support is built if CUDA is found and `torch.cuda.is_available()` is true. It's possible to force +building GPU support by setting `FORCE_CUDA=1` environment variable, which is useful when building a docker image. + +We don't officially support building from source using `pip`, but _if_ you do, you'll need to use the +`--no-build-isolation` flag. + +Other development dependencies include: + +``` +pip install flake8 typing mypy pytest pytest-mock scipy ``` ## Development Process @@ -192,7 +211,7 @@ Please refer to the guidelines in [Contributing to Torchvision - Models](https:/ ### New dataset -More details on how to add a new dataset will be provided later. Please, do not send any PR with a new dataset without discussing +Please, do not send any PR with a new dataset without discussing it in an issue as, most likely, it will not be accepted. ### Pull Request diff --git a/README.md b/README.md index e9ceb664ae2..dd29290a032 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,14 @@ vision. ## Installation -We recommend Anaconda as Python package management system. Please refer to [pytorch.org](https://pytorch.org/) for the -detail of PyTorch (`torch`) installation. 
The following is the corresponding `torchvision` versions and supported Python
+Please refer to the [official
+instructions](https://pytorch.org/get-started/locally/) to install the stable
+versions of `torch` and `torchvision` on your system.
+
+To build from source, refer to our [contributing
+page](https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation).
+
+The following table lists the corresponding `torchvision` versions and supported Python
 versions.
 
 | `torch` | `torchvision` | Python |
@@ -39,54 +45,18 @@ versions.
 
-Anaconda:
-
-```
-conda install torchvision -c pytorch
-```
-
-pip:
-
-```
-pip install torchvision
-```
+## Image Backends
 
-From source:
-
-```
-python setup.py install
-# or, for OSX
-# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
-```
-
-We don't officially support building from source using `pip`, but _if_ you do, you'll need to use the
-`--no-build-isolation` flag. In case building TorchVision from source fails, install the nightly version of PyTorch
-following the linked guide on the
-[contributing page](https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation) and retry the
-install.
-
-By default, GPU support is built if CUDA is found and `torch.cuda.is_available()` is true. It's possible to force
-building GPU support by setting `FORCE_CUDA=1` environment variable, which is useful when building a docker image.
+Torchvision currently supports the following image backends:
 
-## Image Backend
+- torch tensors
+- PIL images:
+  - [Pillow](https://python-pillow.org/)
+  - [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) - a **much faster** drop-in replacement for Pillow with SIMD.
 
-Torchvision currently supports the following image backends:
+Read more in our [docs](https://pytorch.org/vision/stable/transforms.html).
 
-- [Pillow](https://python-pillow.org/) (default)
-- [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) - a **much faster** drop-in replacement for Pillow with SIMD.
-  If installed will be used as the default.
-- [accimage](https://github.com/pytorch/accimage) - if installed can be activated by calling
-  `torchvision.set_image_backend('accimage')`
-- [libpng](http://www.libpng.org/pub/png/libpng.html) - can be installed via conda `conda install libpng` or any of the
-  package managers for debian-based and RHEL-based Linux distributions.
-- [libjpeg](http://ijg.org/) - can be installed via conda `conda install jpeg` or any of the package managers for
-  debian-based and RHEL-based Linux distributions. [libjpeg-turbo](https://libjpeg-turbo.org/) can be used as well.
-
-**Notes:** `libpng` and `libjpeg` must be available at compilation time in order to be available. Make sure that it is
-available on the standard library locations, otherwise, add the include and library paths in the environment variables
-`TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY`, respectively.
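To make the backend list above concrete, here is a small sketch of the same v2 transform running on both a tensor image and a PIL image; the file path and the choice of `RandomResizedCrop` are placeholders for illustration, not part of the change itself:

```python
import PIL.Image
from torchvision.io import read_image
from torchvision.transforms import v2

# The same v2 pipeline accepts both backends and preserves the input type.
transform = v2.Compose([v2.RandomResizedCrop(size=(224, 224), antialias=True)])

tensor_image = read_image("example.jpg")    # placeholder path; uint8 tensor of shape (C, H, W)
pil_image = PIL.Image.open("example.jpg")   # same image decoded through Pillow

out_tensor = transform(tensor_image)        # torch.Tensor in, torch.Tensor out
out_pil = transform(pil_image)              # PIL.Image in, PIL.Image out
```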
- -## Video Backend +## [UNSTABLE] Video Backend Torchvision currently supports the following video backends: diff --git a/docs/source/conf.py b/docs/source/conf.py index fed3884ea27..4a331b6cd75 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,6 +29,7 @@ import pytorch_sphinx_theme import torchvision import torchvision.models as M +from sphinx_gallery.sorting import ExplicitOrder from tabulate import tabulate sys.path.append(os.path.abspath(".")) @@ -61,6 +62,7 @@ sphinx_gallery_conf = { "examples_dirs": "../../gallery/", # path to your example scripts "gallery_dirs": "auto_examples", # path to where to save gallery generated output + "subsection_order": ExplicitOrder(["../../gallery/v2_transforms", "../../gallery/others"]), "backreferences_dir": "gen_modules/backreferences", "doc_module": ("torchvision",), "remove_config_comments": True, diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst index 0599545f7f3..abaefef602d 100644 --- a/docs/source/datapoints.rst +++ b/docs/source/datapoints.rst @@ -6,7 +6,7 @@ Datapoints Datapoints are tensor subclasses which the :mod:`~torchvision.transforms.v2` v2 transforms use under the hood to dispatch their inputs to the appropriate lower-level kernels. Most users do not need to manipulate datapoints directly and can simply rely on dataset wrapping - -see e.g. :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. +see e.g. :ref:`sphx_glr_auto_examples_v2_transforms_plot_transforms_v2_e2e.py`. .. autosummary:: :toctree: generated/ diff --git a/docs/source/io.rst b/docs/source/io.rst index 258a1ee16dc..1da9bb6882a 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -1,11 +1,37 @@ -Reading/Writing images and videos -================================= +Decoding / Encoding images and videos +===================================== .. currentmodule:: torchvision.io The :mod:`torchvision.io` package provides functions for performing IO -operations. They are currently specific to reading and writing video and -images. +operations. They are currently specific to reading and writing images and +videos. + +Images +------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + read_image + decode_image + encode_jpeg + decode_jpeg + write_jpeg + encode_png + decode_png + write_png + read_file + write_file + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + ImageReadMode + + Video ----- @@ -20,7 +46,7 @@ Video Fine-grained video API ----------------------- +^^^^^^^^^^^^^^^^^^^^^^ In addition to the :mod:`read_video` function, we provide a high-performance lower-level API for more fine-grained control compared to the :mod:`read_video` function. @@ -61,28 +87,3 @@ Example of inspecting a video: # the constructor we select a default video stream, but # in practice, we can set whichever stream we would like video.set_current_stream("video:0") - - -Image ------ - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - ImageReadMode - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - read_image - decode_image - encode_jpeg - decode_jpeg - write_jpeg - encode_png - decode_png - write_png - read_file - write_file diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 6700395717f..812c17fdab8 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -13,7 +13,7 @@ Transforming and augmenting images are fully backward compatible with the current ones, and you'll see them documented below with a `v2.` prefix. 
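As a quick illustration of the decoding / encoding entry points that the reorganized `torchvision.io` page now groups under Images, a short sketch (the file names are placeholders):

```python
from torchvision.io import decode_image, encode_jpeg, read_file, read_image, write_png

# Decode an image file straight into a uint8 CHW tensor.
img = read_image("dog.jpg")                  # placeholder path

# read_file + decode_image is the two-step equivalent of read_image.
img_again = decode_image(read_file("dog.jpg"))

# encode_jpeg returns the encoded bytes as a 1D uint8 tensor;
# write_png encodes and writes to disk in one call.
jpeg_bytes = encode_jpeg(img, quality=90)
write_png(img, "dog.png")                    # placeholder output path
```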
To get started with those new transforms, you can check out - :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. + :ref:`sphx_glr_auto_examples_v2_transforms_plot_transforms_v2_e2e.py`. Note that these transforms are still BETA, and while we don't expect major breaking changes in the future, some APIs may still change according to user feedback. Please submit any feedback you may have `here @@ -54,15 +54,15 @@ across calls. For reproducible transformations across calls, you may use The following examples illustrate the use of the available transforms: - * :ref:`sphx_glr_auto_examples_plot_transforms.py` + * :ref:`sphx_glr_auto_examples_others_plot_transforms.py` - .. figure:: ../source/auto_examples/images/sphx_glr_plot_transforms_001.png + .. figure:: ../source/auto_examples/others/images/sphx_glr_plot_transforms_001.png :align: center :scale: 65% - * :ref:`sphx_glr_auto_examples_plot_scripted_tensor_transforms.py` + * :ref:`sphx_glr_auto_examples_others_plot_scripted_tensor_transforms.py` - .. figure:: ../source/auto_examples/images/sphx_glr_plot_scripted_tensor_transforms_001.png + .. figure:: ../source/auto_examples/others/images/sphx_glr_plot_scripted_tensor_transforms_001.png :align: center :scale: 30% @@ -237,6 +237,7 @@ Conversion v2.ConvertImageDtype v2.ToDtype v2.ConvertBoundingBoxFormat + v2.ToPureTensor Auto-Augmentation ----------------- @@ -268,7 +269,7 @@ CutMix and MixUp are special transforms that are meant to be used on batches rather than on individual images, because they are combining pairs of images together. These can be used after the dataloader (once the samples are batched), or part of a collation function. See -:ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage examples. +:ref:`sphx_glr_auto_examples_v2_transforms_plot_cutmix_mixup.py` for detailed usage examples. .. autosummary:: :toctree: generated/ diff --git a/docs/source/utils.rst b/docs/source/utils.rst index 971381a658f..cda04de900a 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -4,7 +4,7 @@ Utils ===== The ``torchvision.utils`` module contains various utilities, mostly :ref:`for -visualization `. +visualization `. .. 
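Since the CutMix / MixUp paragraph above describes the intended usage only in prose, here is a brief sketch of applying them right after the dataloader; the stand-in `TensorDataset`, batch size, and `NUM_CLASSES` are made up for illustration:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision.transforms import v2

NUM_CLASSES = 10  # illustrative value

# Stand-in data: a real training set would go here.
dataset = TensorDataset(torch.rand(32, 3, 224, 224), torch.randint(0, NUM_CLASSES, (32,)))
dataloader = DataLoader(dataset, batch_size=8)

cutmix_or_mixup = v2.RandomChoice([v2.CutMix(num_classes=NUM_CLASSES), v2.MixUp(num_classes=NUM_CLASSES)])

for images, labels in dataloader:
    # Applied on the whole batch, after collation; labels come out soft with shape (batch_size, NUM_CLASSES).
    images, labels = cutmix_or_mixup(images, labels)
```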
currentmodule:: torchvision.utils diff --git a/gallery/README.rst b/gallery/README.rst index 868afe74351..9a0838f493f 100644 --- a/gallery/README.rst +++ b/gallery/README.rst @@ -1,4 +1,2 @@ -Example gallery -=============== - -Below is a gallery of examples +Examples and tutorials +====================== diff --git a/gallery/others/README.rst b/gallery/others/README.rst new file mode 100644 index 00000000000..fafb007d985 --- /dev/null +++ b/gallery/others/README.rst @@ -0,0 +1,2 @@ +Others +------ diff --git a/gallery/plot_optical_flow.py b/gallery/others/plot_optical_flow.py similarity index 100% rename from gallery/plot_optical_flow.py rename to gallery/others/plot_optical_flow.py diff --git a/gallery/plot_repurposing_annotations.py b/gallery/others/plot_repurposing_annotations.py similarity index 99% rename from gallery/plot_repurposing_annotations.py rename to gallery/others/plot_repurposing_annotations.py index 99f75f03fc1..f47c301812b 100644 --- a/gallery/plot_repurposing_annotations.py +++ b/gallery/others/plot_repurposing_annotations.py @@ -20,7 +20,7 @@ import torchvision.transforms.functional as F -ASSETS_DIRECTORY = "assets" +ASSETS_DIRECTORY = "../assets" plt.rcParams["savefig.bbox"] = "tight" diff --git a/gallery/plot_scripted_tensor_transforms.py b/gallery/others/plot_scripted_tensor_transforms.py similarity index 94% rename from gallery/plot_scripted_tensor_transforms.py rename to gallery/others/plot_scripted_tensor_transforms.py index e803da7799e..5bf48d69f36 100644 --- a/gallery/plot_scripted_tensor_transforms.py +++ b/gallery/others/plot_scripted_tensor_transforms.py @@ -49,8 +49,8 @@ def show(imgs): # The :func:`~torchvision.io.read_image` function allows to read an image and # directly load it as a tensor -dog1 = read_image(str(Path('assets') / 'dog1.jpg')) -dog2 = read_image(str(Path('assets') / 'dog2.jpg')) +dog1 = read_image(str(Path('../assets') / 'dog1.jpg')) +dog2 = read_image(str(Path('../assets') / 'dog2.jpg')) show([dog1, dog2]) # %% @@ -58,7 +58,7 @@ def show(imgs): # -------------------------- # Most transforms natively support tensors on top of PIL images (to visualize # the effect of the transforms, you may refer to see -# :ref:`sphx_glr_auto_examples_plot_transforms.py`). +# :ref:`sphx_glr_auto_examples_others_plot_transforms.py`). # Using tensor images, we can run the transforms on GPUs if cuda is available! import torch.nn as nn @@ -121,7 +121,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: import json -with open(Path('assets') / 'imagenet_class_index.json') as labels_file: +with open(Path('../assets') / 'imagenet_class_index.json') as labels_file: labels = json.load(labels_file) for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): diff --git a/gallery/plot_transforms.py b/gallery/others/plot_transforms.py similarity index 99% rename from gallery/plot_transforms.py rename to gallery/others/plot_transforms.py index ac6e50a397e..2cb0e34693c 100644 --- a/gallery/plot_transforms.py +++ b/gallery/others/plot_transforms.py @@ -19,7 +19,7 @@ plt.rcParams["savefig.bbox"] = 'tight' -orig_img = Image.open(Path('assets') / 'astronaut.jpg') +orig_img = Image.open(Path('../assets') / 'astronaut.jpg') # if you change the seed, make sure that the randomly-applied transforms # properly show that the image can be both transformed and *not* transformed! 
torch.manual_seed(0) diff --git a/gallery/plot_video_api.py b/gallery/others/plot_video_api.py similarity index 100% rename from gallery/plot_video_api.py rename to gallery/others/plot_video_api.py diff --git a/gallery/plot_visualization_utils.py b/gallery/others/plot_visualization_utils.py similarity index 98% rename from gallery/plot_visualization_utils.py rename to gallery/others/plot_visualization_utils.py index 5e629cb8cb8..bb3d1c8bcfc 100644 --- a/gallery/plot_visualization_utils.py +++ b/gallery/others/plot_visualization_utils.py @@ -41,8 +41,8 @@ def show(imgs): from torchvision.io import read_image from pathlib import Path -dog1_int = read_image(str(Path('assets') / 'dog1.jpg')) -dog2_int = read_image(str(Path('assets') / 'dog2.jpg')) +dog1_int = read_image(str(Path('../assets') / 'dog1.jpg')) +dog2_int = read_image(str(Path('../assets') / 'dog2.jpg')) dog_list = [dog1_int, dog2_int] grid = make_grid(dog_list) @@ -360,7 +360,7 @@ def show(imgs): from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights from torchvision.io import read_image -person_int = read_image(str(Path("assets") / "person1.jpg")) +person_int = read_image(str(Path("../assets") / "person1.jpg")) weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT transforms = weights.transforms() diff --git a/gallery/v2_transforms/README.rst b/gallery/v2_transforms/README.rst new file mode 100644 index 00000000000..55a0893e847 --- /dev/null +++ b/gallery/v2_transforms/README.rst @@ -0,0 +1,2 @@ +V2 transforms +------------- diff --git a/gallery/plot_custom_datapoints.py b/gallery/v2_transforms/plot_custom_datapoints.py similarity index 98% rename from gallery/plot_custom_datapoints.py rename to gallery/v2_transforms/plot_custom_datapoints.py index a8db878119a..dcad5f0a406 100644 --- a/gallery/plot_custom_datapoints.py +++ b/gallery/v2_transforms/plot_custom_datapoints.py @@ -6,7 +6,7 @@ This guide is intended for advanced users and downstream library maintainers. We explain how to write your own datapoint class, and how to make it compatible with the built-in Torchvision v2 transforms. Before continuing, make sure you have read -:ref:`sphx_glr_auto_examples_plot_datapoints.py`. +:ref:`sphx_glr_auto_examples_v2_transforms_plot_datapoints.py`. 
""" # %% diff --git a/gallery/plot_custom_transforms.py b/gallery/v2_transforms/plot_custom_transforms.py similarity index 100% rename from gallery/plot_custom_transforms.py rename to gallery/v2_transforms/plot_custom_transforms.py diff --git a/gallery/plot_cutmix_mixup.py b/gallery/v2_transforms/plot_cutmix_mixup.py similarity index 100% rename from gallery/plot_cutmix_mixup.py rename to gallery/v2_transforms/plot_cutmix_mixup.py diff --git a/gallery/plot_datapoints.py b/gallery/v2_transforms/plot_datapoints.py similarity index 99% rename from gallery/plot_datapoints.py rename to gallery/v2_transforms/plot_datapoints.py index b98f032d6f1..0bab2d34088 100644 --- a/gallery/plot_datapoints.py +++ b/gallery/v2_transforms/plot_datapoints.py @@ -92,7 +92,7 @@ # In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` can also take a # :class:`PIL.Image.Image` directly: -image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg")) +image = datapoints.Image(PIL.Image.open("../assets/astronaut.jpg")) print(image.shape, image.dtype) # %% diff --git a/gallery/plot_transforms_v2.py b/gallery/v2_transforms/plot_transforms_v2.py similarity index 95% rename from gallery/plot_transforms_v2.py rename to gallery/v2_transforms/plot_transforms_v2.py index b85481ae1a5..e6c8b3ffdb5 100644 --- a/gallery/plot_transforms_v2.py +++ b/gallery/v2_transforms/plot_transforms_v2.py @@ -19,7 +19,7 @@ def load_data(): from torchvision import datapoints from torchvision.ops import masks_to_boxes - assets_directory = pathlib.Path("assets") + assets_directory = pathlib.Path("../assets") path = assets_directory / "FudanPed00054.png" image = datapoints.Image(read_image(str(path))) @@ -72,9 +72,9 @@ def load_data(): # %% # Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the -# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note however, that as +# appropriate function for the input data: :ref:`sphx_glr_auto_examples_v2_transforms_plot_datapoints.py`. Note however, that as # regular user, you likely don't have to touch this yourself. See -# :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. +# :ref:`sphx_glr_auto_examples_v2_transforms_plot_transforms_v2_e2e.py`. # # All "foreign" types like :class:`str`'s or :class:`pathlib.Path`'s are passed through, allowing to store extra # information directly with the sample: diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/v2_transforms/plot_transforms_v2_e2e.py similarity index 99% rename from gallery/plot_transforms_v2_e2e.py rename to gallery/v2_transforms/plot_transforms_v2_e2e.py index b837b9ba972..e6a36ebbf58 100644 --- a/gallery/plot_transforms_v2_e2e.py +++ b/gallery/v2_transforms/plot_transforms_v2_e2e.py @@ -55,7 +55,7 @@ def show(sample): def load_example_coco_detection_dataset(**kwargs): # This loads fake data for illustration purposes of this example. 
In practice, you'll have # to replace this with the proper data - root = pathlib.Path("assets") / "coco" + root = pathlib.Path("../assets") / "coco" return datasets.CocoDetection(str(root / "images"), str(root / "instances.json"), **kwargs) diff --git a/references/classification/presets.py b/references/classification/presets.py index 9b53f0ccd5d..84651493f01 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -68,6 +68,9 @@ def __init__( if random_erase_prob > 0: transforms.append(T.RandomErasing(p=random_erase_prob)) + if use_v2: + transforms.append(T.ToPureTensor()) + self.transforms = T.Compose(transforms) def __call__(self, img): @@ -107,6 +110,9 @@ def __init__( T.Normalize(mean=mean, std=std), ] + if use_v2: + transforms.append(T.ToPureTensor()) + self.transforms = T.Compose(transforms) def __call__(self, img): diff --git a/references/detection/presets.py b/references/detection/presets.py index 09ca148a263..0949a99896e 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -79,6 +79,7 @@ def __init__( transforms += [ T.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.XYXY), T.SanitizeBoundingBoxes(), + T.ToPureTensor(), ] self.transforms = T.Compose(transforms) @@ -103,6 +104,10 @@ def __init__(self, backend="pil", use_v2=False): raise ValueError(f"backend can be 'datapoint', 'tensor' or 'pil', but got {backend}") transforms += [T.ConvertImageDtype(torch.float)] + + if use_v2: + transforms += [T.ToPureTensor()] + self.transforms = T.Compose(transforms) def __call__(self, img, target): diff --git a/references/segmentation/presets.py b/references/segmentation/presets.py index 755cb236dcb..7b7d0493bd2 100644 --- a/references/segmentation/presets.py +++ b/references/segmentation/presets.py @@ -63,6 +63,8 @@ def __init__( transforms += [T.ConvertImageDtype(torch.float)] transforms += [T.Normalize(mean=mean, std=std)] + if use_v2: + transforms += [T.ToPureTensor()] self.transforms = T.Compose(transforms) @@ -98,6 +100,9 @@ def __init__( T.ConvertImageDtype(torch.float), T.Normalize(mean=mean, std=std), ] + if use_v2: + transforms += [T.ToPureTensor()] + self.transforms = T.Compose(transforms) def __call__(self, img, target): diff --git a/test/common_utils.py b/test/common_utils.py index 9713901bdcf..c815786b586 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -1,7 +1,4 @@ -import collections.abc import contextlib -import dataclasses -import enum import functools import itertools import os @@ -12,12 +9,9 @@ import sys import tempfile import warnings -from collections import defaultdict from subprocess import CalledProcessError, check_output, STDOUT -from typing import Callable, Sequence, Tuple, Union import numpy as np - import PIL.Image import pytest import torch @@ -27,7 +21,7 @@ from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair from torchvision import datapoints, io from torchvision.transforms._functional_tensor import _max_value as get_max_value -from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image +from torchvision.transforms.v2.functional import to_image, to_pil_image IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) @@ -363,132 +357,7 @@ def assert_close( assert_equal = functools.partial(assert_close, rtol=0, atol=0) -def parametrized_error_message(*args, **kwargs): - def to_str(obj): - if isinstance(obj, torch.Tensor) and obj.numel() > 30: - return 
f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})" - elif isinstance(obj, enum.Enum): - return f"{type(obj).__name__}.{obj.name}" - else: - return repr(obj) - - if args or kwargs: - postfix = "\n".join( - [ - "", - "Failure happened for the following parameters:", - "", - *[to_str(arg) for arg in args], - *[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()], - ] - ) - else: - postfix = "" - - def wrapper(msg): - return msg + postfix - - return wrapper - - -class ArgsKwargs: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - def __iter__(self): - yield self.args - yield self.kwargs - - def load(self, device="cpu"): - return ArgsKwargs( - *(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args), - **{ - keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg - for keyword, arg in self.kwargs.items() - }, - ) - - -# new v2 default DEFAULT_SIZE = (17, 11) -# old v2 defaults -DEFAULT_SQUARE_SPATIAL_SIZE = 15 -DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) -DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) -DEFAULT_SPATIAL_SIZES = ( - DEFAULT_LANDSCAPE_SPATIAL_SIZE, - DEFAULT_PORTRAIT_SPATIAL_SIZE, - DEFAULT_SQUARE_SPATIAL_SIZE, -) - - -def _parse_size(size, *, name="size"): - if size == "random": - raise ValueError("This should never happen") - elif isinstance(size, int) and size > 0: - return (size, size) - elif ( - isinstance(size, collections.abc.Sequence) - and len(size) == 2 - and all(isinstance(length, int) and length > 0 for length in size) - ): - return tuple(size) - else: - raise pytest.UsageError( - f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," - f"but got {size} instead." - ) - - -VALID_EXTRA_DIMS = ((), (4,), (2, 3)) -DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) - -DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) - - -def from_loader(loader_fn): - def wrapper(*args, **kwargs): - device = kwargs.pop("device", "cpu") - loader = loader_fn(*args, **kwargs) - return loader.load(device) - - return wrapper - - -def from_loaders(loaders_fn): - def wrapper(*args, **kwargs): - device = kwargs.pop("device", "cpu") - loaders = loaders_fn(*args, **kwargs) - for loader in loaders: - yield loader.load(device) - - return wrapper - - -@dataclasses.dataclass -class TensorLoader: - fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] - shape: Sequence[int] - dtype: torch.dtype - - def load(self, device): - return self.fn(self.shape, self.dtype, device) - - -@dataclasses.dataclass -class ImageLoader(TensorLoader): - spatial_size: Tuple[int, int] = dataclasses.field(init=False) - num_channels: int = dataclasses.field(init=False) - memory_format: torch.memory_format = torch.contiguous_format - canvas_size: Tuple[int, int] = dataclasses.field(init=False) - - def __post_init__(self): - self.spatial_size = self.canvas_size = self.shape[-2:] - self.num_channels = self.shape[-3] - - def load(self, device): - return self.fn(self.shape, self.dtype, device, memory_format=self.memory_format) NUM_CHANNELS_MAP = { @@ -499,13 +368,6 @@ def load(self, device): } -def get_num_channels(color_space): - num_channels = NUM_CHANNELS_MAP.get(color_space) - if not num_channels: - raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") - return num_channels - - def make_image( size=DEFAULT_SIZE, *, @@ -515,10 +377,11 @@ def make_image( device="cpu", memory_format=torch.contiguous_format, ): + 
num_channels = NUM_CHANNELS_MAP[color_space] dtype = dtype or torch.uint8 max_value = get_max_value(dtype) data = torch.testing.make_tensor( - (*batch_dims, get_num_channels(color_space), *size), + (*batch_dims, num_channels, *size), low=0, high=max_value, dtype=dtype, @@ -539,109 +402,7 @@ def make_image_pil(*args, **kwargs): return to_pil_image(make_image(*args, **kwargs)) -def make_image_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - *, - color_space="RGB", - extra_dims=(), - dtype=torch.float32, - constant_alpha=True, - memory_format=torch.contiguous_format, -): - if not constant_alpha: - raise ValueError("This should never happen") - size = _parse_size(size) - num_channels = get_num_channels(color_space) - - def fn(shape, dtype, device, memory_format): - *batch_dims, _, height, width = shape - return make_image( - (height, width), - color_space=color_space, - batch_dims=batch_dims, - dtype=dtype, - device=device, - memory_format=memory_format, - ) - - return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, memory_format=memory_format) - - -def make_image_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - color_spaces=( - "GRAY", - "GRAY_ALPHA", - "RGB", - "RGBA", - ), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.float32, torch.float64, torch.uint8), - constant_alpha=True, -): - for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): - yield make_image_loader(**params, constant_alpha=constant_alpha) - - -make_images = from_loaders(make_image_loaders) - - -def make_image_loader_for_interpolation( - size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format -): - size = _parse_size(size) - num_channels = get_num_channels(color_space) - - def fn(shape, dtype, device, memory_format): - height, width = shape[-2:] - - image_pil = ( - PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") - .resize((width, height)) - .convert( - { - "GRAY": "L", - "GRAY_ALPHA": "LA", - "RGB": "RGB", - "RGBA": "RGBA", - }[color_space] - ) - ) - - image_tensor = to_image(image_pil) - if memory_format == torch.contiguous_format: - image_tensor = image_tensor.to(device=device, memory_format=memory_format, copy=True) - else: - image_tensor = image_tensor.to(device=device) - image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) - - return datapoints.Image(image_tensor) - - return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) - - -def make_image_loaders_for_interpolation( - sizes=((233, 147),), - color_spaces=("RGB",), - dtypes=(torch.uint8,), - memory_formats=(torch.contiguous_format, torch.channels_last), -): - for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes, memory_format=memory_formats): - yield make_image_loader_for_interpolation(**params) - - -@dataclasses.dataclass -class BoundingBoxesLoader(TensorLoader): - format: datapoints.BoundingBoxFormat - spatial_size: Tuple[int, int] - canvas_size: Tuple[int, int] = dataclasses.field(init=False) - - def __post_init__(self): - self.canvas_size = self.spatial_size - - -def make_bounding_box( +def make_bounding_boxes( canvas_size=DEFAULT_SIZE, *, format=datapoints.BoundingBoxFormat.XYXY, @@ -687,42 +448,6 @@ def sample_position(values, max_value): ) -def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): - if isinstance(format, str): - format = 
datapoints.BoundingBoxFormat[format] - - spatial_size = _parse_size(spatial_size, name="spatial_size") - - def fn(shape, dtype, device): - *batch_dims, num_coordinates = shape - if num_coordinates != 4: - raise pytest.UsageError() - - return make_bounding_box( - format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return BoundingBoxesLoader(fn, shape=(*extra_dims[-1:], 4), dtype=dtype, format=format, spatial_size=spatial_size) - - -def make_bounding_box_loaders( - *, - extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), - formats=tuple(datapoints.BoundingBoxFormat), - spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - dtypes=(torch.float32, torch.float64, torch.int64), -): - for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, spatial_size=spatial_size) - - -make_bounding_boxes = from_loaders(make_bounding_box_loaders) - - -class MaskLoader(TensorLoader): - pass - - def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" return datapoints.Mask( @@ -736,32 +461,6 @@ def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtyp ) -def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): - # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_size(size) - - def fn(shape, dtype, device): - *batch_dims, num_objects, height, width = shape - return make_detection_mask( - (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) - - -def make_detection_mask_loaders( - sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, 5), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): - yield make_detection_mask_loader(**params) - - -make_detection_masks = from_loaders(make_detection_mask_loaders) - - def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" return datapoints.Mask( @@ -775,56 +474,6 @@ def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=( ) -def make_segmentation_mask_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 -): - # This produces "segmentation" masks, i.e. 
`(*, H, W)`, where the category is encoded in the values - size = _parse_size(size) - - def fn(shape, dtype, device): - *batch_dims, height, width = shape - return make_segmentation_mask( - (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) - - -def make_segmentation_mask_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - num_categories=(1, 2, 10), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): - yield make_segmentation_mask_loader(**params) - - -make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) - - -def make_mask_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, 5), - num_categories=(1, 2, 10), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) - yield from make_segmentation_mask_loaders( - sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes - ) - - -make_masks = from_loaders(make_mask_loaders) - - -class VideoLoader(ImageLoader): - pass - - def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): return datapoints.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) @@ -833,120 +482,6 @@ def make_video_tensor(*args, **kwargs): return make_video(*args, **kwargs).as_subclass(torch.Tensor) -def make_video_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - *, - color_space="RGB", - num_frames=3, - extra_dims=(), - dtype=torch.uint8, -): - size = _parse_size(size) - - def fn(shape, dtype, device, memory_format): - *batch_dims, num_frames, _, height, width = shape - return make_video( - (height, width), - num_frames=num_frames, - batch_dims=batch_dims, - color_space=color_space, - dtype=dtype, - device=device, - memory_format=memory_format, - ) - - return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) - - -def make_video_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - color_spaces=( - "GRAY", - "RGB", - ), - num_frames=(1, 0, 3), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8, torch.float32, torch.float64), -): - for params in combinations_grid( - size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes - ): - yield make_video_loader(**params) - - -make_videos = from_loaders(make_video_loaders) - - -class TestMark: - def __init__( - self, - # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is - # no test class, i.e. a standalone test function, use `None`. - test_id, - # `pytest.mark.*` to apply, e.g. `pytest.mark.skip` or `pytest.mark.xfail` - mark, - *, - # Callable, that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be - # applied. If omitted, defaults to always apply. - condition=None, - ): - self.test_id = test_id - self.mark = mark - self.condition = condition or (lambda args_kwargs: True) - - -def mark_framework_limitation(test_id, reason, condition=None): - # The purpose of this function is to have a single entry point for skip marks that are only there, because the test - # framework cannot handle the kernel in general or a specific parameter combination. 
- # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is - # still justified. - # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, - # we are wasting CI resources for no reason for most of the time - return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition) - - -class InfoBase: - def __init__( - self, - *, - # Identifier if the info that shows up the parametrization. - id, - # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. - # See the `TestMark` class for details - test_marks=None, - # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see - # `TestMark`), the dtype, and the device. - closeness_kwargs=None, - ): - self.id = id - - self.test_marks = test_marks or [] - test_marks_map = defaultdict(list) - for test_mark in self.test_marks: - test_marks_map[test_mark.test_id].append(test_mark) - self._test_marks_map = dict(test_marks_map) - - self.closeness_kwargs = closeness_kwargs or dict() - - def get_marks(self, test_id, args_kwargs): - return [ - test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) - ] - - def get_closeness_kwargs(self, test_id, *, dtype, device): - if not (isinstance(test_id, tuple) and len(test_id) == 2): - msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name" - if callable(test_id): - msg += ". Did you forget to add the `test_id` fixture to parameters of the test?" - else: - msg += f", but got {test_id} instead." - raise pytest.UsageError(msg) - if isinstance(device, torch.device): - device = device.type - return self.closeness_kwargs.get((test_id, dtype, device), dict()) - - def assert_run_python_script(source_code): """Utility to check assertions in an independent Python subprocess. 
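The `make_*` helpers kept by this cleanup retain the signatures shown above; a hypothetical test built on them could look like this (sizes and object counts are arbitrary, and the import is from the test-suite-local `common_utils` module):

```python
import torch
from common_utils import make_bounding_boxes, make_detection_mask, make_image, make_video

# Build a small synthetic sample from the simplified helpers.
image = make_image((32, 24), color_space="RGB", dtype=torch.uint8)
boxes = make_bounding_boxes((32, 24), batch_dims=(5,))      # five boxes, XYXY format by default
masks = make_detection_mask((32, 24), num_objects=5)        # one mask per object
video = make_video((32, 24), num_frames=4)

assert image.shape[-2:] == masks.shape[-2:] == video.shape[-2:]
assert boxes.canvas_size == (32, 24)
```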
diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 8259246c0cb..acbe1a6a77a 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -4,12 +4,12 @@ import pytest import torch - -from common_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader from torch.nn.functional import one_hot from torchvision.prototype import datapoints +from transforms_v2_legacy_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader + @dataclasses.dataclass class LabelLoader(TensorLoader): diff --git a/test/test_datapoints.py b/test/test_datapoints.py index 4da2eb39383..1aeb2367752 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -2,7 +2,7 @@ import pytest import torch -from common_utils import assert_equal, make_bounding_box, make_image, make_segmentation_mask, make_video +from common_utils import assert_equal, make_bounding_boxes, make_image, make_segmentation_mask, make_video from PIL import Image from torchvision import datapoints @@ -68,7 +68,7 @@ def test_new_requires_grad(data, input_requires_grad, expected_requires_grad): assert datapoint.requires_grad is expected_requires_grad -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) def test_isinstance(make_input): assert isinstance(make_input(), torch.Tensor) @@ -80,7 +80,7 @@ def test_wrapping_no_copy(): assert image.data_ptr() == tensor.data_ptr() -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) def test_to_wrapping(make_input): dp = make_input() @@ -90,7 +90,7 @@ def test_to_wrapping(make_input): assert dp_to.dtype is torch.float64 -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_to_datapoint_reference(make_input, return_type): tensor = torch.rand((3, 16, 16), dtype=torch.float64) @@ -104,7 +104,7 @@ def test_to_datapoint_reference(make_input, return_type): assert type(tensor) is torch.Tensor -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_clone_wrapping(make_input, return_type): dp = make_input() @@ -116,7 +116,7 @@ def test_clone_wrapping(make_input, return_type): assert dp_clone.data_ptr() != dp.data_ptr() -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_requires_grad__wrapping(make_input, return_type): dp = make_input(dtype=torch.float) @@ -131,7 +131,7 @@ def test_requires_grad__wrapping(make_input, return_type): assert dp_requires_grad.requires_grad -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, 
make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_detach_wrapping(make_input, return_type): dp = make_input(dtype=torch.float).requires_grad_(True) @@ -170,7 +170,7 @@ def test_force_subclass_with_metadata(return_type): datapoints.set_return_type("tensor") -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_other_op_no_wrapping(make_input, return_type): dp = make_input() @@ -182,7 +182,7 @@ def test_other_op_no_wrapping(make_input, return_type): assert type(output) is (type(dp) if return_type == "datapoint" else torch.Tensor) -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize( "op", [ @@ -199,7 +199,7 @@ def test_no_tensor_output_op_no_wrapping(make_input, op): assert type(output) is not type(dp) -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_inplace_op_no_wrapping(make_input, return_type): dp = make_input() @@ -212,7 +212,7 @@ def test_inplace_op_no_wrapping(make_input, return_type): assert type(dp) is original_type -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) def test_wrap(make_input): dp = make_input() @@ -225,7 +225,7 @@ def test_wrap(make_input): assert dp_new.data_ptr() == output.data_ptr() -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("requires_grad", [False, True]) def test_deepcopy(make_input, requires_grad): dp = make_input(dtype=torch.float) @@ -242,7 +242,7 @@ def test_deepcopy(make_input, requires_grad): assert dp_deepcopied.requires_grad is requires_grad -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) @pytest.mark.parametrize( "op", diff --git a/test/test_prototype_datasets_builtin.py b/test/test_prototype_datasets_builtin.py index 4d19b67967f..e29dfb17fe1 100644 --- a/test/test_prototype_datasets_builtin.py +++ b/test/test_prototype_datasets_builtin.py @@ -25,7 +25,7 @@ from torchvision.prototype.datapoints import Label from torchvision.prototype.datasets.utils import EncodedImage from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE -from torchvision.transforms.v2.utils import is_simple_tensor +from torchvision.transforms.v2.utils import is_pure_tensor def assert_samples_equal(*args, msg=None, **kwargs): @@ -140,18 +140,18 @@ def 
make_msg_and_close(head): raise AssertionError(make_msg_and_close("The following streams were not closed after a full iteration:")) @parametrize_dataset_mocks(DATASET_MOCKS) - def test_no_unaccompanied_simple_tensors(self, dataset_mock, config): + def test_no_unaccompanied_pure_tensors(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) sample = next_consume(iter(dataset)) - simple_tensors = {key for key, value in sample.items() if is_simple_tensor(value)} + pure_tensors = {key for key, value in sample.items() if is_pure_tensor(value)} - if simple_tensors and not any( + if pure_tensors and not any( isinstance(item, (datapoints.Image, datapoints.Video, EncodedImage)) for item in sample.values() ): raise AssertionError( f"The values of key(s) " - f"{sequence_to_str(sorted(simple_tensors), separate_last='and ')} contained simple tensors, " + f"{sequence_to_str(sorted(pure_tensors), separate_last='and ')} contained pure tensors, " f"but didn't find any (encoded) image or video." ) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 32a68e14017..0410ecadc48 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -4,21 +4,21 @@ import pytest import torch -from common_utils import ( - assert_equal, - DEFAULT_EXTRA_DIMS, - make_bounding_box, - make_detection_mask, - make_image, - make_video, -) +from common_utils import assert_equal from prototype_common_utils import make_label from torchvision.datapoints import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video from torchvision.prototype import datapoints, transforms from torchvision.transforms.v2.functional import clamp_bounding_boxes, InterpolationMode, pil_to_tensor, to_pil_image -from torchvision.transforms.v2.utils import check_type, is_simple_tensor +from torchvision.transforms.v2.utils import check_type, is_pure_tensor +from transforms_v2_legacy_utils import ( + DEFAULT_EXTRA_DIMS, + make_bounding_boxes, + make_detection_mask, + make_image, + make_video, +) BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] @@ -167,7 +167,7 @@ def test__get_params(self, mocker): flat_inputs = [ make_image(size=canvas_size, color_space="RGB"), - make_bounding_box(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=batch_shape), + make_bounding_boxes(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -202,7 +202,7 @@ def test__transform_culling(self, mocker): ), ) - bounding_boxes = make_bounding_box( + bounding_boxes = make_bounding_boxes( format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) ) masks = make_detection_mask(size=canvas_size, batch_dims=(batch_size,)) @@ -240,7 +240,7 @@ def test__transform_bounding_boxes_clamping(self, mocker): ), ) - bounding_boxes = make_bounding_box( + bounding_boxes = make_bounding_boxes( format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) ) mock = mocker.patch( @@ -283,7 +283,7 @@ class TestPermuteDimensions: def test_call(self, dims, inverse_dims): sample = dict( image=make_image(), - bounding_boxes=make_bounding_box(format=BoundingBoxFormat.XYXY), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), video=make_video(), str="str", int=0, @@ -296,7 +296,7 @@ def test_call(self, dims, inverse_dims): value_type = type(value) transformed_value = transformed_sample[key] - if check_type(value, (Image, is_simple_tensor, Video)): + if 
check_type(value, (Image, is_pure_tensor, Video)): if transform.dims.get(value_type) is not None: assert transformed_value.permute(inverse_dims[value_type]).equal(value) assert type(transformed_value) == torch.Tensor @@ -327,7 +327,7 @@ class TestTransposeDimensions: def test_call(self, dims): sample = dict( image=make_image(), - bounding_boxes=make_bounding_box(format=BoundingBoxFormat.XYXY), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), video=make_video(), str="str", int=0, @@ -341,7 +341,7 @@ def test_call(self, dims): transformed_value = transformed_sample[key] transposed_dims = transform.dims.get(value_type) - if check_type(value, (Image, is_simple_tensor, Video)): + if check_type(value, (Image, is_pure_tensor, Video)): if transposed_dims is not None: assert transformed_value.transpose(*transposed_dims).equal(value) assert type(transformed_value) == torch.Tensor @@ -389,7 +389,7 @@ def make_datapoints(): pil_image = to_pil_image(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -398,7 +398,7 @@ def make_datapoints(): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -407,7 +407,7 @@ def make_datapoints(): datapoint_image = make_image(size=size, color_space="RGB") target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index ade3bdf0b51..5752b323f79 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -11,25 +11,23 @@ import torch import torchvision.transforms.v2 as transforms -from common_utils import ( - assert_equal, - assert_run_python_script, - cpu_and_cuda, - make_bounding_box, +from common_utils import assert_equal, assert_run_python_script, cpu_and_cuda +from torch.utils._pytree import tree_flatten, tree_unflatten +from torchvision import datapoints +from torchvision.ops.boxes import box_iou +from torchvision.transforms.functional import to_pil_image +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2.utils import check_type, is_pure_tensor, query_chw +from transforms_v2_legacy_utils import ( make_bounding_boxes, make_detection_mask, make_image, make_images, + make_multiple_bounding_boxes, make_segmentation_mask, make_video, make_videos, ) -from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import datapoints -from torchvision.ops.boxes import box_iou -from torchvision.transforms.functional import to_pil_image -from 
torchvision.transforms.v2 import functional as F -from torchvision.transforms.v2.utils import check_type, is_simple_tensor, query_chw def make_vanilla_tensor_images(*args, **kwargs): @@ -45,7 +43,7 @@ def make_pil_images(*args, **kwargs): def make_vanilla_tensor_bounding_boxes(*args, **kwargs): - for bounding_boxes in make_bounding_boxes(*args, **kwargs): + for bounding_boxes in make_multiple_bounding_boxes(*args, **kwargs): yield bounding_boxes.data @@ -71,7 +69,7 @@ def auto_augment_adapter(transform, input, device): if isinstance(value, (datapoints.BoundingBoxes, datapoints.Mask)): # AA transforms don't support bounding boxes or masks continue - elif check_type(value, (datapoints.Image, datapoints.Video, is_simple_tensor, PIL.Image.Image)): + elif check_type(value, (datapoints.Image, datapoints.Video, is_pure_tensor, PIL.Image.Image)): if image_or_video_found: # AA transforms only support a single image or video continue @@ -101,7 +99,7 @@ def normalize_adapter(transform, input, device): if isinstance(value, PIL.Image.Image): # normalize doesn't support PIL images continue - elif check_type(value, (datapoints.Image, datapoints.Video, is_simple_tensor)): + elif check_type(value, (datapoints.Image, datapoints.Video, is_pure_tensor)): # normalize doesn't support integer images value = F.to_dtype(value, torch.float32, scale=True) adapted_input[key] = value @@ -180,13 +178,13 @@ def test_common(self, transform, adapter, container_type, image_or_video, device image_datapoint=make_image(size=canvas_size), video_datapoint=make_video(size=canvas_size), image_pil=next(make_pil_images(sizes=[canvas_size], color_spaces=["RGB"])), - bounding_boxes_xyxy=make_bounding_box( + bounding_boxes_xyxy=make_bounding_boxes( format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) ), - bounding_boxes_xywh=make_bounding_box( + bounding_boxes_xywh=make_bounding_boxes( format=datapoints.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) ), - bounding_boxes_cxcywh=make_bounding_box( + bounding_boxes_cxcywh=make_bounding_boxes( format=datapoints.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) ), bounding_boxes_degenerate_xyxy=datapoints.BoundingBoxes( @@ -357,19 +355,19 @@ def test_random_resized_crop(self, transform, input): 3, ), ) -def test_simple_tensor_heuristic(flat_inputs): - def split_on_simple_tensor(to_split): +def test_pure_tensor_heuristic(flat_inputs): + def split_on_pure_tensor(to_split): # This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts: - # 1. The first simple tensor. If none is present, this will be `None` - # 2. A list of the remaining simple tensors + # 1. The first pure tensor. If none is present, this will be `None` + # 2. A list of the remaining pure tensors # 3. A list of all other items - simple_tensors = [] + pure_tensors = [] others = [] # Splitting always happens on the original `flat_inputs` to avoid any erroneous type changes by the transform to # affect the splitting. 
for item, inpt in zip(to_split, flat_inputs): - (simple_tensors if is_simple_tensor(inpt) else others).append(item) - return simple_tensors[0] if simple_tensors else None, simple_tensors[1:], others + (pure_tensors if is_pure_tensor(inpt) else others).append(item) + return pure_tensors[0] if pure_tensors else None, pure_tensors[1:], others class CopyCloneTransform(transforms.Transform): def _transform(self, inpt, params): @@ -385,20 +383,20 @@ def was_applied(output, inpt): assert_equal(output, inpt) return True - first_simple_tensor_input, other_simple_tensor_inputs, other_inputs = split_on_simple_tensor(flat_inputs) + first_pure_tensor_input, other_pure_tensor_inputs, other_inputs = split_on_pure_tensor(flat_inputs) transform = CopyCloneTransform() transformed_sample = transform(flat_inputs) - first_simple_tensor_output, other_simple_tensor_outputs, other_outputs = split_on_simple_tensor(transformed_sample) + first_pure_tensor_output, other_pure_tensor_outputs, other_outputs = split_on_pure_tensor(transformed_sample) - if first_simple_tensor_input is not None: + if first_pure_tensor_input is not None: if other_inputs: - assert not transform.was_applied(first_simple_tensor_output, first_simple_tensor_input) + assert not transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) else: - assert transform.was_applied(first_simple_tensor_output, first_simple_tensor_input) + assert transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) - for output, inpt in zip(other_simple_tensor_outputs, other_simple_tensor_inputs): + for output, inpt in zip(other_pure_tensor_outputs, other_pure_tensor_inputs): assert not transform.was_applied(output, inpt) for input, output in zip(other_inputs, other_outputs): @@ -813,7 +811,7 @@ def test__transform(self, mocker): size = (32, 24) image = make_image(size) - bboxes = make_bounding_box(format="XYXY", canvas_size=size, batch_dims=(6,)) + bboxes = make_bounding_boxes(format="XYXY", canvas_size=size, batch_dims=(6,)) masks = make_detection_mask(size, num_objects=6) sample = [image, bboxes, masks] @@ -1004,7 +1002,7 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): image = to_pil_image(image[0]) elif image_type is torch.Tensor: image = image.as_subclass(torch.Tensor) - assert is_simple_tensor(image) + assert is_pure_tensor(image) label = 1 if label_type is int else torch.tensor([1]) @@ -1125,7 +1123,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): image = to_pil_image(image[0]) elif image_type is torch.Tensor: image = image.as_subclass(torch.Tensor) - assert is_simple_tensor(image) + assert is_pure_tensor(image) label = torch.randint(0, 10, size=(num_boxes,)) @@ -1146,7 +1144,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): out = t(sample) if isinstance(to_tensor, transforms.ToTensor) and image_type is not datapoints.Image: - assert is_simple_tensor(out["image"]) + assert is_pure_tensor(out["image"]) else: assert isinstance(out["image"], datapoints.Image) assert isinstance(out["label"], type(sample["label"])) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 5855fbe447f..61de769d885 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -12,17 +12,7 @@ import torch import torchvision.transforms.v2 as v2_transforms -from common_utils import ( - ArgsKwargs, - assert_close, - assert_equal, - make_bounding_box, - make_detection_mask, - 
make_image, - make_images, - make_segmentation_mask, - set_rng_seed, -) +from common_utils import assert_close, assert_equal, set_rng_seed from torch import nn from torchvision import datapoints, transforms as legacy_transforms from torchvision._utils import sequence_to_str @@ -32,6 +22,14 @@ from torchvision.transforms.v2._utils import _get_fill from torchvision.transforms.v2.functional import to_pil_image from torchvision.transforms.v2.utils import query_size +from transforms_v2_legacy_utils import ( + ArgsKwargs, + make_bounding_boxes, + make_detection_mask, + make_image, + make_images, + make_segmentation_mask, +) DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=["RGB"], extra_dims=[(4,)]) @@ -602,7 +600,7 @@ def check_call_consistency( raise AssertionError( f"Transforming a tensor image with shape {image_repr} failed in the prototype transform with " f"the error above. This means there is a consistency bug either in `_get_params` or in the " - f"`is_simple_tensor` path in `_transform`." + f"`is_pure_tensor` path in `_transform`." ) from exc assert_close( @@ -1090,7 +1088,7 @@ def make_label(extra_dims, categories): pil_image = to_pil_image(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1100,7 +1098,7 @@ def make_label(extra_dims, categories): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1110,7 +1108,7 @@ def make_label(extra_dims, categories): datapoint_image = make_image(size=size, color_space="RGB", dtype=torch.float32) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py index 14a1f82b2cf..15af5a7a9ed 100644 --- a/test/test_transforms_v2_functional.py +++ b/test/test_transforms_v2_functional.py @@ -8,25 +8,21 @@ import pytest import torch -from common_utils import ( - assert_close, - cache, - cpu_and_cuda, - DEFAULT_SQUARE_SPATIAL_SIZE, - make_bounding_boxes, - needs_cuda, - parametrized_error_message, - set_rng_seed, -) +from common_utils import assert_close, cache, cpu_and_cuda, needs_cuda, set_rng_seed from torch.utils._pytree import tree_map from torchvision import datapoints from torchvision.transforms.functional import _get_perspective_coeffs from torchvision.transforms.v2 import functional as F from torchvision.transforms.v2.functional._geometry import _center_crop_compute_padding from torchvision.transforms.v2.functional._meta import clamp_bounding_boxes, convert_format_bounding_boxes -from torchvision.transforms.v2.utils import is_simple_tensor +from torchvision.transforms.v2.utils import is_pure_tensor from transforms_v2_dispatcher_infos import DISPATCHER_INFOS from 
transforms_v2_kernel_infos import KERNEL_INFOS +from transforms_v2_legacy_utils import ( + DEFAULT_SQUARE_SPATIAL_SIZE, + make_multiple_bounding_boxes, + parametrized_error_message, +) KERNEL_INFOS_MAP = {info.kernel: info for info in KERNEL_INFOS} @@ -168,7 +164,7 @@ def _unbatch(self, batch, *, data_dims): def test_batched_vs_single(self, test_id, info, args_kwargs, device): (batched_input, *other_args), kwargs = args_kwargs.load(device) - datapoint_type = datapoints.Image if is_simple_tensor(batched_input) else type(batched_input) + datapoint_type = datapoints.Image if is_pure_tensor(batched_input) else type(batched_input) # This dictionary contains the number of rightmost dimensions that contain the actual data. # Everything to the left is considered a batch dimension. data_dims = { @@ -333,9 +329,9 @@ def test_scripted_smoke(self, info, args_kwargs, device): dispatcher = script(info.dispatcher) (image_datapoint, *other_args), kwargs = args_kwargs.load(device) - image_simple_tensor = torch.Tensor(image_datapoint) + image_pure_tensor = torch.Tensor(image_datapoint) - dispatcher(image_simple_tensor, *other_args, **kwargs) + dispatcher(image_pure_tensor, *other_args, **kwargs) # TODO: We need this until the dispatchers below also have `DispatcherInfo`'s. If they do, `test_scripted_smoke` # replaces this test for them. @@ -358,11 +354,11 @@ def test_scriptable(self, dispatcher): script(dispatcher) @image_sample_inputs - def test_simple_tensor_output_type(self, info, args_kwargs): + def test_pure_tensor_output_type(self, info, args_kwargs): (image_datapoint, *other_args), kwargs = args_kwargs.load() - image_simple_tensor = image_datapoint.as_subclass(torch.Tensor) + image_pure_tensor = image_datapoint.as_subclass(torch.Tensor) - output = info.dispatcher(image_simple_tensor, *other_args, **kwargs) + output = info.dispatcher(image_pure_tensor, *other_args, **kwargs) # We cannot use `isinstance` here since all datapoints are instances of `torch.Tensor` as well assert type(output) is torch.Tensor @@ -505,11 +501,11 @@ class TestClampBoundingBoxes: dict(canvas_size=(1, 1)), ], ) - def test_simple_tensor_insufficient_metadata(self, metadata): - simple_tensor = next(make_bounding_boxes()).as_subclass(torch.Tensor) + def test_pure_tensor_insufficient_metadata(self, metadata): + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` has to be passed")): - F.clamp_bounding_boxes(simple_tensor, **metadata) + F.clamp_bounding_boxes(pure_tensor, **metadata) @pytest.mark.parametrize( "metadata", @@ -520,7 +516,7 @@ def test_simple_tensor_insufficient_metadata(self, metadata): ], ) def test_datapoint_explicit_metadata(self, metadata): - datapoint = next(make_bounding_boxes()) + datapoint = next(make_multiple_bounding_boxes()) with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): F.clamp_bounding_boxes(datapoint, **metadata) @@ -530,22 +526,22 @@ class TestConvertFormatBoundingBoxes: @pytest.mark.parametrize( ("inpt", "old_format"), [ - (next(make_bounding_boxes()), None), - (next(make_bounding_boxes()).as_subclass(torch.Tensor), datapoints.BoundingBoxFormat.XYXY), + (next(make_multiple_bounding_boxes()), None), + (next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor), datapoints.BoundingBoxFormat.XYXY), ], ) def test_missing_new_format(self, inpt, old_format): with pytest.raises(TypeError, match=re.escape("missing 1 required argument: 'new_format'")): 
F.convert_format_bounding_boxes(inpt, old_format) - def test_simple_tensor_insufficient_metadata(self): - simple_tensor = next(make_bounding_boxes()).as_subclass(torch.Tensor) + def test_pure_tensor_insufficient_metadata(self): + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) with pytest.raises(ValueError, match=re.escape("`old_format` has to be passed")): - F.convert_format_bounding_boxes(simple_tensor, new_format=datapoints.BoundingBoxFormat.CXCYWH) + F.convert_format_bounding_boxes(pure_tensor, new_format=datapoints.BoundingBoxFormat.CXCYWH) def test_datapoint_explicit_metadata(self): - datapoint = next(make_bounding_boxes()) + datapoint = next(make_multiple_bounding_boxes()) with pytest.raises(ValueError, match=re.escape("`old_format` must not be passed")): F.convert_format_bounding_boxes( @@ -736,7 +732,7 @@ def _compute_expected_canvas_size(bbox, padding_): height, width = bbox.canvas_size return height + pad_up + pad_down, width + pad_left + pad_right - for bboxes in make_bounding_boxes(extra_dims=((4,),)): + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format bboxes_canvas_size = bboxes.canvas_size @@ -822,7 +818,7 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_): pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)): + for bboxes in make_multiple_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)): bboxes = bboxes.to(device) output_bboxes = F.perspective_bounding_boxes( @@ -870,7 +866,7 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, output_size_): out_bbox = clamp_bounding_boxes(out_bbox, format=format_, canvas_size=output_size) return out_bbox.to(dtype=dtype, device=bbox.device) - for bboxes in make_bounding_boxes(extra_dims=((4,),)): + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format bboxes_canvas_size = bboxes.canvas_size diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 339725327bd..f57736e5abd 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -19,7 +19,7 @@ cpu_and_cuda, freeze_rng_state, ignore_jit_no_profile_information_warning, - make_bounding_box, + make_bounding_boxes, make_detection_mask, make_image, make_image_pil, @@ -456,7 +456,7 @@ def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_boxes = make_bounding_box( + bounding_boxes = make_bounding_boxes( format=format, canvas_size=self.INPUT_SIZE, dtype=dtype, @@ -481,7 +481,7 @@ def test_kernel_video(self): @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, size, make_input): check_functional( @@ -514,7 +514,7 @@ def test_functional_signature(self, kernel, input_type): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -579,7 +579,7 @@ def 
test_bounding_boxes_correctness(self, format, size, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_boxes = make_bounding_box(format=format, canvas_size=self.INPUT_SIZE) + bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) actual = fn(bounding_boxes, size=size, **max_size_kwarg) expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) @@ -618,7 +618,7 @@ def test_functional_pil_antialias_warning(self): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -687,7 +687,7 @@ def test_transform_unknown_size_error(self): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -714,7 +714,7 @@ def test_noop(self, size, make_input): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -743,7 +743,7 @@ def test_kernel_image_tensor(self, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.horizontal_flip_bounding_boxes, bounding_boxes, @@ -760,7 +760,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.horizontal_flip, make_input()) @@ -781,7 +781,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -821,7 +821,7 @@ def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] ) def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = fn(bounding_boxes) expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) @@ -830,7 +830,7 @@ def test_bounding_boxes_correctness(self, format, fn): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): @@ -917,7 +917,7 @@ def test_kernel_image_tensor(self, param, value, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, param, value, 
format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) self._check_kernel( F.affine_bounding_boxes, bounding_boxes, @@ -936,7 +936,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS) @@ -957,7 +957,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -1076,7 +1076,7 @@ def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = F.affine( bounding_boxes, @@ -1101,7 +1101,7 @@ def test_functional_bounding_boxes_correctness(self, format, angle, translate, s @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_boxes_correctness(self, format, center, seed): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) @@ -1208,7 +1208,7 @@ def test_kernel_image_tensor(self, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.vertical_flip_bounding_boxes, bounding_boxes, @@ -1225,7 +1225,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.vertical_flip, make_input()) @@ -1246,7 +1246,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -1282,7 +1282,7 @@ def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) 
@pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = fn(bounding_boxes) expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) @@ -1291,7 +1291,7 @@ def test_bounding_boxes_correctness(self, format, fn): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): @@ -1356,7 +1356,7 @@ def test_kernel_bounding_boxes(self, param, value, format, dtype, device): if param != "angle": kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.rotate_bounding_boxes, @@ -1375,7 +1375,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) @@ -1396,7 +1396,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -1490,7 +1490,7 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen @pytest.mark.parametrize("expand", [False]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) @@ -1503,7 +1503,7 @@ def test_functional_bounding_boxes_correctness(self, format, angle, expand, cent @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) @@ -1652,7 +1652,7 @@ def test_functional(self, make_input, input_dtype, output_dtype, device, scale): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("input_dtype", [torch.float32, 
torch.float64, torch.uint8]) @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) @@ -1727,7 +1727,7 @@ def make_inpt_with_bbox_and_mask(self, make_input): mask_dtype = torch.bool sample = { "inpt": make_input(size=(H, W), dtype=inpt_dtype), - "bbox": make_bounding_box(canvas_size=(H, W), dtype=bbox_dtype), + "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), "mask": make_detection_mask(size=(H, W), dtype=mask_dtype), } @@ -2013,7 +2013,7 @@ def test_get_num_channels(self, kernel, make_input): (F.get_size_image, make_image_tensor), (F._get_size_image_pil, make_image_pil), (F.get_size_image, make_image), - (F.get_size_bounding_boxes, make_bounding_box), + (F.get_size_bounding_boxes, make_bounding_boxes), (F.get_size_mask, make_detection_mask), (F.get_size_mask, make_segmentation_mask), (F.get_size_video, make_video), @@ -2043,15 +2043,15 @@ def test_get_num_frames(self, kernel, make_input): @pytest.mark.parametrize( ("functional", "make_input"), [ - (F.get_dimensions, make_bounding_box), + (F.get_dimensions, make_bounding_boxes), (F.get_dimensions, make_detection_mask), (F.get_dimensions, make_segmentation_mask), - (F.get_num_channels, make_bounding_box), + (F.get_num_channels, make_bounding_boxes), (F.get_num_channels, make_detection_mask), (F.get_num_channels, make_segmentation_mask), (F.get_num_frames, make_image_pil), (F.get_num_frames, make_image), - (F.get_num_frames, make_bounding_box), + (F.get_num_frames, make_bounding_boxes), (F.get_num_frames, make_detection_mask), (F.get_num_frames, make_segmentation_mask), ], @@ -2290,7 +2290,7 @@ def test_kernel_image_tensor(self, param, value, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.elastic_bounding_boxes, @@ -2311,7 +2311,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): input = make_input() @@ -2333,7 +2333,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_displacement_error(self, make_input): input = make_input() @@ -2346,10 +2346,31 @@ def test_displacement_error(self, make_input): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, size, device): check_transform(transforms.ElasticTransform, make_input(size, device=device)) + + +class TestToPureTensor: + def 
test_correctness(self): + input = { + "img": make_image(), + "img_tensor": make_image_tensor(), + "img_pil": make_image_pil(), + "mask": make_detection_mask(), + "video": make_video(), + "bbox": make_bounding_boxes(), + "str": "str", + } + + out = transforms.ToPureTensor()(input) + + for input_value, out_value in zip(input.values(), out.values()): + if isinstance(input_value, datapoints.Datapoint): + assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, datapoints.Datapoint) + else: + assert isinstance(out_value, type(input_value)) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 0cf7a77ac0d..55825d652e6 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,7 +4,7 @@ import torch import torchvision.transforms.v2.utils -from common_utils import DEFAULT_SIZE, make_bounding_box, make_detection_mask, make_image +from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_mask, make_image from torchvision import datapoints from torchvision.transforms.v2.functional import to_pil_image @@ -12,7 +12,7 @@ IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") -BOUNDING_BOX = make_bounding_box(DEFAULT_SIZE, format=datapoints.BoundingBoxFormat.XYXY) +BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=datapoints.BoundingBoxFormat.XYXY) MASK = make_detection_mask(DEFAULT_SIZE) @@ -37,15 +37,15 @@ ((IMAGE, BOUNDING_BOX, MASK), (lambda obj: isinstance(obj, datapoints.Image),), True), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), - ((IMAGE,), (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2.utils.is_simple_tensor), True), + ((IMAGE,), (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2.utils.is_pure_tensor), True), ( (torch.Tensor(IMAGE),), - (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2.utils.is_simple_tensor), + (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2.utils.is_pure_tensor), True, ), ( (to_pil_image(IMAGE),), - (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2.utils.is_simple_tensor), + (datapoints.Image, PIL.Image.Image, torchvision.transforms.v2.utils.is_pure_tensor), True, ), ], diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py index 8f212c850cb..375c307324c 100644 --- a/test/transforms_v2_dispatcher_infos.py +++ b/test/transforms_v2_dispatcher_infos.py @@ -2,9 +2,9 @@ import pytest import torchvision.transforms.v2.functional as F -from common_utils import InfoBase, TestMark from torchvision import datapoints from transforms_v2_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition +from transforms_v2_legacy_utils import InfoBase, TestMark __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] @@ -107,7 +107,7 @@ def xfail_jit_python_scalar_arg(name, *, reason=None): ("TestDispatchers", test_name), pytest.mark.skip(reason="Multi-crop dispatchers return a sequence of items rather than a single one."), ) - for test_name in ["test_simple_tensor_output_type", "test_pil_output_type", "test_datapoint_output_type"] + for test_name in ["test_pure_tensor_output_type", "test_pil_output_type", "test_datapoint_output_type"] ] multi_crop_skips.append(skip_dispatch_datapoint) diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index acb9a857750..33813b6519d 100644 --- a/test/transforms_v2_kernel_infos.py +++ b/test/transforms_v2_kernel_infos.py @@ -7,7 +7,9 @@ import torch.testing import 
torchvision.ops import torchvision.transforms.v2.functional as F -from common_utils import ( +from torchvision import datapoints +from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding +from transforms_v2_legacy_utils import ( ArgsKwargs, combinations_grid, DEFAULT_PORTRAIT_SPATIAL_SIZE, @@ -26,8 +28,6 @@ mark_framework_limitation, TestMark, ) -from torchvision import datapoints -from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding __all__ = ["KernelInfo", "KERNEL_INFOS"] diff --git a/test/transforms_v2_legacy_utils.py b/test/transforms_v2_legacy_utils.py new file mode 100644 index 00000000000..bb8943a8889 --- /dev/null +++ b/test/transforms_v2_legacy_utils.py @@ -0,0 +1,633 @@ +""" +As the name implies, these are legacy utilities that are hopefully removed soon. The future of +transforms v2 testing is in test/test_transforms_v2_refactored.py. All new test should be +implemented there and must not use any of the utilities here. + +The following legacy modules depend on this module + +- transforms_v2_kernel_infos.py +- transforms_v2_dispatcher_infos.py +- test_transforms_v2_functional.py +- test_transforms_v2_consistency.py +- test_transforms.py + +When all the logic is ported from the files above to test_transforms_v2_refactored.py, delete +all the legacy modules including this one and drop the _refactored prefix from the name. +""" + +import collections.abc +import dataclasses +import enum +import itertools +import pathlib +from collections import defaultdict +from typing import Callable, Sequence, Tuple, Union + +import PIL.Image +import pytest +import torch + +from torchvision import datapoints +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values. 
+ + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +DEFAULT_SIZE = (17, 11) + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return datapoints.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=datapoints.BoundingBoxFormat.XYXY, + batch_dims=(), + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. + return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) + + if isinstance(format, str): + format = datapoints.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + if any(dim == 0 for dim in batch_dims): + return datapoints.BoundingBoxes( + torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + h, w = [torch.randint(1, c, batch_dims) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + + if format is datapoints.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is datapoints.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is datapoints.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + else: + raise ValueError(f"Format {format} is not supported") + + return datapoints.BoundingBoxes( + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + +def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + return datapoints.Mask( + torch.testing.make_tensor( + (*batch_dims, num_objects, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. 
(*, H, W), where the category is encoded as pixel value""" + return datapoints.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return datapoints.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +DEFAULT_SQUARE_SPATIAL_SIZE = 15 +DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) +DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) +DEFAULT_SPATIAL_SIZES = ( + DEFAULT_LANDSCAPE_SPATIAL_SIZE, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + DEFAULT_SQUARE_SPATIAL_SIZE, +) + + +def _parse_size(size, *, name="size"): + if size == "random": + raise ValueError("This should never happen") + elif isinstance(size, int) and size > 0: + return (size, size) + elif ( + isinstance(size, collections.abc.Sequence) + and len(size) == 2 + and all(isinstance(length, int) and length > 0 for length in size) + ): + return tuple(size) + else: + raise pytest.UsageError( + f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," + f"but got {size} instead." + ) + + +def get_num_channels(color_space): + num_channels = NUM_CHANNELS_MAP.get(color_space) + if not num_channels: + raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") + return num_channels + + +VALID_EXTRA_DIMS = ((), (4,), (2, 3)) +DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) + +DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) + + +def from_loader(loader_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loader = loader_fn(*args, **kwargs) + return loader.load(device) + + return wrapper + + +def from_loaders(loaders_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loaders = loaders_fn(*args, **kwargs) + for loader in loaders: + yield loader.load(device) + + return wrapper + + +@dataclasses.dataclass +class TensorLoader: + fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] + shape: Sequence[int] + dtype: torch.dtype + + def load(self, device): + return self.fn(self.shape, self.dtype, device) + + +@dataclasses.dataclass +class ImageLoader(TensorLoader): + spatial_size: Tuple[int, int] = dataclasses.field(init=False) + num_channels: int = dataclasses.field(init=False) + memory_format: torch.memory_format = torch.contiguous_format + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.spatial_size = self.canvas_size = self.shape[-2:] + self.num_channels = self.shape[-3] + + def load(self, device): + return self.fn(self.shape, self.dtype, device, memory_format=self.memory_format) + + +def make_image_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + extra_dims=(), + dtype=torch.float32, + constant_alpha=True, + memory_format=torch.contiguous_format, +): + if not constant_alpha: + raise ValueError("This should never happen") + size = _parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + *batch_dims, _, height, width = shape + return make_image( + (height, width), + color_space=color_space, + batch_dims=batch_dims, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, 
memory_format=memory_format) + + +def make_image_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "GRAY_ALPHA", + "RGB", + "RGBA", + ), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.float32, torch.float64, torch.uint8), + constant_alpha=True, +): + for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): + yield make_image_loader(**params, constant_alpha=constant_alpha) + + +make_images = from_loaders(make_image_loaders) + + +def make_image_loader_for_interpolation( + size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format +): + size = _parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + height, width = shape[-2:] + + image_pil = ( + PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + .resize((width, height)) + .convert( + { + "GRAY": "L", + "GRAY_ALPHA": "LA", + "RGB": "RGB", + "RGBA": "RGBA", + }[color_space] + ) + ) + + image_tensor = to_image(image_pil) + if memory_format == torch.contiguous_format: + image_tensor = image_tensor.to(device=device, memory_format=memory_format, copy=True) + else: + image_tensor = image_tensor.to(device=device) + image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) + + return datapoints.Image(image_tensor) + + return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) + + +def make_image_loaders_for_interpolation( + sizes=((233, 147),), + color_spaces=("RGB",), + dtypes=(torch.uint8,), + memory_formats=(torch.contiguous_format, torch.channels_last), +): + for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes, memory_format=memory_formats): + yield make_image_loader_for_interpolation(**params) + + +@dataclasses.dataclass +class BoundingBoxesLoader(TensorLoader): + format: datapoints.BoundingBoxFormat + spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.canvas_size = self.spatial_size + + +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): + if isinstance(format, str): + format = datapoints.BoundingBoxFormat[format] + + spatial_size = _parse_size(spatial_size, name="spatial_size") + + def fn(shape, dtype, device): + *batch_dims, num_coordinates = shape + if num_coordinates != 4: + raise pytest.UsageError() + + return make_bounding_boxes( + format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return BoundingBoxesLoader(fn, shape=(*extra_dims[-1:], 4), dtype=dtype, format=format, spatial_size=spatial_size) + + +def make_bounding_box_loaders( + *, + extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), + formats=tuple(datapoints.BoundingBoxFormat), + spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + dtypes=(torch.float32, torch.float64, torch.int64), +): + for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): + yield make_bounding_box_loader(**params, spatial_size=spatial_size) + + +make_multiple_bounding_boxes = from_loaders(make_bounding_box_loaders) + + +class MaskLoader(TensorLoader): + pass + + +def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): + # This produces "detection" masks, i.e. 
`(*, N, H, W)`, where `N` denotes the number of objects + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, num_objects, height, width = shape + return make_detection_mask( + (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) + + +def make_detection_mask_loaders( + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): + yield make_detection_mask_loader(**params) + + +make_detection_masks = from_loaders(make_detection_mask_loaders) + + +def make_segmentation_mask_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 +): + # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, height, width = shape + return make_segmentation_mask( + (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) + + +def make_segmentation_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): + yield make_segmentation_mask_loader(**params) + + +make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) + + +def make_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) + yield from make_segmentation_mask_loaders( + sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes + ) + + +make_masks = from_loaders(make_mask_loaders) + + +class VideoLoader(ImageLoader): + pass + + +def make_video_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + num_frames=3, + extra_dims=(), + dtype=torch.uint8, +): + size = _parse_size(size) + + def fn(shape, dtype, device, memory_format): + *batch_dims, num_frames, _, height, width = shape + return make_video( + (height, width), + num_frames=num_frames, + batch_dims=batch_dims, + color_space=color_space, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) + + +def make_video_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "RGB", + ), + num_frames=(1, 0, 3), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8, torch.float32, torch.float64), +): + for params in combinations_grid( + size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes + ): + yield make_video_loader(**params) + + +make_videos = from_loaders(make_video_loaders) + + +class TestMark: + def __init__( + self, + # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is + # no test class, i.e. a standalone test function, use `None`. + test_id, + # `pytest.mark.*` to apply, e.g. 
`pytest.mark.skip` or `pytest.mark.xfail` + mark, + *, + # Callable, that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be + # applied. If omitted, defaults to always apply. + condition=None, + ): + self.test_id = test_id + self.mark = mark + self.condition = condition or (lambda args_kwargs: True) + + +def mark_framework_limitation(test_id, reason, condition=None): + # The purpose of this function is to have a single entry point for skip marks that are only there, because the test + # framework cannot handle the kernel in general or a specific parameter combination. + # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is + # still justified. + # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, + # we are wasting CI resources for no reason for most of the time + return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition) + + +class InfoBase: + def __init__( + self, + *, + # Identifier if the info that shows up the parametrization. + id, + # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. + # See the `TestMark` class for details + test_marks=None, + # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see + # `TestMark`), the dtype, and the device. + closeness_kwargs=None, + ): + self.id = id + + self.test_marks = test_marks or [] + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) + + self.closeness_kwargs = closeness_kwargs or dict() + + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] + + def get_closeness_kwargs(self, test_id, *, dtype, device): + if not (isinstance(test_id, tuple) and len(test_id) == 2): + msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name" + if callable(test_id): + msg += ". Did you forget to add the `test_id` fixture to parameters of the test?" + else: + msg += f", but got {test_id} instead." 
+ raise pytest.UsageError(msg) + if isinstance(device, torch.device): + device = device.type + return self.closeness_kwargs.get((test_id, dtype, device), dict()) + + +class ArgsKwargs: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def __iter__(self): + yield self.args + yield self.kwargs + + def load(self, device="cpu"): + return ArgsKwargs( + *(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args), + **{ + keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg + for keyword, arg in self.kwargs.items() + }, + ) + + +def parametrized_error_message(*args, **kwargs): + def to_str(obj): + if isinstance(obj, torch.Tensor) and obj.numel() > 30: + return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})" + elif isinstance(obj, enum.Enum): + return f"{type(obj).__name__}.{obj.name}" + else: + return repr(obj) + + if args or kwargs: + postfix = "\n".join( + [ + "", + "Failure happened for the following parameters:", + "", + *[to_str(arg) for arg in args], + *[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()], + ] + ) + else: + postfix = "" + + def wrapper(msg): + return msg + postfix + + return wrapper diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp index d07844a5e27..09a0618ad1f 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp @@ -155,7 +155,7 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { #endif // #if !JPEG_FOUND int64_t _jpeg_version() { -#ifdef JPEG_FOUND +#if JPEG_FOUND return JPEG_LIB_VERSION; #else return -1; diff --git a/torchvision/datapoints/_datapoint.py b/torchvision/datapoints/_datapoint.py index 59b017b4417..11f869103b0 100644 --- a/torchvision/datapoints/_datapoint.py +++ b/torchvision/datapoints/_datapoint.py @@ -17,7 +17,7 @@ class Datapoint(torch.Tensor): You probably don't want to use this class unless you're defining your own custom Datapoints. See - :ref:`sphx_glr_auto_examples_plot_custom_datapoints.py` for details. + :ref:`sphx_glr_auto_examples_v2_transforms_plot_custom_datapoints.py` for details. """ @staticmethod diff --git a/torchvision/extension.py b/torchvision/extension.py index c417c54f954..67801056e88 100644 --- a/torchvision/extension.py +++ b/torchvision/extension.py @@ -74,9 +74,9 @@ def _check_cuda_version(): t_version = torch_version_cuda.split(".") t_major = int(t_version[0]) t_minor = int(t_version[1]) - if t_major != tv_major or t_minor != tv_minor: + if t_major != tv_major: raise RuntimeError( - "Detected that PyTorch and torchvision were compiled with different CUDA versions. " + "Detected that PyTorch and torchvision were compiled with different CUDA major versions. " f"PyTorch has CUDA Version={t_major}.{t_minor} and torchvision has " f"CUDA Version={tv_major}.{tv_minor}. " "Please reinstall the torchvision that matches your PyTorch install." 
diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 81f726a2dbd..eaa181b6717 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -9,7 +9,7 @@ from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform from torchvision.transforms.v2.functional._geometry import _check_interpolation -from torchvision.transforms.v2.utils import is_simple_tensor +from torchvision.transforms.v2.utils import is_pure_tensor class SimpleCopyPaste(Transform): @@ -109,7 +109,7 @@ def _extract_image_targets( # with List[image], List[BoundingBoxes], List[Mask], List[Label] images, bboxes, masks, labels = [], [], [], [] for obj in flat_sample: - if isinstance(obj, datapoints.Image) or is_simple_tensor(obj): + if isinstance(obj, datapoints.Image) or is_pure_tensor(obj): images.append(obj) elif isinstance(obj, PIL.Image.Image): images.append(F.to_image(obj)) @@ -146,7 +146,7 @@ def _insert_outputs( elif isinstance(obj, PIL.Image.Image): flat_sample[i] = F.to_pil_image(output_images[c0]) c0 += 1 - elif is_simple_tensor(obj): + elif is_pure_tensor(obj): flat_sample[i] = output_images[c0] c0 += 1 elif isinstance(obj, datapoints.BoundingBoxes): diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 8d8e7eb42f0..1350b6d1bd1 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -7,7 +7,7 @@ from torchvision.prototype.datapoints import Label, OneHotLabel from torchvision.transforms.v2 import functional as F, Transform from torchvision.transforms.v2._utils import _FillType, _get_fill, _setup_fill_arg, _setup_size -from torchvision.transforms.v2.utils import get_bounding_boxes, has_any, is_simple_tensor, query_size +from torchvision.transforms.v2.utils import get_bounding_boxes, has_any, is_pure_tensor, query_size class FixedSizeCrop(Transform): @@ -32,7 +32,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: flat_inputs, PIL.Image.Image, datapoints.Image, - is_simple_tensor, + is_pure_tensor, datapoints.Video, ): raise TypeError( diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index f1b859aac03..0dd495ab05b 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -8,7 +8,7 @@ from torchvision import datapoints from torchvision.transforms.v2 import Transform -from torchvision.transforms.v2.utils import is_simple_tensor +from torchvision.transforms.v2.utils import is_pure_tensor T = TypeVar("T") @@ -25,7 +25,7 @@ def _get_defaultdict(default: T) -> Dict[Any, T]: class PermuteDimensions(Transform): - _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) + _transformed_types = (is_pure_tensor, datapoints.Image, datapoints.Video) def __init__(self, dims: Union[Sequence[int], Dict[Type, Optional[Sequence[int]]]]) -> None: super().__init__() @@ -47,7 +47,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> torch.Tensor: class TransposeDimensions(Transform): - _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) + _transformed_types = (is_pure_tensor, datapoints.Image, datapoints.Video) def __init__(self, dims: Union[Tuple[int, int], Dict[Type, Optional[Tuple[int, int]]]]) -> None: super().__init__() diff --git a/torchvision/transforms/v2/__init__.py b/torchvision/transforms/v2/__init__.py index 
38da78fa4d7..bc15c96b51b 100644
--- a/torchvision/transforms/v2/__init__.py
+++ b/torchvision/transforms/v2/__init__.py
@@ -52,7 +52,7 @@
     ToDtype,
 )
 from ._temporal import UniformTemporalSubsample
-from ._type_conversion import PILToTensor, ToImage, ToPILImage
+from ._type_conversion import PILToTensor, ToImage, ToPILImage, ToPureTensor
 from ._deprecated import ToTensor  # usort: skip
diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py
index f64ae564b54..a6af96a5ef6 100644
--- a/torchvision/transforms/v2/_augment.py
+++ b/torchvision/transforms/v2/_augment.py
@@ -12,7 +12,7 @@
 from ._transform import _RandomApplyTransform, Transform
 from ._utils import _parse_labels_getter
-from .utils import has_any, is_simple_tensor, query_chw, query_size
+from .utils import has_any, is_pure_tensor, query_chw, query_size
 class RandomErasing(_RandomApplyTransform):
@@ -217,7 +217,7 @@ class MixUp(_BaseMixUpCutMix):
     .. note::
         This transform is meant to be used on **batches** of samples, not
         individual images. See
-        :ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage
+        :ref:`sphx_glr_auto_examples_v2_transforms_plot_cutmix_mixup.py` for detailed usage
         examples.
         The sample pairing is deterministic and done by matching consecutive
         samples in the batch, so the batch needs to be shuffled (this is an
@@ -243,7 +243,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         if inpt is params["labels"]:
             return self._mixup_label(inpt, lam=lam)
-        elif isinstance(inpt, (datapoints.Image, datapoints.Video)) or is_simple_tensor(inpt):
+        elif isinstance(inpt, (datapoints.Image, datapoints.Video)) or is_pure_tensor(inpt):
             self._check_image_or_video(inpt, batch_size=params["batch_size"])
             output = inpt.roll(1, 0).mul_(1.0 - lam).add_(inpt.mul(lam))
@@ -267,7 +267,7 @@ class CutMix(_BaseMixUpCutMix):
     .. note::
         This transform is meant to be used on **batches** of samples, not
         individual images. See
-        :ref:`sphx_glr_auto_examples_plot_cutmix_mixup.py` for detailed usage
+        :ref:`sphx_glr_auto_examples_v2_transforms_plot_cutmix_mixup.py` for detailed usage
         examples.
         The sample pairing is deterministic and done by matching consecutive
         samples in the batch, so the batch needs to be shuffled (this is an
@@ -310,7 +310,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
     def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         if inpt is params["labels"]:
             return self._mixup_label(inpt, lam=params["lam_adjusted"])
-        elif isinstance(inpt, (datapoints.Image, datapoints.Video)) or is_simple_tensor(inpt):
+        elif isinstance(inpt, (datapoints.Image, datapoints.Video)) or is_pure_tensor(inpt):
             self._check_image_or_video(inpt, batch_size=params["batch_size"])
             x1, y1, x2, y2 = params["box"]
diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py
index 687a2396e67..097e90fc4ab 100644
--- a/torchvision/transforms/v2/_auto_augment.py
+++ b/torchvision/transforms/v2/_auto_augment.py
@@ -13,7 +13,7 @@
 from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT
 from ._utils import _get_fill, _setup_fill_arg
-from .utils import check_type, is_simple_tensor
+from .utils import check_type, is_pure_tensor
 ImageOrVideo = Union[torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.Video]
@@ -50,7 +50,7 @@ def _flatten_and_extract_image_or_video(
                 (
                     datapoints.Image,
                     PIL.Image.Image,
-                    is_simple_tensor,
+                    is_pure_tensor,
                     datapoints.Video,
                 ),
             ):
diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py
index f441a0b747b..0be62ae8a12 100644
--- a/torchvision/transforms/v2/_geometry.py
+++ b/torchvision/transforms/v2/_geometry.py
@@ -24,7 +24,7 @@
     _setup_float_or_seq,
     _setup_size,
 )
-from .utils import get_bounding_boxes, has_all, has_any, is_simple_tensor, query_size
+from .utils import get_bounding_boxes, has_all, has_any, is_pure_tensor, query_size
 class RandomHorizontalFlip(_RandomApplyTransform):
@@ -1149,7 +1149,7 @@ def __init__(
     def _check_inputs(self, flat_inputs: List[Any]) -> None:
         if not (
             has_all(flat_inputs, datapoints.BoundingBoxes)
-            and has_any(flat_inputs, PIL.Image.Image, datapoints.Image, is_simple_tensor)
+            and has_any(flat_inputs, PIL.Image.Image, datapoints.Image, is_pure_tensor)
         ):
             raise TypeError(
                 f"{type(self).__name__}() requires input sample to contain tensor or PIL images "
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index ef9ac5fd0c7..405fbc6c43a 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -10,7 +10,7 @@
 from torchvision.transforms.v2 import functional as F, Transform
 from ._utils import _parse_labels_getter, _setup_float_or_seq, _setup_size
-from .utils import get_bounding_boxes, has_any, is_simple_tensor
+from .utils import get_bounding_boxes, has_any, is_pure_tensor
 # TODO: do we want/need to expose this?
@@ -75,7 +75,7 @@ class LinearTransformation(Transform):
     _v1_transform_cls = _transforms.LinearTransformation
-    _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
+    _transformed_types = (is_pure_tensor, datapoints.Image, datapoints.Video)
     def __init__(self, transformation_matrix: torch.Tensor, mean_vector: torch.Tensor):
         super().__init__()
@@ -264,7 +264,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         if isinstance(self.dtype, torch.dtype):
             # For consistency / BC with ConvertImageDtype, we only care about images or videos when dtype
             # is a simple torch.dtype
-            if not is_simple_tensor(inpt) and not isinstance(inpt, (datapoints.Image, datapoints.Video)):
+            if not is_pure_tensor(inpt) and not isinstance(inpt, (datapoints.Image, datapoints.Video)):
                 return inpt
             dtype: Optional[torch.dtype] = self.dtype
@@ -281,7 +281,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
                 'e.g. dtype={datapoints.Mask: torch.int64, "others": None} to pass-through the rest of the inputs.'
             )
-        supports_scaling = is_simple_tensor(inpt) or isinstance(inpt, (datapoints.Image, datapoints.Video))
+        supports_scaling = is_pure_tensor(inpt) or isinstance(inpt, (datapoints.Image, datapoints.Video))
         if dtype is None:
             if self.scale and supports_scaling:
                 warnings.warn(
diff --git a/torchvision/transforms/v2/_transform.py b/torchvision/transforms/v2/_transform.py
index d4ee8af556d..e9af4b426fa 100644
--- a/torchvision/transforms/v2/_transform.py
+++ b/torchvision/transforms/v2/_transform.py
@@ -8,7 +8,7 @@
 from torch import nn
 from torch.utils._pytree import tree_flatten, tree_unflatten
 from torchvision import datapoints
-from torchvision.transforms.v2.utils import check_type, has_any, is_simple_tensor
+from torchvision.transforms.v2.utils import check_type, has_any, is_pure_tensor
 from torchvision.utils import _log_api_usage_once
 from .functional._utils import _get_kernel
@@ -55,32 +55,32 @@ def forward(self, *inputs: Any) -> Any:
         return tree_unflatten(flat_outputs, spec)
     def _needs_transform_list(self, flat_inputs: List[Any]) -> List[bool]:
-        # Below is a heuristic on how to deal with simple tensor inputs:
-        # 1. Simple tensors, i.e. tensors that are not a datapoint, are passed through if there is an explicit image
+        # Below is a heuristic on how to deal with pure tensor inputs:
+        # 1. Pure tensors, i.e. tensors that are not a datapoint, are passed through if there is an explicit image
         #    (`datapoints.Image` or `PIL.Image.Image`) or video (`datapoints.Video`) in the sample.
-        # 2. If there is no explicit image or video in the sample, only the first encountered simple tensor is
+        # 2. If there is no explicit image or video in the sample, only the first encountered pure tensor is
         #    transformed as image, while the rest is passed through. The order is defined by the returned `flat_inputs`
         #    of `tree_flatten`, which recurses depth-first through the input.
         #
         # This heuristic stems from two requirements:
-        # 1. We need to keep BC for single input simple tensors and treat them as images.
-        # 2. We don't want to treat all simple tensors as images, because some datasets like `CelebA` or `Widerface`
+        # 1. We need to keep BC for single input pure tensors and treat them as images.
+        # 2. We don't want to treat all pure tensors as images, because some datasets like `CelebA` or `Widerface`
         #    return supplemental numerical data as tensors that cannot be transformed as images.
         #
         # The heuristic should work well for most people in practice. The only case where it doesn't is if someone
-        # tries to transform multiple simple tensors at the same time, expecting them all to be treated as images.
+        # tries to transform multiple pure tensors at the same time, expecting them all to be treated as images.
         # However, this case wasn't supported by transforms v1 either, so there is no BC concern.
         needs_transform_list = []
-        transform_simple_tensor = not has_any(flat_inputs, datapoints.Image, datapoints.Video, PIL.Image.Image)
+        transform_pure_tensor = not has_any(flat_inputs, datapoints.Image, datapoints.Video, PIL.Image.Image)
         for inpt in flat_inputs:
             needs_transform = True
             if not check_type(inpt, self._transformed_types):
                 needs_transform = False
-            elif is_simple_tensor(inpt):
-                if transform_simple_tensor:
-                    transform_simple_tensor = False
+            elif is_pure_tensor(inpt):
+                if transform_pure_tensor:
+                    transform_pure_tensor = False
                 else:
                     needs_transform = False
             needs_transform_list.append(needs_transform)
diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py
index aec82f46f14..26d23375400 100644
--- a/torchvision/transforms/v2/_type_conversion.py
+++ b/torchvision/transforms/v2/_type_conversion.py
@@ -7,7 +7,7 @@
 from torchvision import datapoints
 from torchvision.transforms.v2 import functional as F, Transform
-from torchvision.transforms.v2.utils import is_simple_tensor
+from torchvision.transforms.v2.utils import is_pure_tensor
 class PILToTensor(Transform):
@@ -30,12 +30,12 @@ class ToImage(Transform):
     """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image` ; this does not scale values.
-    .. v2betastatus:: ToImageTensor transform
+    .. v2betastatus:: ToImage transform
     This transform does not support torchscript.
     """
-    _transformed_types = (is_simple_tensor, PIL.Image.Image, np.ndarray)
+    _transformed_types = (is_pure_tensor, PIL.Image.Image, np.ndarray)
     def _transform(
         self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: Dict[str, Any]
@@ -46,7 +46,7 @@ def _transform(
 class ToPILImage(Transform):
     """[BETA] Convert a tensor or an ndarray to PIL Image - this does not scale values.
-    .. v2betastatus:: ToImagePIL transform
+    .. v2betastatus:: ToPILImage transform
     This transform does not support torchscript.
@@ -65,7 +65,7 @@ class ToPILImage(Transform):
     .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
     """
-    _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray)
+    _transformed_types = (is_pure_tensor, datapoints.Image, np.ndarray)
     def __init__(self, mode: Optional[str] = None) -> None:
         super().__init__()
@@ -75,3 +75,17 @@ def _transform(
         self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: Dict[str, Any]
     ) -> PIL.Image.Image:
         return F.to_pil_image(inpt, mode=self.mode)
+
+
+class ToPureTensor(Transform):
+    """[BETA] Convert all datapoints to pure tensors, removing associated metadata (if any).
+
+    .. v2betastatus:: ToPureTensor transform
+
+    This doesn't scale or change the values, only the type.
+    """
+
+    _transformed_types = (datapoints.Datapoint,)
+
+    def _transform(self, inpt: Any, params: Dict[str, Any]) -> torch.Tensor:
+        return inpt.as_subclass(torch.Tensor)
diff --git a/torchvision/transforms/v2/functional/__init__.py b/torchvision/transforms/v2/functional/__init__.py
index 3510962ff3a..5d3a18a9151 100644
--- a/torchvision/transforms/v2/functional/__init__.py
+++ b/torchvision/transforms/v2/functional/__init__.py
@@ -1,6 +1,6 @@
 from torchvision.transforms import InterpolationMode  # usort: skip
-from ._utils import is_simple_tensor, register_kernel  # usort: skip
+from ._utils import is_pure_tensor, register_kernel  # usort: skip
 from ._meta import (
     clamp_bounding_boxes,
diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py
index f2675728ce3..fc4dfb60d60 100644
--- a/torchvision/transforms/v2/functional/_meta.py
+++ b/torchvision/transforms/v2/functional/_meta.py
@@ -8,7 +8,7 @@
 from torchvision.utils import _log_api_usage_once
-from ._utils import _get_kernel, _register_kernel_internal, is_simple_tensor
+from ._utils import _get_kernel, _register_kernel_internal, is_pure_tensor
 def get_dimensions(inpt: torch.Tensor) -> List[int]:
@@ -203,7 +203,7 @@ def convert_format_bounding_boxes(
     new_format: Optional[BoundingBoxFormat] = None,
     inplace: bool = False,
 ) -> torch.Tensor:
-    # This being a kernel / functional hybrid, we need an option to pass `old_format` explicitly for simple tensor
+    # This being a kernel / functional hybrid, we need an option to pass `old_format` explicitly for pure tensor
     # inputs as well as extract it from `datapoints.BoundingBoxes` inputs. However, putting a default value on
     # `old_format` means we also need to put one on `new_format` to have syntactically correct Python. Here we mimic the
     # default error that would be thrown if `new_format` had no default value.
@@ -213,9 +213,9 @@ def convert_format_bounding_boxes(
     if not torch.jit.is_scripting():
         _log_api_usage_once(convert_format_bounding_boxes)
-    if torch.jit.is_scripting() or is_simple_tensor(inpt):
+    if torch.jit.is_scripting() or is_pure_tensor(inpt):
         if old_format is None:
-            raise ValueError("For simple tensor inputs, `old_format` has to be passed.")
+            raise ValueError("For pure tensor inputs, `old_format` has to be passed.")
         return _convert_format_bounding_boxes(inpt, old_format=old_format, new_format=new_format, inplace=inplace)
     elif isinstance(inpt, datapoints.BoundingBoxes):
         if old_format is not None:
@@ -256,10 +256,10 @@ def clamp_bounding_boxes(
     if not torch.jit.is_scripting():
         _log_api_usage_once(clamp_bounding_boxes)
-    if torch.jit.is_scripting() or is_simple_tensor(inpt):
+    if torch.jit.is_scripting() or is_pure_tensor(inpt):
         if format is None or canvas_size is None:
-            raise ValueError("For simple tensor inputs, `format` and `canvas_size` has to be passed.")
+            raise ValueError("For pure tensor inputs, `format` and `canvas_size` has to be passed.")
         return _clamp_bounding_boxes(inpt, format=format, canvas_size=canvas_size)
     elif isinstance(inpt, datapoints.BoundingBoxes):
         if format is not None or canvas_size is not None:
diff --git a/torchvision/transforms/v2/functional/_utils.py b/torchvision/transforms/v2/functional/_utils.py
index 7fc48929917..51b76f59270 100644
--- a/torchvision/transforms/v2/functional/_utils.py
+++ b/torchvision/transforms/v2/functional/_utils.py
@@ -8,7 +8,7 @@
 _FillTypeJIT = Optional[List[float]]
-def is_simple_tensor(inpt: Any) -> bool:
+def is_pure_tensor(inpt: Any) -> bool:
     return isinstance(inpt, torch.Tensor) and not isinstance(inpt, datapoints.Datapoint)
@@ -69,7 +69,7 @@ def _name_to_functional(name):
 def register_kernel(functional, datapoint_cls):
     """Decorate a kernel to register it for a functional and a (custom) datapoint type.
-    See :ref:`sphx_glr_auto_examples_plot_custom_datapoints.py` for usage
+    See :ref:`sphx_glr_auto_examples_v2_transforms_plot_custom_datapoints.py` for usage
     details.
     """
     if isinstance(functional, str):
diff --git a/torchvision/transforms/v2/utils.py b/torchvision/transforms/v2/utils.py
index 1d9219fb4f5..1e4ff2d05aa 100644
--- a/torchvision/transforms/v2/utils.py
+++ b/torchvision/transforms/v2/utils.py
@@ -6,7 +6,7 @@
 from torchvision import datapoints
 from torchvision._utils import sequence_to_str
-from torchvision.transforms.v2.functional import get_dimensions, get_size, is_simple_tensor
+from torchvision.transforms.v2.functional import get_dimensions, get_size, is_pure_tensor
 def get_bounding_boxes(flat_inputs: List[Any]) -> datapoints.BoundingBoxes:
@@ -21,7 +21,7 @@ def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]:
     chws = {
         tuple(get_dimensions(inpt))
         for inpt in flat_inputs
-        if check_type(inpt, (is_simple_tensor, datapoints.Image, PIL.Image.Image, datapoints.Video))
+        if check_type(inpt, (is_pure_tensor, datapoints.Image, PIL.Image.Image, datapoints.Video))
     }
     if not chws:
         raise TypeError("No image or video was found in the sample")
@@ -38,7 +38,7 @@ def query_size(flat_inputs: List[Any]) -> Tuple[int, int]:
         if check_type(
             inpt,
             (
-                is_simple_tensor,
+                is_pure_tensor,
                 datapoints.Image,
                 PIL.Image.Image,
                 datapoints.Video,
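Taken together, the hunks above rename `is_simple_tensor` to `is_pure_tensor` across `transforms.v2` and add a `ToPureTensor` transform that converts datapoints back to plain tensors. A minimal sketch of how the renamed helper, the new transform, and the pure-tensor pass-through heuristic might be exercised (assuming the beta `torchvision.transforms.v2` and `torchvision.datapoints` APIs from this branch; the pipeline below is illustrative and not part of the patch):

```python
import torch
from torchvision import datapoints
from torchvision.transforms import v2
from torchvision.transforms.v2.functional import is_pure_tensor

# A datapoint such as Image is a torch.Tensor subclass, so it is *not* a pure tensor.
img = datapoints.Image(torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8))
assert not is_pure_tensor(img)
assert is_pure_tensor(img.as_subclass(torch.Tensor))

# ToPureTensor at the end of a pipeline strips the datapoint wrapper again.
pipeline = v2.Compose(
    [
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.ToPureTensor(),
    ]
)
out = pipeline(img)
assert type(out) is torch.Tensor  # plain tensor, no datapoint metadata left

# The _needs_transform_list heuristic: with an explicit Image in the sample,
# pure tensors ride along untouched instead of being transformed as images.
sample = {"image": datapoints.Image(torch.zeros(3, 8, 8)), "meta": torch.tensor([1, 2, 3])}
flipped = v2.RandomHorizontalFlip(p=1.0)(sample)
assert torch.equal(flipped["meta"], sample["meta"])
```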