diff --git a/test/common_utils.py b/test/common_utils.py index 9713901bdcf..c815786b586 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -1,7 +1,4 @@ -import collections.abc import contextlib -import dataclasses -import enum import functools import itertools import os @@ -12,12 +9,9 @@ import sys import tempfile import warnings -from collections import defaultdict from subprocess import CalledProcessError, check_output, STDOUT -from typing import Callable, Sequence, Tuple, Union import numpy as np - import PIL.Image import pytest import torch @@ -27,7 +21,7 @@ from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair from torchvision import datapoints, io from torchvision.transforms._functional_tensor import _max_value as get_max_value -from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image +from torchvision.transforms.v2.functional import to_image, to_pil_image IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) @@ -363,132 +357,7 @@ def assert_close( assert_equal = functools.partial(assert_close, rtol=0, atol=0) -def parametrized_error_message(*args, **kwargs): - def to_str(obj): - if isinstance(obj, torch.Tensor) and obj.numel() > 30: - return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})" - elif isinstance(obj, enum.Enum): - return f"{type(obj).__name__}.{obj.name}" - else: - return repr(obj) - - if args or kwargs: - postfix = "\n".join( - [ - "", - "Failure happened for the following parameters:", - "", - *[to_str(arg) for arg in args], - *[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()], - ] - ) - else: - postfix = "" - - def wrapper(msg): - return msg + postfix - - return wrapper - - -class ArgsKwargs: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - def __iter__(self): - yield self.args - yield self.kwargs - - def load(self, device="cpu"): - return ArgsKwargs( - *(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args), - **{ - keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg - for keyword, arg in self.kwargs.items() - }, - ) - - -# new v2 default DEFAULT_SIZE = (17, 11) -# old v2 defaults -DEFAULT_SQUARE_SPATIAL_SIZE = 15 -DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) -DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) -DEFAULT_SPATIAL_SIZES = ( - DEFAULT_LANDSCAPE_SPATIAL_SIZE, - DEFAULT_PORTRAIT_SPATIAL_SIZE, - DEFAULT_SQUARE_SPATIAL_SIZE, -) - - -def _parse_size(size, *, name="size"): - if size == "random": - raise ValueError("This should never happen") - elif isinstance(size, int) and size > 0: - return (size, size) - elif ( - isinstance(size, collections.abc.Sequence) - and len(size) == 2 - and all(isinstance(length, int) and length > 0 for length in size) - ): - return tuple(size) - else: - raise pytest.UsageError( - f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," - f"but got {size} instead." 
- ) - - -VALID_EXTRA_DIMS = ((), (4,), (2, 3)) -DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) - -DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) - - -def from_loader(loader_fn): - def wrapper(*args, **kwargs): - device = kwargs.pop("device", "cpu") - loader = loader_fn(*args, **kwargs) - return loader.load(device) - - return wrapper - - -def from_loaders(loaders_fn): - def wrapper(*args, **kwargs): - device = kwargs.pop("device", "cpu") - loaders = loaders_fn(*args, **kwargs) - for loader in loaders: - yield loader.load(device) - - return wrapper - - -@dataclasses.dataclass -class TensorLoader: - fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] - shape: Sequence[int] - dtype: torch.dtype - - def load(self, device): - return self.fn(self.shape, self.dtype, device) - - -@dataclasses.dataclass -class ImageLoader(TensorLoader): - spatial_size: Tuple[int, int] = dataclasses.field(init=False) - num_channels: int = dataclasses.field(init=False) - memory_format: torch.memory_format = torch.contiguous_format - canvas_size: Tuple[int, int] = dataclasses.field(init=False) - - def __post_init__(self): - self.spatial_size = self.canvas_size = self.shape[-2:] - self.num_channels = self.shape[-3] - - def load(self, device): - return self.fn(self.shape, self.dtype, device, memory_format=self.memory_format) NUM_CHANNELS_MAP = { @@ -499,13 +368,6 @@ def load(self, device): } -def get_num_channels(color_space): - num_channels = NUM_CHANNELS_MAP.get(color_space) - if not num_channels: - raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") - return num_channels - - def make_image( size=DEFAULT_SIZE, *, @@ -515,10 +377,11 @@ def make_image( device="cpu", memory_format=torch.contiguous_format, ): + num_channels = NUM_CHANNELS_MAP[color_space] dtype = dtype or torch.uint8 max_value = get_max_value(dtype) data = torch.testing.make_tensor( - (*batch_dims, get_num_channels(color_space), *size), + (*batch_dims, num_channels, *size), low=0, high=max_value, dtype=dtype, @@ -539,109 +402,7 @@ def make_image_pil(*args, **kwargs): return to_pil_image(make_image(*args, **kwargs)) -def make_image_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - *, - color_space="RGB", - extra_dims=(), - dtype=torch.float32, - constant_alpha=True, - memory_format=torch.contiguous_format, -): - if not constant_alpha: - raise ValueError("This should never happen") - size = _parse_size(size) - num_channels = get_num_channels(color_space) - - def fn(shape, dtype, device, memory_format): - *batch_dims, _, height, width = shape - return make_image( - (height, width), - color_space=color_space, - batch_dims=batch_dims, - dtype=dtype, - device=device, - memory_format=memory_format, - ) - - return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, memory_format=memory_format) - - -def make_image_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - color_spaces=( - "GRAY", - "GRAY_ALPHA", - "RGB", - "RGBA", - ), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.float32, torch.float64, torch.uint8), - constant_alpha=True, -): - for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): - yield make_image_loader(**params, constant_alpha=constant_alpha) - - -make_images = from_loaders(make_image_loaders) - - -def make_image_loader_for_interpolation( - size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format -): - size = _parse_size(size) - num_channels = 
get_num_channels(color_space) - - def fn(shape, dtype, device, memory_format): - height, width = shape[-2:] - - image_pil = ( - PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") - .resize((width, height)) - .convert( - { - "GRAY": "L", - "GRAY_ALPHA": "LA", - "RGB": "RGB", - "RGBA": "RGBA", - }[color_space] - ) - ) - - image_tensor = to_image(image_pil) - if memory_format == torch.contiguous_format: - image_tensor = image_tensor.to(device=device, memory_format=memory_format, copy=True) - else: - image_tensor = image_tensor.to(device=device) - image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) - - return datapoints.Image(image_tensor) - - return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) - - -def make_image_loaders_for_interpolation( - sizes=((233, 147),), - color_spaces=("RGB",), - dtypes=(torch.uint8,), - memory_formats=(torch.contiguous_format, torch.channels_last), -): - for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes, memory_format=memory_formats): - yield make_image_loader_for_interpolation(**params) - - -@dataclasses.dataclass -class BoundingBoxesLoader(TensorLoader): - format: datapoints.BoundingBoxFormat - spatial_size: Tuple[int, int] - canvas_size: Tuple[int, int] = dataclasses.field(init=False) - - def __post_init__(self): - self.canvas_size = self.spatial_size - - -def make_bounding_box( +def make_bounding_boxes( canvas_size=DEFAULT_SIZE, *, format=datapoints.BoundingBoxFormat.XYXY, @@ -687,42 +448,6 @@ def sample_position(values, max_value): ) -def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): - if isinstance(format, str): - format = datapoints.BoundingBoxFormat[format] - - spatial_size = _parse_size(spatial_size, name="spatial_size") - - def fn(shape, dtype, device): - *batch_dims, num_coordinates = shape - if num_coordinates != 4: - raise pytest.UsageError() - - return make_bounding_box( - format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return BoundingBoxesLoader(fn, shape=(*extra_dims[-1:], 4), dtype=dtype, format=format, spatial_size=spatial_size) - - -def make_bounding_box_loaders( - *, - extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), - formats=tuple(datapoints.BoundingBoxFormat), - spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - dtypes=(torch.float32, torch.float64, torch.int64), -): - for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, spatial_size=spatial_size) - - -make_bounding_boxes = from_loaders(make_bounding_box_loaders) - - -class MaskLoader(TensorLoader): - pass - - def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" return datapoints.Mask( @@ -736,32 +461,6 @@ def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtyp ) -def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): - # This produces "detection" masks, i.e. 
`(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_size(size) - - def fn(shape, dtype, device): - *batch_dims, num_objects, height, width = shape - return make_detection_mask( - (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) - - -def make_detection_mask_loaders( - sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, 5), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): - yield make_detection_mask_loader(**params) - - -make_detection_masks = from_loaders(make_detection_mask_loaders) - - def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" return datapoints.Mask( @@ -775,56 +474,6 @@ def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=( ) -def make_segmentation_mask_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 -): - # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values - size = _parse_size(size) - - def fn(shape, dtype, device): - *batch_dims, height, width = shape - return make_segmentation_mask( - (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) - - -def make_segmentation_mask_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - num_categories=(1, 2, 10), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): - yield make_segmentation_mask_loader(**params) - - -make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) - - -def make_mask_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, 5), - num_categories=(1, 2, 10), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) - yield from make_segmentation_mask_loaders( - sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes - ) - - -make_masks = from_loaders(make_mask_loaders) - - -class VideoLoader(ImageLoader): - pass - - def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): return datapoints.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) @@ -833,120 +482,6 @@ def make_video_tensor(*args, **kwargs): return make_video(*args, **kwargs).as_subclass(torch.Tensor) -def make_video_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - *, - color_space="RGB", - num_frames=3, - extra_dims=(), - dtype=torch.uint8, -): - size = _parse_size(size) - - def fn(shape, dtype, device, memory_format): - *batch_dims, num_frames, _, height, width = shape - return make_video( - (height, width), - num_frames=num_frames, - batch_dims=batch_dims, - color_space=color_space, - dtype=dtype, - device=device, - memory_format=memory_format, - ) - - return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) - - -def make_video_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - color_spaces=( - "GRAY", - "RGB", - ), - num_frames=(1, 0, 3), - 
extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8, torch.float32, torch.float64), -): - for params in combinations_grid( - size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes - ): - yield make_video_loader(**params) - - -make_videos = from_loaders(make_video_loaders) - - -class TestMark: - def __init__( - self, - # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is - # no test class, i.e. a standalone test function, use `None`. - test_id, - # `pytest.mark.*` to apply, e.g. `pytest.mark.skip` or `pytest.mark.xfail` - mark, - *, - # Callable, that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be - # applied. If omitted, defaults to always apply. - condition=None, - ): - self.test_id = test_id - self.mark = mark - self.condition = condition or (lambda args_kwargs: True) - - -def mark_framework_limitation(test_id, reason, condition=None): - # The purpose of this function is to have a single entry point for skip marks that are only there, because the test - # framework cannot handle the kernel in general or a specific parameter combination. - # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is - # still justified. - # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, - # we are wasting CI resources for no reason for most of the time - return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition) - - -class InfoBase: - def __init__( - self, - *, - # Identifier if the info that shows up the parametrization. - id, - # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. - # See the `TestMark` class for details - test_marks=None, - # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see - # `TestMark`), the dtype, and the device. - closeness_kwargs=None, - ): - self.id = id - - self.test_marks = test_marks or [] - test_marks_map = defaultdict(list) - for test_mark in self.test_marks: - test_marks_map[test_mark.test_id].append(test_mark) - self._test_marks_map = dict(test_marks_map) - - self.closeness_kwargs = closeness_kwargs or dict() - - def get_marks(self, test_id, args_kwargs): - return [ - test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) - ] - - def get_closeness_kwargs(self, test_id, *, dtype, device): - if not (isinstance(test_id, tuple) and len(test_id) == 2): - msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name" - if callable(test_id): - msg += ". Did you forget to add the `test_id` fixture to parameters of the test?" - else: - msg += f", but got {test_id} instead." - raise pytest.UsageError(msg) - if isinstance(device, torch.device): - device = device.type - return self.closeness_kwargs.get((test_id, dtype, device), dict()) - - def assert_run_python_script(source_code): """Utility to check assertions in an independent Python subprocess. 
diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 8259246c0cb..acbe1a6a77a 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -4,12 +4,12 @@ import pytest import torch - -from common_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader from torch.nn.functional import one_hot from torchvision.prototype import datapoints +from transforms_v2_legacy_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader + @dataclasses.dataclass class LabelLoader(TensorLoader): diff --git a/test/test_datapoints.py b/test/test_datapoints.py index 4da2eb39383..1aeb2367752 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -2,7 +2,7 @@ import pytest import torch -from common_utils import assert_equal, make_bounding_box, make_image, make_segmentation_mask, make_video +from common_utils import assert_equal, make_bounding_boxes, make_image, make_segmentation_mask, make_video from PIL import Image from torchvision import datapoints @@ -68,7 +68,7 @@ def test_new_requires_grad(data, input_requires_grad, expected_requires_grad): assert datapoint.requires_grad is expected_requires_grad -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) def test_isinstance(make_input): assert isinstance(make_input(), torch.Tensor) @@ -80,7 +80,7 @@ def test_wrapping_no_copy(): assert image.data_ptr() == tensor.data_ptr() -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) def test_to_wrapping(make_input): dp = make_input() @@ -90,7 +90,7 @@ def test_to_wrapping(make_input): assert dp_to.dtype is torch.float64 -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_to_datapoint_reference(make_input, return_type): tensor = torch.rand((3, 16, 16), dtype=torch.float64) @@ -104,7 +104,7 @@ def test_to_datapoint_reference(make_input, return_type): assert type(tensor) is torch.Tensor -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_clone_wrapping(make_input, return_type): dp = make_input() @@ -116,7 +116,7 @@ def test_clone_wrapping(make_input, return_type): assert dp_clone.data_ptr() != dp.data_ptr() -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_requires_grad__wrapping(make_input, return_type): dp = make_input(dtype=torch.float) @@ -131,7 +131,7 @@ def test_requires_grad__wrapping(make_input, return_type): assert dp_requires_grad.requires_grad -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, 
make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_detach_wrapping(make_input, return_type): dp = make_input(dtype=torch.float).requires_grad_(True) @@ -170,7 +170,7 @@ def test_force_subclass_with_metadata(return_type): datapoints.set_return_type("tensor") -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_other_op_no_wrapping(make_input, return_type): dp = make_input() @@ -182,7 +182,7 @@ def test_other_op_no_wrapping(make_input, return_type): assert type(output) is (type(dp) if return_type == "datapoint" else torch.Tensor) -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize( "op", [ @@ -199,7 +199,7 @@ def test_no_tensor_output_op_no_wrapping(make_input, op): assert type(output) is not type(dp) -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) def test_inplace_op_no_wrapping(make_input, return_type): dp = make_input() @@ -212,7 +212,7 @@ def test_inplace_op_no_wrapping(make_input, return_type): assert type(dp) is original_type -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) def test_wrap(make_input): dp = make_input() @@ -225,7 +225,7 @@ def test_wrap(make_input): assert dp_new.data_ptr() == output.data_ptr() -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("requires_grad", [False, True]) def test_deepcopy(make_input, requires_grad): dp = make_input(dtype=torch.float) @@ -242,7 +242,7 @@ def test_deepcopy(make_input, requires_grad): assert dp_deepcopied.requires_grad is requires_grad -@pytest.mark.parametrize("make_input", [make_image, make_bounding_box, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) @pytest.mark.parametrize("return_type", ["Tensor", "datapoint"]) @pytest.mark.parametrize( "op", diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index bf45970df97..0410ecadc48 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -4,14 +4,7 @@ import pytest import torch -from common_utils import ( - assert_equal, - DEFAULT_EXTRA_DIMS, - make_bounding_box, - make_detection_mask, - make_image, - make_video, -) +from common_utils import assert_equal from prototype_common_utils import make_label @@ -19,6 +12,13 @@ from torchvision.prototype import datapoints, transforms from torchvision.transforms.v2.functional import clamp_bounding_boxes, InterpolationMode, 
pil_to_tensor, to_pil_image from torchvision.transforms.v2.utils import check_type, is_pure_tensor +from transforms_v2_legacy_utils import ( + DEFAULT_EXTRA_DIMS, + make_bounding_boxes, + make_detection_mask, + make_image, + make_video, +) BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] @@ -167,7 +167,7 @@ def test__get_params(self, mocker): flat_inputs = [ make_image(size=canvas_size, color_space="RGB"), - make_bounding_box(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=batch_shape), + make_bounding_boxes(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -202,7 +202,7 @@ def test__transform_culling(self, mocker): ), ) - bounding_boxes = make_bounding_box( + bounding_boxes = make_bounding_boxes( format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) ) masks = make_detection_mask(size=canvas_size, batch_dims=(batch_size,)) @@ -240,7 +240,7 @@ def test__transform_bounding_boxes_clamping(self, mocker): ), ) - bounding_boxes = make_bounding_box( + bounding_boxes = make_bounding_boxes( format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) ) mock = mocker.patch( @@ -283,7 +283,7 @@ class TestPermuteDimensions: def test_call(self, dims, inverse_dims): sample = dict( image=make_image(), - bounding_boxes=make_bounding_box(format=BoundingBoxFormat.XYXY), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), video=make_video(), str="str", int=0, @@ -327,7 +327,7 @@ class TestTransposeDimensions: def test_call(self, dims): sample = dict( image=make_image(), - bounding_boxes=make_bounding_box(format=BoundingBoxFormat.XYXY), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), video=make_video(), str="str", int=0, @@ -389,7 +389,7 @@ def make_datapoints(): pil_image = to_pil_image(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -398,7 +398,7 @@ def make_datapoints(): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -407,7 +407,7 @@ def make_datapoints(): datapoint_image = make_image(size=size, color_space="RGB") target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index d7a6f21bbe7..5752b323f79 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -11,25 +11,23 @@ import torch import torchvision.transforms.v2 
as transforms -from common_utils import ( - assert_equal, - assert_run_python_script, - cpu_and_cuda, - make_bounding_box, +from common_utils import assert_equal, assert_run_python_script, cpu_and_cuda +from torch.utils._pytree import tree_flatten, tree_unflatten +from torchvision import datapoints +from torchvision.ops.boxes import box_iou +from torchvision.transforms.functional import to_pil_image +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2.utils import check_type, is_pure_tensor, query_chw +from transforms_v2_legacy_utils import ( make_bounding_boxes, make_detection_mask, make_image, make_images, + make_multiple_bounding_boxes, make_segmentation_mask, make_video, make_videos, ) -from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import datapoints -from torchvision.ops.boxes import box_iou -from torchvision.transforms.functional import to_pil_image -from torchvision.transforms.v2 import functional as F -from torchvision.transforms.v2.utils import check_type, is_pure_tensor, query_chw def make_vanilla_tensor_images(*args, **kwargs): @@ -45,7 +43,7 @@ def make_pil_images(*args, **kwargs): def make_vanilla_tensor_bounding_boxes(*args, **kwargs): - for bounding_boxes in make_bounding_boxes(*args, **kwargs): + for bounding_boxes in make_multiple_bounding_boxes(*args, **kwargs): yield bounding_boxes.data @@ -180,13 +178,13 @@ def test_common(self, transform, adapter, container_type, image_or_video, device image_datapoint=make_image(size=canvas_size), video_datapoint=make_video(size=canvas_size), image_pil=next(make_pil_images(sizes=[canvas_size], color_spaces=["RGB"])), - bounding_boxes_xyxy=make_bounding_box( + bounding_boxes_xyxy=make_bounding_boxes( format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) ), - bounding_boxes_xywh=make_bounding_box( + bounding_boxes_xywh=make_bounding_boxes( format=datapoints.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) ), - bounding_boxes_cxcywh=make_bounding_box( + bounding_boxes_cxcywh=make_bounding_boxes( format=datapoints.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) ), bounding_boxes_degenerate_xyxy=datapoints.BoundingBoxes( @@ -813,7 +811,7 @@ def test__transform(self, mocker): size = (32, 24) image = make_image(size) - bboxes = make_bounding_box(format="XYXY", canvas_size=size, batch_dims=(6,)) + bboxes = make_bounding_boxes(format="XYXY", canvas_size=size, batch_dims=(6,)) masks = make_detection_mask(size, num_objects=6) sample = [image, bboxes, masks] diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 3196a5fd82c..61de769d885 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -12,17 +12,7 @@ import torch import torchvision.transforms.v2 as v2_transforms -from common_utils import ( - ArgsKwargs, - assert_close, - assert_equal, - make_bounding_box, - make_detection_mask, - make_image, - make_images, - make_segmentation_mask, - set_rng_seed, -) +from common_utils import assert_close, assert_equal, set_rng_seed from torch import nn from torchvision import datapoints, transforms as legacy_transforms from torchvision._utils import sequence_to_str @@ -32,6 +22,14 @@ from torchvision.transforms.v2._utils import _get_fill from torchvision.transforms.v2.functional import to_pil_image from torchvision.transforms.v2.utils import query_size +from transforms_v2_legacy_utils import ( + ArgsKwargs, + make_bounding_boxes, + 
make_detection_mask, + make_image, + make_images, + make_segmentation_mask, +) DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=["RGB"], extra_dims=[(4,)]) @@ -1090,7 +1088,7 @@ def make_label(extra_dims, categories): pil_image = to_pil_image(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1100,7 +1098,7 @@ def make_label(extra_dims, categories): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1110,7 +1108,7 @@ def make_label(extra_dims, categories): datapoint_image = make_image(size=size, color_space="RGB", dtype=torch.float32) target = { - "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py index 29ef54d925a..15af5a7a9ed 100644 --- a/test/test_transforms_v2_functional.py +++ b/test/test_transforms_v2_functional.py @@ -8,16 +8,7 @@ import pytest import torch -from common_utils import ( - assert_close, - cache, - cpu_and_cuda, - DEFAULT_SQUARE_SPATIAL_SIZE, - make_bounding_boxes, - needs_cuda, - parametrized_error_message, - set_rng_seed, -) +from common_utils import assert_close, cache, cpu_and_cuda, needs_cuda, set_rng_seed from torch.utils._pytree import tree_map from torchvision import datapoints from torchvision.transforms.functional import _get_perspective_coeffs @@ -27,6 +18,11 @@ from torchvision.transforms.v2.utils import is_pure_tensor from transforms_v2_dispatcher_infos import DISPATCHER_INFOS from transforms_v2_kernel_infos import KERNEL_INFOS +from transforms_v2_legacy_utils import ( + DEFAULT_SQUARE_SPATIAL_SIZE, + make_multiple_bounding_boxes, + parametrized_error_message, +) KERNEL_INFOS_MAP = {info.kernel: info for info in KERNEL_INFOS} @@ -506,7 +502,7 @@ class TestClampBoundingBoxes: ], ) def test_pure_tensor_insufficient_metadata(self, metadata): - pure_tensor = next(make_bounding_boxes()).as_subclass(torch.Tensor) + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` has to be passed")): F.clamp_bounding_boxes(pure_tensor, **metadata) @@ -520,7 +516,7 @@ def test_pure_tensor_insufficient_metadata(self, metadata): ], ) def test_datapoint_explicit_metadata(self, metadata): - datapoint = next(make_bounding_boxes()) + datapoint = next(make_multiple_bounding_boxes()) with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): F.clamp_bounding_boxes(datapoint, **metadata) @@ -530,8 +526,8 @@ class TestConvertFormatBoundingBoxes: @pytest.mark.parametrize( ("inpt", "old_format"), [ - (next(make_bounding_boxes()), None), - 
(next(make_bounding_boxes()).as_subclass(torch.Tensor), datapoints.BoundingBoxFormat.XYXY), + (next(make_multiple_bounding_boxes()), None), + (next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor), datapoints.BoundingBoxFormat.XYXY), ], ) def test_missing_new_format(self, inpt, old_format): @@ -539,13 +535,13 @@ def test_missing_new_format(self, inpt, old_format): F.convert_format_bounding_boxes(inpt, old_format) def test_pure_tensor_insufficient_metadata(self): - pure_tensor = next(make_bounding_boxes()).as_subclass(torch.Tensor) + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) with pytest.raises(ValueError, match=re.escape("`old_format` has to be passed")): F.convert_format_bounding_boxes(pure_tensor, new_format=datapoints.BoundingBoxFormat.CXCYWH) def test_datapoint_explicit_metadata(self): - datapoint = next(make_bounding_boxes()) + datapoint = next(make_multiple_bounding_boxes()) with pytest.raises(ValueError, match=re.escape("`old_format` must not be passed")): F.convert_format_bounding_boxes( @@ -736,7 +732,7 @@ def _compute_expected_canvas_size(bbox, padding_): height, width = bbox.canvas_size return height + pad_up + pad_down, width + pad_left + pad_right - for bboxes in make_bounding_boxes(extra_dims=((4,),)): + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format bboxes_canvas_size = bboxes.canvas_size @@ -822,7 +818,7 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_): pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)): + for bboxes in make_multiple_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)): bboxes = bboxes.to(device) output_bboxes = F.perspective_bounding_boxes( @@ -870,7 +866,7 @@ def _compute_expected_bbox(bbox, format_, canvas_size_, output_size_): out_bbox = clamp_bounding_boxes(out_bbox, format=format_, canvas_size=output_size) return out_bbox.to(dtype=dtype, device=bbox.device) - for bboxes in make_bounding_boxes(extra_dims=((4,),)): + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format bboxes_canvas_size = bboxes.canvas_size diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index c51b7c7555f..f57736e5abd 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -19,7 +19,7 @@ cpu_and_cuda, freeze_rng_state, ignore_jit_no_profile_information_warning, - make_bounding_box, + make_bounding_boxes, make_detection_mask, make_image, make_image_pil, @@ -456,7 +456,7 @@ def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_boxes = make_bounding_box( + bounding_boxes = make_bounding_boxes( format=format, canvas_size=self.INPUT_SIZE, dtype=dtype, @@ -481,7 +481,7 @@ def test_kernel_video(self): @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, size, make_input): check_functional( @@ -514,7 +514,7 @@ def test_functional_signature(self, 
kernel, input_type): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -579,7 +579,7 @@ def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_boxes = make_bounding_box(format=format, canvas_size=self.INPUT_SIZE) + bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) actual = fn(bounding_boxes, size=size, **max_size_kwarg) expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) @@ -618,7 +618,7 @@ def test_functional_pil_antialias_warning(self): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -687,7 +687,7 @@ def test_transform_unknown_size_error(self): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -714,7 +714,7 @@ def test_noop(self, size, make_input): make_image_tensor, make_image_pil, make_image, - make_bounding_box, + make_bounding_boxes, make_segmentation_mask, make_detection_mask, make_video, @@ -743,7 +743,7 @@ def test_kernel_image_tensor(self, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.horizontal_flip_bounding_boxes, bounding_boxes, @@ -760,7 +760,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.horizontal_flip, make_input()) @@ -781,7 +781,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -821,7 +821,7 @@ def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] ) def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = fn(bounding_boxes) expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) @@ -830,7 +830,7 @@ def test_bounding_boxes_correctness(self, format, fn): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): @@ -917,7 +917,7 @@ def test_kernel_image_tensor(self, param, 
value, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, param, value, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) self._check_kernel( F.affine_bounding_boxes, bounding_boxes, @@ -936,7 +936,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS) @@ -957,7 +957,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -1076,7 +1076,7 @@ def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = F.affine( bounding_boxes, @@ -1101,7 +1101,7 @@ def test_functional_bounding_boxes_correctness(self, format, angle, translate, s @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_boxes_correctness(self, format, center, seed): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) @@ -1208,7 +1208,7 @@ def test_kernel_image_tensor(self, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.vertical_flip_bounding_boxes, bounding_boxes, @@ -1225,7 +1225,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.vertical_flip, make_input()) @@ -1246,7 +1246,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, 
make_input, device): @@ -1282,7 +1282,7 @@ def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = fn(bounding_boxes) expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) @@ -1291,7 +1291,7 @@ def test_bounding_boxes_correctness(self, format, fn): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform_noop(self, make_input, device): @@ -1356,7 +1356,7 @@ def test_kernel_bounding_boxes(self, param, value, format, dtype, device): if param != "angle": kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.rotate_bounding_boxes, @@ -1375,7 +1375,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) @@ -1396,7 +1396,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_transform(self, make_input, device): @@ -1490,7 +1490,7 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen @pytest.mark.parametrize("expand", [False]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) @@ -1503,7 +1503,7 @@ def test_functional_bounding_boxes_correctness(self, format, angle, expand, cent @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): - bounding_boxes = make_bounding_box(format=format) + bounding_boxes = make_bounding_boxes(format=format) transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) @@ -1652,7 +1652,7 @@ def test_functional(self, make_input, input_dtype, output_dtype, device, scale): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image, make_bounding_box, 
make_segmentation_mask, make_video], + [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) @@ -1727,7 +1727,7 @@ def make_inpt_with_bbox_and_mask(self, make_input): mask_dtype = torch.bool sample = { "inpt": make_input(size=(H, W), dtype=inpt_dtype), - "bbox": make_bounding_box(canvas_size=(H, W), dtype=bbox_dtype), + "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), "mask": make_detection_mask(size=(H, W), dtype=mask_dtype), } @@ -2013,7 +2013,7 @@ def test_get_num_channels(self, kernel, make_input): (F.get_size_image, make_image_tensor), (F._get_size_image_pil, make_image_pil), (F.get_size_image, make_image), - (F.get_size_bounding_boxes, make_bounding_box), + (F.get_size_bounding_boxes, make_bounding_boxes), (F.get_size_mask, make_detection_mask), (F.get_size_mask, make_segmentation_mask), (F.get_size_video, make_video), @@ -2043,15 +2043,15 @@ def test_get_num_frames(self, kernel, make_input): @pytest.mark.parametrize( ("functional", "make_input"), [ - (F.get_dimensions, make_bounding_box), + (F.get_dimensions, make_bounding_boxes), (F.get_dimensions, make_detection_mask), (F.get_dimensions, make_segmentation_mask), - (F.get_num_channels, make_bounding_box), + (F.get_num_channels, make_bounding_boxes), (F.get_num_channels, make_detection_mask), (F.get_num_channels, make_segmentation_mask), (F.get_num_frames, make_image_pil), (F.get_num_frames, make_image), - (F.get_num_frames, make_bounding_box), + (F.get_num_frames, make_bounding_boxes), (F.get_num_frames, make_detection_mask), (F.get_num_frames, make_segmentation_mask), ], @@ -2290,7 +2290,7 @@ def test_kernel_image_tensor(self, param, value, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_box(format=format, dtype=dtype, device=device) + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) check_kernel( F.elastic_bounding_boxes, @@ -2311,7 +2311,7 @@ def test_kernel_video(self): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_functional(self, make_input): input = make_input() @@ -2333,7 +2333,7 @@ def test_functional_signature(self, kernel, input_type): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) def test_displacement_error(self, make_input): input = make_input() @@ -2346,7 +2346,7 @@ def test_displacement_error(self, make_input): @pytest.mark.parametrize( "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], ) # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) @@ -2363,7 +2363,7 @@ def 
test_correctness(self): "img_pil": make_image_pil(), "mask": make_detection_mask(), "video": make_video(), - "bbox": make_bounding_box(), + "bbox": make_bounding_boxes(), "str": "str", } diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 0cfe0db7077..55825d652e6 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,7 +4,7 @@ import torch import torchvision.transforms.v2.utils -from common_utils import DEFAULT_SIZE, make_bounding_box, make_detection_mask, make_image +from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_mask, make_image from torchvision import datapoints from torchvision.transforms.v2.functional import to_pil_image @@ -12,7 +12,7 @@ IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") -BOUNDING_BOX = make_bounding_box(DEFAULT_SIZE, format=datapoints.BoundingBoxFormat.XYXY) +BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=datapoints.BoundingBoxFormat.XYXY) MASK = make_detection_mask(DEFAULT_SIZE) diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py index 903518627de..375c307324c 100644 --- a/test/transforms_v2_dispatcher_infos.py +++ b/test/transforms_v2_dispatcher_infos.py @@ -2,9 +2,9 @@ import pytest import torchvision.transforms.v2.functional as F -from common_utils import InfoBase, TestMark from torchvision import datapoints from transforms_v2_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition +from transforms_v2_legacy_utils import InfoBase, TestMark __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index acb9a857750..33813b6519d 100644 --- a/test/transforms_v2_kernel_infos.py +++ b/test/transforms_v2_kernel_infos.py @@ -7,7 +7,9 @@ import torch.testing import torchvision.ops import torchvision.transforms.v2.functional as F -from common_utils import ( +from torchvision import datapoints +from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding +from transforms_v2_legacy_utils import ( ArgsKwargs, combinations_grid, DEFAULT_PORTRAIT_SPATIAL_SIZE, @@ -26,8 +28,6 @@ mark_framework_limitation, TestMark, ) -from torchvision import datapoints -from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding __all__ = ["KernelInfo", "KERNEL_INFOS"] diff --git a/test/transforms_v2_legacy_utils.py b/test/transforms_v2_legacy_utils.py new file mode 100644 index 00000000000..bb8943a8889 --- /dev/null +++ b/test/transforms_v2_legacy_utils.py @@ -0,0 +1,633 @@ +""" +As the name implies, these are legacy utilities that are hopefully removed soon. The future of +transforms v2 testing is in test/test_transforms_v2_refactored.py. All new test should be +implemented there and must not use any of the utilities here. + +The following legacy modules depend on this module + +- transforms_v2_kernel_infos.py +- transforms_v2_dispatcher_infos.py +- test_transforms_v2_functional.py +- test_transforms_v2_consistency.py +- test_transforms.py + +When all the logic is ported from the files above to test_transforms_v2_refactored.py, delete +all the legacy modules including this one and drop the _refactored prefix from the name. 
+""" + +import collections.abc +import dataclasses +import enum +import itertools +import pathlib +from collections import defaultdict +from typing import Callable, Sequence, Tuple, Union + +import PIL.Image +import pytest +import torch + +from torchvision import datapoints +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values. + + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +DEFAULT_SIZE = (17, 11) + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return datapoints.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=datapoints.BoundingBoxFormat.XYXY, + batch_dims=(), + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. + return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) + + if isinstance(format, str): + format = datapoints.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + if any(dim == 0 for dim in batch_dims): + return datapoints.BoundingBoxes( + torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + h, w = [torch.randint(1, c, batch_dims) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + + if format is datapoints.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is datapoints.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is datapoints.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + else: + raise ValueError(f"Format {format} is not supported") + + return datapoints.BoundingBoxes( + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + +def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): + """Make a "detection" mask, i.e. 
(*, N, H, W), where each object is encoded as one of N boolean masks""" + return datapoints.Mask( + torch.testing.make_tensor( + (*batch_dims, num_objects, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" + return datapoints.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return datapoints.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +DEFAULT_SQUARE_SPATIAL_SIZE = 15 +DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) +DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) +DEFAULT_SPATIAL_SIZES = ( + DEFAULT_LANDSCAPE_SPATIAL_SIZE, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + DEFAULT_SQUARE_SPATIAL_SIZE, +) + + +def _parse_size(size, *, name="size"): + if size == "random": + raise ValueError("This should never happen") + elif isinstance(size, int) and size > 0: + return (size, size) + elif ( + isinstance(size, collections.abc.Sequence) + and len(size) == 2 + and all(isinstance(length, int) and length > 0 for length in size) + ): + return tuple(size) + else: + raise pytest.UsageError( + f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," + f"but got {size} instead." + ) + + +def get_num_channels(color_space): + num_channels = NUM_CHANNELS_MAP.get(color_space) + if not num_channels: + raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") + return num_channels + + +VALID_EXTRA_DIMS = ((), (4,), (2, 3)) +DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) + +DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) + + +def from_loader(loader_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loader = loader_fn(*args, **kwargs) + return loader.load(device) + + return wrapper + + +def from_loaders(loaders_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loaders = loaders_fn(*args, **kwargs) + for loader in loaders: + yield loader.load(device) + + return wrapper + + +@dataclasses.dataclass +class TensorLoader: + fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] + shape: Sequence[int] + dtype: torch.dtype + + def load(self, device): + return self.fn(self.shape, self.dtype, device) + + +@dataclasses.dataclass +class ImageLoader(TensorLoader): + spatial_size: Tuple[int, int] = dataclasses.field(init=False) + num_channels: int = dataclasses.field(init=False) + memory_format: torch.memory_format = torch.contiguous_format + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.spatial_size = self.canvas_size = self.shape[-2:] + self.num_channels = self.shape[-3] + + def load(self, device): + return self.fn(self.shape, self.dtype, device, memory_format=self.memory_format) + + +def make_image_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + extra_dims=(), + dtype=torch.float32, + constant_alpha=True, + memory_format=torch.contiguous_format, +): + if not constant_alpha: + raise ValueError("This should never happen") + size = 
_parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + *batch_dims, _, height, width = shape + return make_image( + (height, width), + color_space=color_space, + batch_dims=batch_dims, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, memory_format=memory_format) + + +def make_image_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "GRAY_ALPHA", + "RGB", + "RGBA", + ), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.float32, torch.float64, torch.uint8), + constant_alpha=True, +): + for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): + yield make_image_loader(**params, constant_alpha=constant_alpha) + + +make_images = from_loaders(make_image_loaders) + + +def make_image_loader_for_interpolation( + size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format +): + size = _parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + height, width = shape[-2:] + + image_pil = ( + PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + .resize((width, height)) + .convert( + { + "GRAY": "L", + "GRAY_ALPHA": "LA", + "RGB": "RGB", + "RGBA": "RGBA", + }[color_space] + ) + ) + + image_tensor = to_image(image_pil) + if memory_format == torch.contiguous_format: + image_tensor = image_tensor.to(device=device, memory_format=memory_format, copy=True) + else: + image_tensor = image_tensor.to(device=device) + image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) + + return datapoints.Image(image_tensor) + + return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) + + +def make_image_loaders_for_interpolation( + sizes=((233, 147),), + color_spaces=("RGB",), + dtypes=(torch.uint8,), + memory_formats=(torch.contiguous_format, torch.channels_last), +): + for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes, memory_format=memory_formats): + yield make_image_loader_for_interpolation(**params) + + +@dataclasses.dataclass +class BoundingBoxesLoader(TensorLoader): + format: datapoints.BoundingBoxFormat + spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.canvas_size = self.spatial_size + + +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): + if isinstance(format, str): + format = datapoints.BoundingBoxFormat[format] + + spatial_size = _parse_size(spatial_size, name="spatial_size") + + def fn(shape, dtype, device): + *batch_dims, num_coordinates = shape + if num_coordinates != 4: + raise pytest.UsageError() + + return make_bounding_boxes( + format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return BoundingBoxesLoader(fn, shape=(*extra_dims[-1:], 4), dtype=dtype, format=format, spatial_size=spatial_size) + + +def make_bounding_box_loaders( + *, + extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), + formats=tuple(datapoints.BoundingBoxFormat), + spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + dtypes=(torch.float32, torch.float64, torch.int64), +): + for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): + yield 
make_bounding_box_loader(**params, spatial_size=spatial_size) + + +make_multiple_bounding_boxes = from_loaders(make_bounding_box_loaders) + + +class MaskLoader(TensorLoader): + pass + + +def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): + # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, num_objects, height, width = shape + return make_detection_mask( + (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) + + +def make_detection_mask_loaders( + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): + yield make_detection_mask_loader(**params) + + +make_detection_masks = from_loaders(make_detection_mask_loaders) + + +def make_segmentation_mask_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 +): + # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, height, width = shape + return make_segmentation_mask( + (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) + + +def make_segmentation_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): + yield make_segmentation_mask_loader(**params) + + +make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) + + +def make_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) + yield from make_segmentation_mask_loaders( + sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes + ) + + +make_masks = from_loaders(make_mask_loaders) + + +class VideoLoader(ImageLoader): + pass + + +def make_video_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + num_frames=3, + extra_dims=(), + dtype=torch.uint8, +): + size = _parse_size(size) + + def fn(shape, dtype, device, memory_format): + *batch_dims, num_frames, _, height, width = shape + return make_video( + (height, width), + num_frames=num_frames, + batch_dims=batch_dims, + color_space=color_space, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) + + +def make_video_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "RGB", + ), + num_frames=(1, 0, 3), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8, torch.float32, torch.float64), +): + for params in combinations_grid( + size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes + ): + yield make_video_loader(**params) + + 
+make_videos = from_loaders(make_video_loaders)
+
+
+class TestMark:
+    def __init__(
+        self,
+        # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is
+        # no test class, i.e. a standalone test function, use `None`.
+        test_id,
+        # `pytest.mark.*` to apply, e.g. `pytest.mark.skip` or `pytest.mark.xfail`
+        mark,
+        *,
+        # Callable that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be
+        # applied. If omitted, the mark is always applied.
+        condition=None,
+    ):
+        self.test_id = test_id
+        self.mark = mark
+        self.condition = condition or (lambda args_kwargs: True)
+
+
+def mark_framework_limitation(test_id, reason, condition=None):
+    # The purpose of this function is to have a single entry point for skip marks that are only there because the test
+    # framework cannot handle the kernel in general or a specific parameter combination.
+    # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is
+    # still justified.
+    # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus,
+    # we would be wasting CI resources for no reason most of the time.
+    return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition)
+
+
+class InfoBase:
+    def __init__(
+        self,
+        *,
+        # Identifier of the info that shows up in the parametrization.
+        id,
+        # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization.
+        # See the `TestMark` class for details.
+        test_marks=None,
+        # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see
+        # `TestMark`), the dtype, and the device.
+        closeness_kwargs=None,
+    ):
+        self.id = id
+
+        self.test_marks = test_marks or []
+        test_marks_map = defaultdict(list)
+        for test_mark in self.test_marks:
+            test_marks_map[test_mark.test_id].append(test_mark)
+        self._test_marks_map = dict(test_marks_map)
+
+        self.closeness_kwargs = closeness_kwargs or dict()
+
+    def get_marks(self, test_id, args_kwargs):
+        return [
+            test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs)
+        ]
+
+    def get_closeness_kwargs(self, test_id, *, dtype, device):
+        if not (isinstance(test_id, tuple) and len(test_id) == 2):
+            msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name"
+            if callable(test_id):
+                msg += ". Did you forget to add the `test_id` fixture to the parameters of the test?"
+            else:
+                msg += f", but got {test_id} instead."
+            raise pytest.UsageError(msg)
+        if isinstance(device, torch.device):
+            device = device.type
+        return self.closeness_kwargs.get((test_id, dtype, device), dict())
+
+
+class ArgsKwargs:
+    def __init__(self, *args, **kwargs):
+        self.args = args
+        self.kwargs = kwargs
+
+    def __iter__(self):
+        yield self.args
+        yield self.kwargs
+
+    def load(self, device="cpu"):
+        return ArgsKwargs(
+            *(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args),
+            **{
+                keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg
+                for keyword, arg in self.kwargs.items()
+            },
+        )
+
+
+def parametrized_error_message(*args, **kwargs):
+    def to_str(obj):
+        if isinstance(obj, torch.Tensor) and obj.numel() > 30:
+            return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})"
+        elif isinstance(obj, enum.Enum):
+            return f"{type(obj).__name__}.{obj.name}"
+        else:
+            return repr(obj)
+
+    if args or kwargs:
+        postfix = "\n".join(
+            [
+                "",
+                "Failure happened for the following parameters:",
+                "",
+                *[to_str(arg) for arg in args],
+                *[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()],
+            ]
+        )
+    else:
+        postfix = ""
+
+    def wrapper(msg):
+        return msg + postfix
+
+    return wrapper