diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml index 3e584ef6ce6..1ab20669f0b 100644 --- a/.github/workflows/build-cmake.yml +++ b/.github/workflows/build-cmake.yml @@ -1,85 +1,85 @@ -# name: CMake +name: CMake -# on: -# pull_request: -# push: -# branches: -# - nightly -# - main -# - release/* -# workflow_dispatch: +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: -# jobs: -# linux: -# strategy: -# matrix: -# include: -# - runner: linux.12xlarge -# gpu-arch-type: cpu -# - runner: linux.g5.4xlarge.nvidia.gpu -# gpu-arch-type: cuda -# gpu-arch-version: "11.8" -# fail-fast: false -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main -# with: -# repository: pytorch/vision -# runner: ${{ matrix.runner }} -# gpu-arch-type: ${{ matrix.gpu-arch-type }} -# gpu-arch-version: ${{ matrix.gpu-arch-version }} -# test-infra-ref: main -# script: | -# set -euo pipefail +jobs: + linux: + strategy: + matrix: + include: + - runner: linux.12xlarge + gpu-arch-type: cpu + - runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + test-infra-ref: main + script: | + set -euo pipefail -# export PYTHON_VERSION=3.8 -# export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} -# export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} -# ./.github/scripts/cmake.sh + ./.github/scripts/cmake.sh -# macos: -# strategy: -# matrix: -# include: -# - runner: macos-m1-stable -# fail-fast: false -# uses: pytorch/test-infra/.github/workflows/macos_job.yml@main -# with: -# repository: pytorch/vision -# runner: ${{ matrix.runner }} -# test-infra-ref: main -# script: | -# set -euo pipefail + macos: + strategy: + matrix: + include: + - runner: macos-m1-stable + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + test-infra-ref: main + script: | + set -euo pipefail -# export PYTHON_VERSION=3.8 -# export GPU_ARCH_TYPE=cpu -# export GPU_ARCH_VERSION='' + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' -# ${CONDA_RUN} ./.github/scripts/cmake.sh + ${CONDA_RUN} ./.github/scripts/cmake.sh -# windows: -# strategy: -# matrix: -# include: -# - runner: windows.4xlarge -# gpu-arch-type: cpu -# - runner: windows.g5.4xlarge.nvidia.gpu -# gpu-arch-type: cuda -# gpu-arch-version: "11.8" -# fail-fast: false -# uses: pytorch/test-infra/.github/workflows/windows_job.yml@main -# with: -# repository: pytorch/vision -# runner: ${{ matrix.runner }} -# gpu-arch-type: ${{ matrix.gpu-arch-type }} -# gpu-arch-version: ${{ matrix.gpu-arch-version }} -# test-infra-ref: main -# script: | -# set -euo pipefail + windows: + strategy: + matrix: + include: + - runner: windows.4xlarge + gpu-arch-type: cpu + - runner: windows.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + 
test-infra-ref: main + script: | + set -euo pipefail -# export PYTHON_VERSION=3.8 -# export VC_YEAR=2022 -# export VSDEVCMD_ARGS="" -# export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} -# export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + export PYTHON_VERSION=3.8 + export VC_YEAR=2022 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} -# ./.github/scripts/cmake.sh + ./.github/scripts/cmake.sh diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml index 7e85f4cc980..818f32c102b 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build-wheels-linux.yml @@ -1,51 +1,52 @@ -# name: Build Linux Wheels +name: Build Linux Wheels -# on: -# pull_request: -# push: -# branches: -# - nightly -# - main -# - release/* -# tags: -# # NOTE: Binary build pipelines should only get triggered on release candidate builds -# # Release candidate tags look like: v1.11.0-rc1 -# - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ -# workflow_dispatch: +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: -# permissions: -# id-token: write -# contents: read +permissions: + id-token: write + contents: read -# jobs: -# generate-matrix: -# uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main -# with: -# package-type: wheel -# os: linux -# test-infra-repository: pytorch/test-infra -# test-infra-ref: main -# build: -# needs: generate-matrix -# strategy: -# fail-fast: false -# matrix: -# include: -# - repository: pytorch/vision -# pre-script: packaging/pre_build_script.sh -# post-script: packaging/post_build_script.sh -# smoke-test-script: test/smoke_test.py -# package-name: torchvision -# name: ${{ matrix.repository }} -# uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main -# with: -# repository: ${{ matrix.repository }} -# ref: "" -# test-infra-repository: pytorch/test-infra -# test-infra-ref: main -# build-matrix: ${{ needs.generate-matrix.outputs.matrix }} -# pre-script: ${{ matrix.pre-script }} -# post-script: ${{ matrix.post-script }} -# package-name: ${{ matrix.package-name }} -# smoke-test-script: ${{ matrix.smoke-test-script }} -# trigger-event: ${{ github.event_name }} +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-xpu: enable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/docs.yml 
b/.github/workflows/docs.yml
index 7582225e80a..5402294322d 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,129 +1,129 @@
-# name: Docs
-
-# on:
-#   pull_request:
-#   push:
-#     branches:
-#       - nightly
-#       - main
-#       - release/*
-#     tags:
-#       - v[0-9]+.[0-9]+.[0-9]
-#       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
-#   workflow_dispatch:
-
-# jobs:
-#   build:
-#     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-#     with:
-#       repository: pytorch/vision
-#       upload-artifact: docs
-#       test-infra-ref: main
-#       script: |
-#         set -euo pipefail
-
-#         export PYTHON_VERSION=3.8
-#         export GPU_ARCH_TYPE=cpu
-#         export GPU_ARCH_VERSION=''
-#         ./.github/scripts/setup-env.sh
-
-#         # Prepare conda
-#         CONDA_PATH=$(which conda)
-#         eval "$(${CONDA_PATH} shell.bash hook)"
-#         conda activate ci
-#         # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it
-#         # already links against the one pulled from conda. However, at runtime it pulls from
-#         # /lib64
-#         # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't
-#         # have to pay attention in all other workflows?
-#         export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
-
-#         cd docs
-
-#         echo '::group::Install doc requirements'
-#         pip install --progress-bar=off -r requirements.txt
-#         echo '::endgroup::'
-
-#         if [[ ${{ github.event_name }} == push && (${{ github.ref_type }} == tag || (${{ github.ref_type }} == branch && ${{ github.ref_name }} == release/*)) ]]; then
-#           echo '::group::Enable version string sanitization'
-#           # This environment variable just has to exist and must not be empty. The actual value is arbitrary.
-#           # See docs/source/conf.py for details
-#           export TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS=1
-#           echo '::endgroup::'
-#         fi
-
-#         # The runner does not have sufficient memory to run with as many processes as there are
-#         # cores (`-j auto`). Thus, we limit to a single process (`-j 1`) here.
-#         sed -i -e 's/-j auto/-j 1/' Makefile
-#         make html
-
-#         # Below is an imperfect way for us to add "try on Colab" links to all of our gallery examples.
-#         # sphinx-gallery will convert all gallery examples to .ipynb notebooks and stores them in
-#         # build/html/_downloads/<some_hash>/<example_name>.ipynb
-#         # We copy all those ipynb files in a more convenient folder so that we can more easily link to them.
-#         mkdir build/html/_generated_ipynb_notebooks
-#         for file in `find build/html/_downloads`; do
-#           if [[ $file == *.ipynb ]]; then
-#             cp $file build/html/_generated_ipynb_notebooks/
-#           fi
-#         done
-
-#         cp -r build/html "${RUNNER_ARTIFACT_DIR}"
-
-#         # On PRs we also want to upload the docs into our S3 bucket for preview.
-# if [[ ${{ github.event_name == 'pull_request' }} ]]; then -# cp -r build/html/* "${RUNNER_DOCS_DIR}" -# fi - -# upload: -# needs: build -# if: github.repository == 'pytorch/vision' && github.event_name == 'push' && -# ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') -# permissions: -# contents: write -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main -# with: -# repository: pytorch/vision -# download-artifact: docs -# ref: gh-pages -# test-infra-ref: main -# script: | -# set -euo pipefail - -# REF_TYPE=${{ github.ref_type }} -# REF_NAME=${{ github.ref_name }} - -# if [[ "${REF_TYPE}" == branch ]]; then -# TARGET_FOLDER="${REF_NAME}" -# elif [[ "${REF_TYPE}" == tag ]]; then -# case "${REF_NAME}" in -# *-rc*) -# echo "Aborting upload since this is an RC tag: ${REF_NAME}" -# exit 0 -# ;; -# *) -# # Strip the leading "v" as well as the trailing patch version. For example: -# # 'v0.15.2' -> '0.15' -# TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/v\([0-9]\+\)\.\([0-9]\+\)\.[0-9]\+/\1.\2/') -# ;; -# esac -# fi -# echo "Target Folder: ${TARGET_FOLDER}" - -# mkdir -p "${TARGET_FOLDER}" -# rm -rf "${TARGET_FOLDER}"/* -# mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" -# git add "${TARGET_FOLDER}" || true - -# if [[ "${TARGET_FOLDER}" == main ]]; then -# mkdir -p _static -# rm -rf _static/* -# cp -r "${TARGET_FOLDER}"/_static/* _static -# git add _static || true -# fi - -# git config user.name 'pytorchbot' -# git config user.email 'soumith+bot@pytorch.org' -# git config http.postBuffer 524288000 -# git commit -m "auto-generating sphinx docs" || true -# git push +name: Docs + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + - v[0-9]+.[0-9]+.[0-9] + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + build: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/vision + upload-artifact: docs + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it + # already links against the one pulled from conda. However, at runtime it pulls from + # /lib64 + # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't + # have to pay attention in all other workflows? + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + + cd docs + + echo '::group::Install doc requirements' + pip install --progress-bar=off -r requirements.txt + echo '::endgroup::' + + if [[ ${{ github.event_name }} == push && (${{ github.ref_type }} == tag || (${{ github.ref_type }} == branch && ${{ github.ref_name }} == release/*)) ]]; then + echo '::group::Enable version string sanitization' + # This environment variable just has to exist and must not be empty. The actual value is arbitrary. + # See docs/source/conf.py for details + export TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS=1 + echo '::endgroup::' + fi + + # The runner does not have sufficient memory to run with as many processes as there are + # cores (`-j auto`). Thus, we limit to a single process (`-j 1`) here. + sed -i -e 's/-j auto/-j 1/' Makefile + make html + + # Below is an imperfect way for us to add "try on Colab" links to all of our gallery examples. 
+          # sphinx-gallery will convert all gallery examples to .ipynb notebooks and stores them in
+          # build/html/_downloads/<some_hash>/<example_name>.ipynb
+          # We copy all those ipynb files in a more convenient folder so that we can more easily link to them.
+          mkdir build/html/_generated_ipynb_notebooks
+          for file in `find build/html/_downloads`; do
+            if [[ $file == *.ipynb ]]; then
+              cp $file build/html/_generated_ipynb_notebooks/
+            fi
+          done
+
+          cp -r build/html "${RUNNER_ARTIFACT_DIR}"
+
+          # On PRs we also want to upload the docs into our S3 bucket for preview.
+          if [[ ${{ github.event_name == 'pull_request' }} ]]; then
+            cp -r build/html/* "${RUNNER_DOCS_DIR}"
+          fi
+
+  upload:
+    needs: build
+    if: github.repository == 'pytorch/vision' && github.event_name == 'push' &&
+        ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag')
+    permissions:
+      contents: write
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      repository: pytorch/vision
+      download-artifact: docs
+      ref: gh-pages
+      test-infra-ref: main
+      script: |
+        set -euo pipefail
+
+        REF_TYPE=${{ github.ref_type }}
+        REF_NAME=${{ github.ref_name }}
+
+        if [[ "${REF_TYPE}" == branch ]]; then
+          TARGET_FOLDER="${REF_NAME}"
+        elif [[ "${REF_TYPE}" == tag ]]; then
+          case "${REF_NAME}" in
+            *-rc*)
+              echo "Aborting upload since this is an RC tag: ${REF_NAME}"
+              exit 0
+              ;;
+            *)
+              # Strip the leading "v" as well as the trailing patch version. For example:
+              # 'v0.15.2' -> '0.15'
+              TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/v\([0-9]\+\)\.\([0-9]\+\)\.[0-9]\+/\1.\2/')
+              ;;
+          esac
+        fi
+        echo "Target Folder: ${TARGET_FOLDER}"
+
+        mkdir -p "${TARGET_FOLDER}"
+        rm -rf "${TARGET_FOLDER}"/*
+        mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}"
+        git add "${TARGET_FOLDER}" || true
+
+        if [[ "${TARGET_FOLDER}" == main ]]; then
+          mkdir -p _static
+          rm -rf _static/*
+          cp -r "${TARGET_FOLDER}"/_static/* _static
+          git add _static || true
+        fi
+
+        git config user.name 'pytorchbot'
+        git config user.email 'soumith+bot@pytorch.org'
+        git config http.postBuffer 524288000
+        git commit -m "auto-generating sphinx docs" || true
+        git push
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index b9056a4ece6..5c2b3344247 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,110 +1,110 @@
-# name: Lint
-
-# on:
-#   pull_request:
-#   push:
-#     branches:
-#       - nightly
-#       - main
-#       - release/*
-#   workflow_dispatch:
-
-# jobs:
-#   python-source-and-configs:
-#     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-#     with:
-#       repository: pytorch/vision
-#       test-infra-ref: main
-#       script: |
-#         set -euo pipefail
-
-#         echo '::group::Setup environment'
-#         CONDA_PATH=$(which conda)
-#         eval "$(${CONDA_PATH} shell.bash hook)"
-#         conda create --name ci --quiet --yes python=3.8 pip
-#         conda activate ci
-#         echo '::endgroup::'
-
-#         echo '::group::Install lint tools'
-#         pip install --progress-bar=off pre-commit
-#         echo '::endgroup::'
-
-#         set +e
-#         pre-commit run --all-files
-
-#         if [ $? -ne 0 ]; then
-#           git --no-pager diff
-#           exit 1
-#         fi
-
-#   c-source:
-#     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-#     with:
-#       repository: pytorch/vision
-#       test-infra-ref: main
-#       script: |
-#         set -euo pipefail
-
-#         echo '::group::Setup environment'
-#         CONDA_PATH=$(which conda)
-#         eval "$(${CONDA_PATH} shell.bash hook)"
-#         # clang-format needs some shared libraries that conflict with the system ones. Thus, we install them from conda
-#         # and prepend the libraries to linker path to prioritize them.
`ncurses=5` is only available on the conda-forge -# # channel. Since we are not building or testing here, this is fine. -# conda create --name ci --quiet --yes -c conda-forge python=3.8 ncurses=5 libgcc -# conda activate ci -# export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" -# echo '::endgroup::' - -# echo '::group::Install lint tools' -# curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o ./clang-format -# chmod +x ./clang-format -# echo '::endgroup::' - -# echo '::group::Lint C source' -# set +e -# ./.github/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable ./clang-format --exclude "torchvision/csrc/io/image/cpu/giflib/*" - -# if [ $? -ne 0 ]; then -# git --no-pager diff -# exit 1 -# fi -# echo '::endgroup::' - - -# python-types: -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main -# with: -# repository: pytorch/vision -# test-infra-ref: main -# script: | -# set -euo pipefail - -# export PYTHON_VERSION=3.8 -# export GPU_ARCH_TYPE=cpu -# export GPU_ARCH_VERSION='' - -# ./.github/scripts/setup-env.sh - -# CONDA_PATH=$(which conda) -# eval "$(${CONDA_PATH} shell.bash hook)" -# conda activate ci - -# echo '::group::Install lint tools' -# pip install --progress-bar=off mypy -# echo '::endgroup::' - -# echo '::group::Lint Python types' -# mypy --install-types --non-interactive --config-file mypy.ini -# echo '::endgroup::' - -# bc: -# if: github.event.pull_request -# runs-on: ubuntu-latest -# steps: -# - name: Run BC Lint Action -# uses: pytorch/test-infra/.github/actions/bc-lint@main -# with: -# repo: ${{ github.event.pull_request.head.repo.full_name }} -# base_sha: ${{ github.event.pull_request.base.sha }} -# head_sha: ${{ github.event.pull_request.head.sha }} +name: Lint + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + python-source-and-configs: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda create --name ci --quiet --yes python=3.8 pip + conda activate ci + echo '::endgroup::' + + echo '::group::Install lint tools' + pip install --progress-bar=off pre-commit + echo '::endgroup::' + + set +e + pre-commit run --all-files + + if [ $? -ne 0 ]; then + git --no-pager diff + exit 1 + fi + + c-source: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + # clang-format needs some shared libraries that conflict with the system ones. Thus, we install them from conda + # and prepend the libraries to linker path to prioritize them. `ncurses=5` is only available on the conda-forge + # channel. Since we are not building or testing here, this is fine. 
+ conda create --name ci --quiet --yes -c conda-forge python=3.8 ncurses=5 libgcc + conda activate ci + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + echo '::endgroup::' + + echo '::group::Install lint tools' + curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o ./clang-format + chmod +x ./clang-format + echo '::endgroup::' + + echo '::group::Lint C source' + set +e + ./.github/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable ./clang-format --exclude "torchvision/csrc/io/image/cpu/giflib/*" + + if [ $? -ne 0 ]; then + git --no-pager diff + exit 1 + fi + echo '::endgroup::' + + + python-types: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install lint tools' + pip install --progress-bar=off mypy + echo '::endgroup::' + + echo '::group::Lint Python types' + mypy --install-types --non-interactive --config-file mypy.ini + echo '::endgroup::' + + bc: + if: github.event.pull_request + runs-on: ubuntu-latest + steps: + - name: Run BC Lint Action + uses: pytorch/test-infra/.github/actions/bc-lint@main + with: + repo: ${{ github.event.pull_request.head.repo.full_name }} + base_sha: ${{ github.event.pull_request.base.sha }} + head_sha: ${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/prototype-tests-linux-gpu.yml b/.github/workflows/prototype-tests-linux-gpu.yml index ddb61676ab8..7fbe77ca146 100644 --- a/.github/workflows/prototype-tests-linux-gpu.yml +++ b/.github/workflows/prototype-tests-linux-gpu.yml @@ -11,7 +11,6 @@ jobs: strategy: matrix: python-version: - - "3.8" - "3.9" - "3.10" - "3.11" @@ -19,7 +18,7 @@ jobs: runner: ["linux.12xlarge"] gpu-arch-type: ["cpu"] include: - - python-version: "3.8" + - python-version: "3.9" runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda gpu-arch-version: "11.8" diff --git a/.github/workflows/tests-schedule.yml b/.github/workflows/tests-schedule.yml index 1f9fe52e6e8..3cba2ef59d8 100644 --- a/.github/workflows/tests-schedule.yml +++ b/.github/workflows/tests-schedule.yml @@ -18,7 +18,7 @@ jobs: - name: Set up python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.9 - name: Upgrade system packages run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2fdda3beab9..93d6afb00d5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,75 +10,75 @@ on: workflow_dispatch: jobs: -# unittests-linux: -# strategy: -# matrix: -# python-version: -# # - "3.8" -# # - "3.9" -# # - "3.10" -# - "3.11" -# # - "3.12" -# runner: ["linux.12xlarge"] -# gpu-arch-type: ["cpu"] -# include: -# - python-version: 3.8 -# runner: linux.g5.4xlarge.nvidia.gpu -# gpu-arch-type: cuda -# gpu-arch-version: "11.8" -# fail-fast: false -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main -# with: -# repository: pytorch/vision -# runner: ${{ matrix.runner }} -# gpu-arch-type: ${{ matrix.gpu-arch-type }} -# gpu-arch-version: ${{ matrix.gpu-arch-version }} -# timeout: 120 -# test-infra-ref: main -# script: | -# set -euo pipefail - -# export PYTHON_VERSION=${{ matrix.python-version }} -# export 
GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} -# export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} - -# ./.github/scripts/unittest.sh - -# unittests-macos: -# strategy: -# matrix: -# python-version: -# # - "3.8" -# # - "3.9" -# # - "3.10" -# - "3.11" -# # - "3.12" -# runner: ["macos-m1-stable"] -# fail-fast: false -# uses: pytorch/test-infra/.github/workflows/macos_job.yml@main -# with: -# repository: pytorch/vision -# timeout: 240 -# runner: ${{ matrix.runner }} -# test-infra-ref: main -# script: | -# set -euo pipefail - -# export PYTHON_VERSION=${{ matrix.python-version }} -# export GPU_ARCH_TYPE=cpu -# export GPU_ARCH_VERSION='' - -# ${CONDA_RUN} ./.github/scripts/unittest.sh + unittests-linux: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["linux.12xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: 3.8 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + unittests-macos: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["macos-m1-stable"] + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + repository: pytorch/vision + timeout: 240 + runner: ${{ matrix.runner }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ${CONDA_RUN} ./.github/scripts/unittest.sh unittests-windows: strategy: matrix: python-version: - # - "3.8" - # - "3.9" - # - "3.10" + - "3.8" + - "3.9" + - "3.10" - "3.11" - # - "3.12" + - "3.12" runner: ["windows.4xlarge"] gpu-arch-type: ["cpu"] include: @@ -106,67 +106,68 @@ jobs: ./.github/scripts/unittest.sh -# onnx: -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main -# with: -# repository: pytorch/vision -# test-infra-ref: main -# script: | -# set -euo pipefail - -# export PYTHON_VERSION=3.8 -# export GPU_ARCH_TYPE=cpu -# export GPU_ARCH_VERSION='' - -# ./.github/scripts/setup-env.sh - -# # Prepare conda -# CONDA_PATH=$(which conda) -# eval "$(${CONDA_PATH} shell.bash hook)" -# conda activate ci - -# echo '::group::Install ONNX' -# pip install --progress-bar=off onnx onnxruntime -# echo '::endgroup::' - -# echo '::group::Install testing utilities' -# pip install --progress-bar=off pytest -# echo '::endgroup::' - -# echo '::group::Run ONNX tests' -# pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_onnx.py -# echo '::endgroup::' - -# unittests-extended: -# uses: pytorch/test-infra/.github/workflows/linux_job.yml@main -# with: -# repository: pytorch/vision -# test-infra-ref: main -# script: | -# set -euo pipefail - -# export PYTHON_VERSION=3.8 -# export GPU_ARCH_TYPE=cpu -# export GPU_ARCH_VERSION='' - -# ./.github/scripts/setup-env.sh - -# # Prepare conda -# CONDA_PATH=$(which conda) -# eval "$(${CONDA_PATH} shell.bash hook)" -# conda activate ci - -# echo '::group::Pre-download model weights' -# 
pip install --progress-bar=off aiohttp aiofiles tqdm -# python scripts/download_model_urls.py -# echo '::endgroup::' - -# echo '::group::Install testing utilities' -# # TODO: remove the <8 constraint on pytest when https://github.com/pytorch/vision/issues/8238 is closed -# pip install --progress-bar=off "pytest<8" -# echo '::endgroup::' - -# echo '::group::Run extended unittests' -# export PYTORCH_TEST_WITH_EXTENDED=1 -# pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_extended_*.py -# echo '::endgroup::' + onnx: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install ONNX' + pip install --progress-bar=off onnx onnxruntime + echo '::endgroup::' + + echo '::group::Install testing utilities' + pip install --progress-bar=off pytest + echo '::endgroup::' + + echo '::group::Run ONNX tests' + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_onnx.py + echo '::endgroup::' + + unittests-extended: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + if: contains(github.event.pull_request.labels.*.name, 'run-extended') + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Pre-download model weights' + pip install --progress-bar=off aiohttp aiofiles tqdm + python scripts/download_model_urls.py + echo '::endgroup::' + + echo '::group::Install testing utilities' + # TODO: remove the <8 constraint on pytest when https://github.com/pytorch/vision/issues/8238 is closed + pip install --progress-bar=off "pytest<8" + echo '::endgroup::' + + echo '::group::Run extended unittests' + export PYTORCH_TEST_WITH_EXTENDED=1 + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_extended_*.py + echo '::endgroup::' diff --git a/README.md b/README.md index 52298e79049..60583c45256 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,8 @@ versions. 
| `torch` | `torchvision` | Python | | ------------------ | ------------------ | ------------------- | -| `main` / `nightly` | `main` / `nightly` | `>=3.8`, `<=3.12` | +| `main` / `nightly` | `main` / `nightly` | `>=3.9`, `<=3.12` | +| `2.4` | `0.19` | `>=3.8`, `<=3.12` | | `2.3` | `0.18` | `>=3.8`, `<=3.12` | | `2.2` | `0.17` | `>=3.8`, `<=3.11` | | `2.1` | `0.16` | `>=3.8`, `<=3.11` | diff --git a/benchmarks/encoding.py b/benchmarks/encoding.py deleted file mode 100644 index f994b03c783..00000000000 --- a/benchmarks/encoding.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import platform -import statistics - -import torch -import torch.utils.benchmark as benchmark -import torchvision - - -def print_machine_specs(): - print("Processor:", platform.processor()) - print("Platform:", platform.platform()) - print("Logical CPUs:", os.cpu_count()) - print(f"\nCUDA device: {torch.cuda.get_device_name()}") - print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") - - -def get_data(): - transform = torchvision.transforms.Compose( - [ - torchvision.transforms.PILToTensor(), - ] - ) - path = os.path.join(os.getcwd(), "data") - testset = torchvision.datasets.Places365( - root="./data", download=not os.path.exists(path), transform=transform, split="val" - ) - testloader = torch.utils.data.DataLoader( - testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] for r in batch] - ) - return next(iter(testloader)) - - -def run_benchmark(batch): - results = [] - for device in ["cpu", "cuda"]: - batch_device = [t.to(device=device) for t in batch] - for size in [1, 100, 1000]: - for num_threads in [1, 12, 24]: - for stmt, strat in zip( - [ - "[torchvision.io.encode_jpeg(img) for img in batch_input]", - "torchvision.io.encode_jpeg(batch_input)", - ], - ["unfused", "fused"], - ): - batch_input = batch_device[:size] - t = benchmark.Timer( - stmt=stmt, - setup="import torchvision", - globals={"batch_input": batch_input}, - label="Image Encoding", - sub_label=f"{device.upper()} ({strat}): {stmt}", - description=f"{size} images", - num_threads=num_threads, - ) - results.append(t.blocked_autorange()) - compare = benchmark.Compare(results) - compare.print() - - -if __name__ == "__main__": - print_machine_specs() - batch = get_data() - mean_h, mean_w = statistics.mean(t.shape[-2] for t in batch), statistics.mean(t.shape[-1] for t in batch) - print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}") - run_benchmark(batch) diff --git a/benchmarks/encoding_decoding.py b/benchmarks/encoding_decoding.py new file mode 100644 index 00000000000..0cafdb2d8a6 --- /dev/null +++ b/benchmarks/encoding_decoding.py @@ -0,0 +1,99 @@ +import os +import platform +import statistics + +import torch +import torch.utils.benchmark as benchmark +import torchvision + + +def print_machine_specs(): + print("Processor:", platform.processor()) + print("Platform:", platform.platform()) + print("Logical CPUs:", os.cpu_count()) + print(f"\nCUDA device: {torch.cuda.get_device_name()}") + print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") + + +def get_data(): + transform = torchvision.transforms.Compose( + [ + torchvision.transforms.PILToTensor(), + ] + ) + path = os.path.join(os.getcwd(), "data") + testset = torchvision.datasets.Places365( + root="./data", download=not os.path.exists(path), transform=transform, split="val" + ) + testloader = torch.utils.data.DataLoader( + testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] 
for r in batch] + ) + return next(iter(testloader)) + + +def run_encoding_benchmark(decoded_images): + results = [] + for device in ["cpu", "cuda"]: + decoded_images_device = [t.to(device=device) for t in decoded_images] + for size in [1, 100, 1000]: + for num_threads in [1, 12, 24]: + for stmt, strat in zip( + [ + "[torchvision.io.encode_jpeg(img) for img in decoded_images_device_trunc]", + "torchvision.io.encode_jpeg(decoded_images_device_trunc)", + ], + ["unfused", "fused"], + ): + decoded_images_device_trunc = decoded_images_device[:size] + t = benchmark.Timer( + stmt=stmt, + setup="import torchvision", + globals={"decoded_images_device_trunc": decoded_images_device_trunc}, + label="Image Encoding", + sub_label=f"{device.upper()} ({strat}): {stmt}", + description=f"{size} images", + num_threads=num_threads, + ) + results.append(t.blocked_autorange()) + compare = benchmark.Compare(results) + compare.print() + + +def run_decoding_benchmark(encoded_images): + results = [] + for device in ["cpu", "cuda"]: + for size in [1, 100, 1000]: + for num_threads in [1, 12, 24]: + for stmt, strat in zip( + [ + f"[torchvision.io.decode_jpeg(img, device='{device}') for img in encoded_images_trunc]", + f"torchvision.io.decode_jpeg(encoded_images_trunc, device='{device}')", + ], + ["unfused", "fused"], + ): + encoded_images_trunc = encoded_images[:size] + t = benchmark.Timer( + stmt=stmt, + setup="import torchvision", + globals={"encoded_images_trunc": encoded_images_trunc}, + label="Image Decoding", + sub_label=f"{device.upper()} ({strat}): {stmt}", + description=f"{size} images", + num_threads=num_threads, + ) + results.append(t.blocked_autorange()) + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + print_machine_specs() + decoded_images = get_data() + mean_h, mean_w = statistics.mean(t.shape[-2] for t in decoded_images), statistics.mean( + t.shape[-1] for t in decoded_images + ) + print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}") + run_encoding_benchmark(decoded_images) + encoded_images_cuda = torchvision.io.encode_jpeg([img.cuda() for img in decoded_images]) + encoded_images_cpu = [img.cpu() for img in encoded_images_cuda] + run_decoding_benchmark(encoded_images_cpu) diff --git a/docs/source/conf.py b/docs/source/conf.py index 88bf0b28643..df6cca3856a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -383,7 +383,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): f"``weights='DEFAULT'`` or ``weights='{str(list(obj)[0]).split('.')[1]}'``.", ] - if obj.__doc__ != "An enumeration.": + if obj.__doc__ is not None and obj.__doc__ != "An enumeration.": # We only show the custom enum doc if it was overridden. 
The default one from Python is "An enumeration" lines.append("") lines.append(obj.__doc__) diff --git a/packaging/torchvision/meta.yaml b/packaging/torchvision/meta.yaml index 950c8f20643..a847328a77e 100644 --- a/packaging/torchvision/meta.yaml +++ b/packaging/torchvision/meta.yaml @@ -12,13 +12,13 @@ requirements: - libpng - libjpeg-turbo - libwebp - - ffmpeg >=4.2 # [linux] + - ffmpeg >=4.2.2, <5.0.0 # [linux] host: - python - setuptools - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] - {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT') }} + {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT', 'pytorch') }} {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }} run: @@ -27,12 +27,12 @@ requirements: - numpy >=1.23.5 # [py >= 311] - requests - libpng - - ffmpeg >=4.2 # [linux] + - ffmpeg >=4.2.2, <5.0.0 # [linux] - libjpeg-turbo - libwebp - pillow >=5.3.0, !=8.3.* - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} + {{ environ.get('CONDA_PYTORCH_CONSTRAINT', 'pytorch') }} {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }} {% if build_variant == 'cpu' %} diff --git a/test/test_image.py b/test/test_image.py index 4e85206a054..48b1753e682 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -414,23 +414,32 @@ def test_read_interlaced_png(): @needs_cuda -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(IMAGE_ROOT, ".jpg")], -) @pytest.mark.parametrize("mode", [ImageReadMode.UNCHANGED, ImageReadMode.GRAY, ImageReadMode.RGB]) @pytest.mark.parametrize("scripted", (False, True)) -def test_decode_jpeg_cuda(mode, img_path, scripted): - if "cmyk" in img_path: - pytest.xfail("Decoding a CMYK jpeg isn't supported") +def test_decode_jpegs_cuda(mode, scripted): + encoded_images = [] + for jpeg_path in get_images(IMAGE_ROOT, ".jpg"): + if "cmyk" in jpeg_path: + continue + encoded_image = read_file(jpeg_path) + encoded_images.append(encoded_image) + decoded_images_cpu = decode_jpeg(encoded_images, mode=mode) + decode_fn = torch.jit.script(decode_jpeg) if scripted else decode_jpeg - data = read_file(img_path) - img = decode_image(data, mode=mode) - f = torch.jit.script(decode_jpeg) if scripted else decode_jpeg - img_nvjpeg = f(data, mode=mode, device="cuda") + # test multithreaded decoding + # in the current version we prevent this by using a lock but we still want to test it + num_workers = 10 - # Some difference expected between jpeg implementations - assert (img.float() - img_nvjpeg.cpu().float()).abs().mean() < 2 + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(decode_fn, encoded_images, mode, "cuda") for _ in range(num_workers)] + decoded_images_threaded = [future.result() for future in futures] + assert len(decoded_images_threaded) == num_workers + for decoded_images in decoded_images_threaded: + assert len(decoded_images) == len(encoded_images) + for decoded_image_cuda, decoded_image_cpu in zip(decoded_images, decoded_images_cpu): + assert decoded_image_cuda.shape == decoded_image_cpu.shape + assert decoded_image_cuda.dtype == decoded_image_cpu.dtype == torch.uint8 + assert (decoded_image_cuda.cpu().float() - decoded_image_cpu.cpu().float()).abs().mean() < 2 @needs_cuda @@ -441,12 +450,21 @@ def test_decode_image_cuda_raises(): @needs_cuda -@pytest.mark.parametrize("cuda_device", ("cuda", "cuda:0", torch.device("cuda"))) -def test_decode_jpeg_cuda_device_param(cuda_device): - """Make sure we can 
pass a string or a torch.device as device param""" +def test_decode_jpeg_cuda_device_param(): path = next(path for path in get_images(IMAGE_ROOT, ".jpg") if "cmyk" not in path) data = read_file(path) - decode_jpeg(data, device=cuda_device) + current_device = torch.cuda.current_device() + current_stream = torch.cuda.current_stream() + num_devices = torch.cuda.device_count() + devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)] + results = [] + for device in devices: + results.append(decode_jpeg(data, device=device)) + assert len(results) == len(devices) + for result in results: + assert torch.all(result.cpu() == results[0].cpu()) + assert current_device == torch.cuda.current_device() + assert current_stream == torch.cuda.current_stream() @needs_cuda @@ -454,12 +472,73 @@ def test_decode_jpeg_cuda_errors(): data = read_file(next(get_images(IMAGE_ROOT, ".jpg"))) with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): decode_jpeg(data.reshape(-1, 1), device="cuda") - with pytest.raises(RuntimeError, match="input tensor must be on CPU"): + with pytest.raises(ValueError, match="must be tensors"): + decode_jpeg([1, 2, 3]) + with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"): decode_jpeg(data.to("cuda"), device="cuda") with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): decode_jpeg(data.to(torch.float), device="cuda") - with pytest.raises(RuntimeError, match="Expected a cuda device"): - torch.ops.image.decode_jpeg_cuda(data, ImageReadMode.UNCHANGED.value, "cpu") + with pytest.raises(RuntimeError, match="Expected the device parameter to be a cuda device"): + torch.ops.image.decode_jpegs_cuda([data], ImageReadMode.UNCHANGED.value, "cpu") + with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"): + decode_jpeg( + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ) + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cuda"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cuda"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ], + device="cuda", + ) + + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cpu"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((100,), dtype=torch.float32), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((1, 100), dtype=torch.uint8), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Error while decoding JPEG images"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((100,), dtype=torch.uint8), + ], + device="cuda", + ) + + with pytest.raises(ValueError, match="Input list must contain at least one element"): + decode_jpeg([], device="cuda") def test_encode_jpeg_errors(): @@ -516,12 +595,10 @@ def test_encode_jpeg_cuda_device_param(): devices = ["cuda", torch.device("cuda")] + 
[torch.device(f"cuda:{i}") for i in range(num_devices)]
     results = []
     for device in devices:
-        print(f"python: device: {device}")
         results.append(encode_jpeg(data.to(device=device)))
     assert len(results) == len(devices)
     for result in results:
         assert torch.all(result.cpu() == results[0].cpu())
-    assert current_device == torch.cuda.current_device()
     assert current_stream == torch.cuda.current_stream()
diff --git a/test/test_io.py b/test/test_io.py
index c45180571f0..1b7b7eb15a1 100644
--- a/test/test_io.py
+++ b/test/test_io.py
@@ -6,7 +6,7 @@
 import pytest
 import torch
 import torchvision.io as io
-from common_utils import assert_equal
+from common_utils import assert_equal, cpu_and_cuda
 from torchvision import get_video_backend
 
 
@@ -255,18 +255,19 @@ def test_read_video_partially_corrupted_file(self):
             assert_equal(video, data)
 
     @pytest.mark.skipif(sys.platform == "win32", reason="temporarily disabled on Windows")
-    def test_write_video_with_audio(self, tmpdir):
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_write_video_with_audio(self, device, tmpdir):
         f_name = os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4")
         video_tensor, audio_tensor, info = io.read_video(f_name, pts_unit="sec")
 
         out_f_name = os.path.join(tmpdir, "testing.mp4")
         io.video.write_video(
             out_f_name,
-            video_tensor,
+            video_tensor.to(device),
             round(info["video_fps"]),
             video_codec="libx264rgb",
             options={"crf": "0"},
-            audio_array=audio_tensor,
+            audio_array=audio_tensor.to(device),
             audio_fps=info["audio_fps"],
             audio_codec="aac",
         )
diff --git a/test/test_ops.py b/test/test_ops.py
index 99b259f73f5..1ba7a2c9efa 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -507,6 +507,7 @@ def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype):
                 rois_dtype=rois_dtype,
             )
 
+    @pytest.mark.skip(reason="1/5000 flaky failure")
     @pytest.mark.parametrize("aligned", (True, False))
     @pytest.mark.parametrize("deterministic", (True, False))
     @pytest.mark.parametrize("x_dtype", (torch.float, torch.bfloat16))
diff --git a/torchvision/csrc/io/decoder/gpu/decoder.cpp b/torchvision/csrc/io/decoder/gpu/decoder.cpp
index 22cce7f87ab..f7377ede38b 100644
--- a/torchvision/csrc/io/decoder/gpu/decoder.cpp
+++ b/torchvision/csrc/io/decoder/gpu/decoder.cpp
@@ -143,7 +143,8 @@ int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) {
   uint8_t* frame_ptr = decoded_frame.data_ptr<uint8_t>();
   const uint8_t* const source_arr[] = {
       (const uint8_t* const)source_frame,
-      (const uint8_t* const)(source_frame + source_pitch * ((surface_height + 1) & ~1))};
+      (const uint8_t* const)(source_frame +
+                             source_pitch * ((surface_height + 1) & ~1))};
 
   auto err = nppiNV12ToRGB_709CSC_8u_P2C3R(
       source_arr,
diff --git a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp b/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp
deleted file mode 100644
index 26fecc3e1f3..00000000000
--- a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-#include "encode_decode_jpegs_cuda.h"
-
-#include <ATen/ATen.h>
-
-#if NVJPEG_FOUND
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <nvjpeg.h>
-#endif
-
-#include <string>
-
-namespace vision {
-namespace image {
-
-#if !NVJPEG_FOUND
-
-torch::Tensor decode_jpeg_cuda(
-    const torch::Tensor& data,
-    ImageReadMode mode,
-    torch::Device device) {
-  TORCH_CHECK(
-      false, "decode_jpeg_cuda: torchvision not compiled with nvJPEG support");
-}
-
-#else
-
-namespace {
-static nvjpegHandle_t nvjpeg_handle = nullptr;
-}
-
-torch::Tensor decode_jpeg_cuda(
-    const torch::Tensor& data,
-    ImageReadMode mode,
-    torch::Device device) {
-  C10_LOG_API_USAGE_ONCE(
-
"torchvision.csrc.io.image.cuda.decode_jpeg_cuda.decode_jpeg_cuda"); - TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); - - TORCH_CHECK( - !data.is_cuda(), - "The input tensor must be on CPU when decoding with nvjpeg") - - TORCH_CHECK( - data.dim() == 1 && data.numel() > 0, - "Expected a non empty 1-dimensional tensor"); - - TORCH_CHECK(device.is_cuda(), "Expected a cuda device") - - int major_version; - int minor_version; - nvjpegStatus_t get_major_property_status = - nvjpegGetProperty(MAJOR_VERSION, &major_version); - nvjpegStatus_t get_minor_property_status = - nvjpegGetProperty(MINOR_VERSION, &minor_version); - - TORCH_CHECK( - get_major_property_status == NVJPEG_STATUS_SUCCESS, - "nvjpegGetProperty failed: ", - get_major_property_status); - TORCH_CHECK( - get_minor_property_status == NVJPEG_STATUS_SUCCESS, - "nvjpegGetProperty failed: ", - get_minor_property_status); - if ((major_version < 11) || ((major_version == 11) && (minor_version < 6))) { - TORCH_WARN_ONCE( - "There is a memory leak issue in the nvjpeg library for CUDA versions < 11.6. " - "Make sure to rely on CUDA 11.6 or above before using decode_jpeg(..., device='cuda')."); - } - - at::cuda::CUDAGuard device_guard(device); - - // Create global nvJPEG handle - static std::once_flag nvjpeg_handle_creation_flag; - std::call_once(nvjpeg_handle_creation_flag, []() { - if (nvjpeg_handle == nullptr) { - nvjpegStatus_t create_status = nvjpegCreateSimple(&nvjpeg_handle); - - if (create_status != NVJPEG_STATUS_SUCCESS) { - // Reset handle so that one can still call the function again in the - // same process if there was a failure - free(nvjpeg_handle); - nvjpeg_handle = nullptr; - } - TORCH_CHECK( - create_status == NVJPEG_STATUS_SUCCESS, - "nvjpegCreateSimple failed: ", - create_status); - } - }); - - // Create the jpeg state - nvjpegJpegState_t jpeg_state; - nvjpegStatus_t state_status = - nvjpegJpegStateCreate(nvjpeg_handle, &jpeg_state); - - TORCH_CHECK( - state_status == NVJPEG_STATUS_SUCCESS, - "nvjpegJpegStateCreate failed: ", - state_status); - - auto datap = data.data_ptr(); - - // Get the image information - int num_channels; - nvjpegChromaSubsampling_t subsampling; - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - nvjpegStatus_t info_status = nvjpegGetImageInfo( - nvjpeg_handle, - datap, - data.numel(), - &num_channels, - &subsampling, - widths, - heights); - - if (info_status != NVJPEG_STATUS_SUCCESS) { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK(false, "nvjpegGetImageInfo failed: ", info_status); - } - - if (subsampling == NVJPEG_CSS_UNKNOWN) { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK(false, "Unknown NVJPEG chroma subsampling"); - } - - int width = widths[0]; - int height = heights[0]; - - nvjpegOutputFormat_t ouput_format; - int num_channels_output; - - switch (mode) { - case IMAGE_READ_MODE_UNCHANGED: - num_channels_output = num_channels; - // For some reason, setting output_format to NVJPEG_OUTPUT_UNCHANGED will - // not properly decode RGB images (it's fine for grayscale), so we set - // output_format manually here - if (num_channels == 1) { - ouput_format = NVJPEG_OUTPUT_Y; - } else if (num_channels == 3) { - ouput_format = NVJPEG_OUTPUT_RGB; - } else { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK( - false, - "When mode is UNCHANGED, only 1 or 3 input channels are allowed."); - } - break; - case IMAGE_READ_MODE_GRAY: - ouput_format = NVJPEG_OUTPUT_Y; - num_channels_output = 1; - break; - case IMAGE_READ_MODE_RGB: - ouput_format = 
NVJPEG_OUTPUT_RGB;
-      num_channels_output = 3;
-      break;
-    default:
-      nvjpegJpegStateDestroy(jpeg_state);
-      TORCH_CHECK(
-          false, "The provided mode is not supported for JPEG decoding on GPU");
-  }
-
-  auto out_tensor = torch::empty(
-      {int64_t(num_channels_output), int64_t(height), int64_t(width)},
-      torch::dtype(torch::kU8).device(device));
-
-  // nvjpegImage_t is a struct with
-  // - an array of pointers to each channel
-  // - the pitch for each channel
-  // which must be filled in manually
-  nvjpegImage_t out_image;
-
-  for (int c = 0; c < num_channels_output; c++) {
-    out_image.channel[c] = out_tensor[c].data_ptr<uint8_t>();
-    out_image.pitch[c] = width;
-  }
-  for (int c = num_channels_output; c < NVJPEG_MAX_COMPONENT; c++) {
-    out_image.channel[c] = nullptr;
-    out_image.pitch[c] = 0;
-  }
-
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream(device.index());
-
-  nvjpegStatus_t decode_status = nvjpegDecode(
-      nvjpeg_handle,
-      jpeg_state,
-      datap,
-      data.numel(),
-      ouput_format,
-      &out_image,
-      stream);
-
-  nvjpegJpegStateDestroy(jpeg_state);
-
-  TORCH_CHECK(
-      decode_status == NVJPEG_STATUS_SUCCESS,
-      "nvjpegDecode failed: ",
-      decode_status);
-
-  return out_tensor;
-}
-
-#endif // NVJPEG_FOUND
-
-} // namespace image
-} // namespace vision
diff --git a/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp
new file mode 100644
index 00000000000..6314ececef1
--- /dev/null
+++ b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp
@@ -0,0 +1,603 @@
+#include "decode_jpegs_cuda.h"
+#if !NVJPEG_FOUND
+namespace vision {
+namespace image {
+std::vector<torch::Tensor> decode_jpegs_cuda(
+    const std::vector<torch::Tensor>& encoded_images,
+    vision::image::ImageReadMode mode,
+    torch::Device device) {
+  TORCH_CHECK(
+      false, "decode_jpegs_cuda: torchvision not compiled with nvJPEG support");
+}
+} // namespace image
+} // namespace vision
+
+#else
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAEvent.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda_runtime_api.h>
+#include <nvjpeg.h>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <typeinfo>
+#include <vector>
+namespace vision {
+namespace image {
+
+std::mutex decoderMutex;
+std::unique_ptr<CUDAJpegDecoder> cudaJpegDecoder;
+
+std::vector<torch::Tensor> decode_jpegs_cuda(
+    const std::vector<torch::Tensor>& encoded_images,
+    vision::image::ImageReadMode mode,
+    torch::Device device) {
+  C10_LOG_API_USAGE_ONCE(
+      "torchvision.csrc.io.image.cuda.decode_jpegs_cuda.decode_jpegs_cuda");
+
+  std::lock_guard<std::mutex> lock(decoderMutex);
+  std::vector<torch::Tensor> contig_images;
+  contig_images.reserve(encoded_images.size());
+
+  TORCH_CHECK(
+      device.is_cuda(), "Expected the device parameter to be a cuda device");
+
+  for (auto& encoded_image : encoded_images) {
+    TORCH_CHECK(
+        encoded_image.dtype() == torch::kU8, "Expected a torch.uint8 tensor");
+
+    TORCH_CHECK(
+        !encoded_image.is_cuda(),
+        "The input tensor must be on CPU when decoding with nvjpeg")
+
+    TORCH_CHECK(
+        encoded_image.dim() == 1 && encoded_image.numel() > 0,
+        "Expected a non empty 1-dimensional tensor");
+
+    // nvjpeg requires images to be contiguous
+    if (encoded_image.is_contiguous()) {
+      contig_images.push_back(encoded_image);
+    } else {
+      contig_images.push_back(encoded_image.contiguous());
+    }
+  }
+
+  int major_version;
+  int minor_version;
+  nvjpegStatus_t get_major_property_status =
+      nvjpegGetProperty(MAJOR_VERSION, &major_version);
+  nvjpegStatus_t get_minor_property_status =
+      nvjpegGetProperty(MINOR_VERSION, &minor_version);
+
+  TORCH_CHECK(
+      get_major_property_status == NVJPEG_STATUS_SUCCESS,
+      "nvjpegGetProperty failed: ",
+      get_major_property_status);
+  TORCH_CHECK(
+      get_minor_property_status ==
NVJPEG_STATUS_SUCCESS,
+      "nvjpegGetProperty failed: ",
+      get_minor_property_status);
+  if ((major_version < 11) || ((major_version == 11) && (minor_version < 6))) {
+    TORCH_WARN_ONCE(
+        "There is a memory leak issue in the nvjpeg library for CUDA versions < 11.6. "
+        "Make sure to rely on CUDA 11.6 or above before using decode_jpeg(..., device='cuda').");
+  }
+
+  at::cuda::CUDAGuard device_guard(device);
+
+  if (cudaJpegDecoder == nullptr || device != cudaJpegDecoder->target_device) {
+    if (cudaJpegDecoder != nullptr)
+      cudaJpegDecoder.reset(new CUDAJpegDecoder(device));
+    else {
+      cudaJpegDecoder = std::make_unique<CUDAJpegDecoder>(device);
+      std::atexit([]() { cudaJpegDecoder.reset(); });
+    }
+  }
+
+  nvjpegOutputFormat_t output_format;
+
+  switch (mode) {
+    case vision::image::IMAGE_READ_MODE_UNCHANGED:
+      // Using NVJPEG_OUTPUT_UNCHANGED causes differently sized output channels,
+      // which is related to the subsampling used. I'm not sure why this is the
+      // case, but for now we're just using RGB and later removing channels from
+      // grayscale images.
+      output_format = NVJPEG_OUTPUT_UNCHANGED;
+      break;
+    case vision::image::IMAGE_READ_MODE_GRAY:
+      output_format = NVJPEG_OUTPUT_Y;
+      break;
+    case vision::image::IMAGE_READ_MODE_RGB:
+      output_format = NVJPEG_OUTPUT_RGB;
+      break;
+    default:
+      TORCH_CHECK(
+          false, "The provided mode is not supported for JPEG decoding on GPU");
+  }
+
+  try {
+    at::cuda::CUDAEvent event;
+    auto result = cudaJpegDecoder->decode_images(contig_images, output_format);
+    auto current_stream{
+        device.has_index() ? at::cuda::getCurrentCUDAStream(
+                                 cudaJpegDecoder->original_device.index())
+                           : at::cuda::getCurrentCUDAStream()};
+    event.record(cudaJpegDecoder->stream);
+    event.block(current_stream);
+    return result;
+  } catch (const std::exception& e) {
+    if (typeid(e) != typeid(std::runtime_error)) {
+      TORCH_CHECK(false, "Error while decoding JPEG images: ", e.what());
+    } else {
+      throw;
+    }
+  }
+}
+
+CUDAJpegDecoder::CUDAJpegDecoder(const torch::Device& target_device)
+    : original_device{torch::kCUDA, torch::cuda::current_device()},
+      target_device{target_device},
+      stream{
+          target_device.has_index()
+              ?
at::cuda::getStreamFromPool(false, target_device.index()) + : at::cuda::getStreamFromPool(false)} { + nvjpegStatus_t status; + + hw_decode_available = true; + status = nvjpegCreateEx( + NVJPEG_BACKEND_HARDWARE, + NULL, + NULL, + NVJPEG_FLAGS_DEFAULT, + &nvjpeg_handle); + if (status == NVJPEG_STATUS_ARCH_MISMATCH) { + status = nvjpegCreateEx( + NVJPEG_BACKEND_DEFAULT, + NULL, + NULL, + NVJPEG_FLAGS_DEFAULT, + &nvjpeg_handle); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize nvjpeg with default backend: ", + status); + hw_decode_available = false; + } else { + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize nvjpeg with hardware backend: ", + status); + } + + status = nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg state: ", + status); + + status = nvjpegDecoderCreate( + nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, &nvjpeg_decoder); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg decoder: ", + status); + + status = nvjpegDecoderStateCreate( + nvjpeg_handle, nvjpeg_decoder, &nvjpeg_decoupled_state); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg decoder state: ", + status); + + status = nvjpegBufferPinnedCreate(nvjpeg_handle, NULL, &pinned_buffers[0]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create pinned buffer: ", + status); + + status = nvjpegBufferPinnedCreate(nvjpeg_handle, NULL, &pinned_buffers[1]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create pinned buffer: ", + status); + + status = nvjpegBufferDeviceCreate(nvjpeg_handle, NULL, &device_buffer); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create device buffer: ", + status); + + status = nvjpegJpegStreamCreate(nvjpeg_handle, &jpeg_streams[0]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create jpeg stream: ", + status); + + status = nvjpegJpegStreamCreate(nvjpeg_handle, &jpeg_streams[1]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create jpeg stream: ", + status); + + status = nvjpegDecodeParamsCreate(nvjpeg_handle, &nvjpeg_decode_params); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create decode params: ", + status); +} + +CUDAJpegDecoder::~CUDAJpegDecoder() { + /* + The below code works on Mac and Linux, but fails on Windows. + This is because on Windows, the atexit hook which calls this + destructor executes after cuda is already shut down causing SIGSEGV. + We do not have a solution to this problem at the moment, so we'll + just leak the libnvjpeg & cuda variables for the time being and hope + that the CUDA runtime handles cleanup for us. + Please send a PR if you have a solution for this problem. 
+CUDAJpegDecoder::~CUDAJpegDecoder() {
+  /*
+  The below code works on Mac and Linux, but fails on Windows.
+  This is because on Windows the atexit hook, which calls this destructor,
+  executes after CUDA has already been shut down, causing a SIGSEGV.
+  We do not have a solution to this problem at the moment, so we'll
+  just leak the libnvjpeg & CUDA variables for the time being and hope
+  that the CUDA runtime handles cleanup for us.
+  Please send a PR if you have a solution for this problem.
+  */
+
+  // nvjpegStatus_t status;
+
+  // status = nvjpegDecodeParamsDestroy(nvjpeg_decode_params);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy nvjpeg decode params: ",
+  //     status);
+
+  // status = nvjpegJpegStreamDestroy(jpeg_streams[0]);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy jpeg stream: ",
+  //     status);
+
+  // status = nvjpegJpegStreamDestroy(jpeg_streams[1]);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy jpeg stream: ",
+  //     status);
+
+  // status = nvjpegBufferPinnedDestroy(pinned_buffers[0]);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy pinned buffer[0]: ",
+  //     status);
+
+  // status = nvjpegBufferPinnedDestroy(pinned_buffers[1]);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy pinned buffer[1]: ",
+  //     status);
+
+  // status = nvjpegBufferDeviceDestroy(device_buffer);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy device buffer: ",
+  //     status);
+
+  // status = nvjpegJpegStateDestroy(nvjpeg_decoupled_state);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy nvjpeg decoupled state: ",
+  //     status);
+
+  // status = nvjpegDecoderDestroy(nvjpeg_decoder);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy nvjpeg decoder: ",
+  //     status);
+
+  // status = nvjpegJpegStateDestroy(nvjpeg_state);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS,
+  //     "Failed to destroy nvjpeg state: ",
+  //     status);
+
+  // status = nvjpegDestroy(nvjpeg_handle);
+  // TORCH_CHECK(
+  //     status == NVJPEG_STATUS_SUCCESS, "nvjpegDestroy failed: ", status);
+}
+
+std::tuple<
+    std::vector<nvjpegImage_t>,
+    std::vector<torch::Tensor>,
+    std::vector<int>>
+CUDAJpegDecoder::prepare_buffers(
+    const std::vector<torch::Tensor>& encoded_images,
+    const nvjpegOutputFormat_t& output_format) {
+  /*
+  This function scans the encoded images' jpeg headers and
+  allocates decoding buffers based on the metadata found.
+
+  Args:
+  - encoded_images (std::vector<torch::Tensor>): a vector of tensors
+    containing the jpeg bitstreams to be decoded. Each tensor must have dtype
+    torch.uint8 and device cpu
+  - output_format (nvjpegOutputFormat_t): NVJPEG_OUTPUT_RGB, NVJPEG_OUTPUT_Y
+    or NVJPEG_OUTPUT_UNCHANGED
+
+  Returns:
+  - decoded_images (std::vector<nvjpegImage_t>): a vector of nvjpegImages
+    containing pointers to the memory of the decoded images
+  - output_tensors (std::vector<torch::Tensor>): a vector of Tensors
+    containing the decoded images. `decoded_images` points to the memory of
+    output_tensors
+  - channels (std::vector<int>): a vector of ints containing the number of
+    output image channels for every image
+  */
+
+  int width[NVJPEG_MAX_COMPONENT];
+  int height[NVJPEG_MAX_COMPONENT];
+  std::vector<int> channels(encoded_images.size());
+  nvjpegChromaSubsampling_t subsampling;
+  nvjpegStatus_t status;
+
+  std::vector<torch::Tensor> output_tensors{encoded_images.size()};
+  std::vector<nvjpegImage_t> decoded_images{encoded_images.size()};
+
+  for (std::vector<torch::Tensor>::size_type i = 0; i < encoded_images.size();
+       i++) {
+    // extract bitstream metadata to figure out the number of channels, height
+    // and width for every image
+    status = nvjpegGetImageInfo(
+        nvjpeg_handle,
+        (unsigned char*)encoded_images[i].data_ptr(),
+        encoded_images[i].numel(),
+        &channels[i],
+        &subsampling,
+        width,
+        height);
+    TORCH_CHECK(
+        status == NVJPEG_STATUS_SUCCESS, "Failed to get image info: ", status);
+
+    TORCH_CHECK(
+        subsampling != NVJPEG_CSS_UNKNOWN, "Unknown chroma subsampling");
+
+    // output channels may be different from the actual number of channels in
+    // the image, e.g. we decode a grayscale image as RGB and slice off the
+    // extra channels later
+    int output_channels = 3;
+    if (output_format == NVJPEG_OUTPUT_RGB ||
+        output_format == NVJPEG_OUTPUT_UNCHANGED) {
+      output_channels = 3;
+    } else if (output_format == NVJPEG_OUTPUT_Y) {
+      output_channels = 1;
+    }
+
+    // reserve output buffer
+    auto output_tensor = torch::empty(
+        {int64_t(output_channels), int64_t(height[0]), int64_t(width[0])},
+        torch::dtype(torch::kU8).device(target_device));
+    output_tensors[i] = output_tensor;
+
+    // fill nvjpegImage_t struct
+    for (int c = 0; c < output_channels; c++) {
+      decoded_images[i].channel[c] = output_tensor[c].data_ptr<uint8_t>();
+      decoded_images[i].pitch[c] = width[0];
+    }
+    for (int c = output_channels; c < NVJPEG_MAX_COMPONENT; c++) {
+      decoded_images[i].channel[c] = NULL;
+      decoded_images[i].pitch[c] = 0;
+    }
+  }
+  return {decoded_images, output_tensors, channels};
+}
+
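prepare_buffers reserves one CHW uint8 tensor per image and points each nvjpegImage_t plane at one channel of that tensor, so a plane's pitch is simply the image width in bytes. A rough Python sketch of that allocation, with illustrative dimensions (nothing below is part of the patch):

    import torch

    # One CHW uint8 buffer per image; each decoded plane lands in one channel.
    output_channels, height, width = 3, 480, 640
    out = torch.empty(
        (output_channels, height, width),
        dtype=torch.uint8,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    planes = [out[c].data_ptr() for c in range(output_channels)]  # nvjpegImage_t.channel
    pitch = width  # bytes per row of a single uint8 plane
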
+std::vector<torch::Tensor> CUDAJpegDecoder::decode_images(
+    const std::vector<torch::Tensor>& encoded_images,
+    const nvjpegOutputFormat_t& output_format) {
+  /*
+  This function decodes a batch of jpeg bitstreams.
+  We scan all encoded bitstreams and sort them into two groups:
+  1. Baseline JPEGs: Can be decoded with hardware support on A100+ GPUs.
+  2. Other JPEGs (e.g. progressive JPEGs): Can also be decoded on the
+  GPU (albeit with software support only) but need some preprocessing on the
+  host first.
+
+  See
+  https://github.com/NVIDIA/CUDALibrarySamples/blob/f17940ac4e705bf47a8c39f5365925c1665f6c98/nvJPEG/nvJPEG-Decoder/nvjpegDecoder.cpp#L33
+  for reference.
+
+  Args:
+  - encoded_images (std::vector<torch::Tensor>): a vector of tensors
+    containing the jpeg bitstreams to be decoded
+  - output_format (nvjpegOutputFormat_t): NVJPEG_OUTPUT_RGB, NVJPEG_OUTPUT_Y
+    or NVJPEG_OUTPUT_UNCHANGED
+
+  Returns:
+  - output_tensors (std::vector<torch::Tensor>): a vector of Tensors
+    containing the decoded images
+  */
+
+  auto [decoded_imgs_buf, output_tensors, channels] =
+      prepare_buffers(encoded_images, output_format);
+
+  nvjpegStatus_t status;
+  cudaError_t cudaStatus;
+
+  cudaStatus = cudaStreamSynchronize(stream);
+  TORCH_CHECK(
+      cudaStatus == cudaSuccess,
+      "Failed to synchronize CUDA stream: ",
+      cudaStatus);
+
+  // baseline JPEGs can be batch decoded with hardware support on A100+ GPUs;
+  // ultra fast!
+  std::vector<const unsigned char*> hw_input_buffer;
+  std::vector<size_t> hw_input_buffer_size;
+  std::vector<nvjpegImage_t> hw_output_buffer;
+
+  // other JPEG types, such as progressive JPEGs, can be decoded one-by-one in
+  // software only; slow :(
+  std::vector<const unsigned char*> sw_input_buffer;
+  std::vector<size_t> sw_input_buffer_size;
+  std::vector<nvjpegImage_t> sw_output_buffer;
+
+  if (hw_decode_available) {
+    for (std::vector<torch::Tensor>::size_type i = 0;
+         i < encoded_images.size();
+         ++i) {
+      // extract bitstream metadata to figure out whether a bitstream can be
+      // hardware-decoded
+      nvjpegJpegStreamParseHeader(
+          nvjpeg_handle,
+          encoded_images[i].data_ptr<uint8_t>(),
+          encoded_images[i].numel(),
+          jpeg_streams[0]);
+      int isSupported = -1;
+      nvjpegDecodeBatchedSupported(
+          nvjpeg_handle, jpeg_streams[0], &isSupported);
+
+      if (isSupported == 0) {
+        hw_input_buffer.push_back(encoded_images[i].data_ptr<uint8_t>());
+        hw_input_buffer_size.push_back(encoded_images[i].numel());
+        hw_output_buffer.push_back(decoded_imgs_buf[i]);
+      } else {
+        sw_input_buffer.push_back(encoded_images[i].data_ptr<uint8_t>());
+        sw_input_buffer_size.push_back(encoded_images[i].numel());
+        sw_output_buffer.push_back(decoded_imgs_buf[i]);
+      }
+    }
+  } else {
+    for (std::vector<torch::Tensor>::size_type i = 0;
+         i < encoded_images.size();
+         ++i) {
+      sw_input_buffer.push_back(encoded_images[i].data_ptr<uint8_t>());
+      sw_input_buffer_size.push_back(encoded_images[i].numel());
+      sw_output_buffer.push_back(decoded_imgs_buf[i]);
+    }
+  }
+
+  if (hw_input_buffer.size() > 0) {
+    // UNCHANGED behaves weird, so we use RGB instead
+    status = nvjpegDecodeBatchedInitialize(
+        nvjpeg_handle,
+        nvjpeg_state,
+        hw_input_buffer.size(),
+        1,
+        output_format == NVJPEG_OUTPUT_UNCHANGED ? NVJPEG_OUTPUT_RGB
+                                                 : output_format);
+    TORCH_CHECK(
+        status == NVJPEG_STATUS_SUCCESS,
+        "Failed to initialize batch decoding: ",
+        status);
+
+    status = nvjpegDecodeBatched(
+        nvjpeg_handle,
+        nvjpeg_state,
+        hw_input_buffer.data(),
+        hw_input_buffer_size.data(),
+        hw_output_buffer.data(),
+        stream);
+    TORCH_CHECK(
+        status == NVJPEG_STATUS_SUCCESS, "Failed to decode batch: ", status);
+  }
+
+  if (sw_input_buffer.size() > 0) {
+    status =
+        nvjpegStateAttachDeviceBuffer(nvjpeg_decoupled_state, device_buffer);
+    TORCH_CHECK(
+        status == NVJPEG_STATUS_SUCCESS,
+        "Failed to attach device buffer: ",
+        status);
+    int buffer_index = 0;
+    // UNCHANGED behaves weird, so we use RGB instead
+    status = nvjpegDecodeParamsSetOutputFormat(
+        nvjpeg_decode_params,
+        output_format == NVJPEG_OUTPUT_UNCHANGED ? NVJPEG_OUTPUT_RGB
+                                                 : output_format);
+    TORCH_CHECK(
+        status == NVJPEG_STATUS_SUCCESS,
+        "Failed to set output format: ",
+        status);
+    for (std::vector<torch::Tensor>::size_type i = 0;
+         i < sw_input_buffer.size();
+         ++i) {
+      status = nvjpegJpegStreamParse(
+          nvjpeg_handle,
+          sw_input_buffer[i],
+          sw_input_buffer_size[i],
+          0,
+          0,
+          jpeg_streams[buffer_index]);
+      TORCH_CHECK(
+          status == NVJPEG_STATUS_SUCCESS,
+          "Failed to parse jpeg stream: ",
+          status);
+
+      status = nvjpegStateAttachPinnedBuffer(
+          nvjpeg_decoupled_state, pinned_buffers[buffer_index]);
+      TORCH_CHECK(
+          status == NVJPEG_STATUS_SUCCESS,
+          "Failed to attach pinned buffer: ",
+          status);
+
+      status = nvjpegDecodeJpegHost(
+          nvjpeg_handle,
+          nvjpeg_decoder,
+          nvjpeg_decoupled_state,
+          nvjpeg_decode_params,
+          jpeg_streams[buffer_index]);
+      TORCH_CHECK(
+          status == NVJPEG_STATUS_SUCCESS,
+          "Failed to decode jpeg stream: ",
+          status);
+
+      cudaStatus = cudaStreamSynchronize(stream);
+      TORCH_CHECK(
+          cudaStatus == cudaSuccess,
+          "Failed to synchronize CUDA stream: ",
+          cudaStatus);
+
+      status = nvjpegDecodeJpegTransferToDevice(
+          nvjpeg_handle,
+          nvjpeg_decoder,
+          nvjpeg_decoupled_state,
+          jpeg_streams[buffer_index],
+          stream);
+      TORCH_CHECK(
+          status == NVJPEG_STATUS_SUCCESS,
+          "Failed to transfer jpeg to device: ",
+          status);
+
+      buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode
+                                       // to avoid an extra sync
+
+      status = nvjpegDecodeJpegDevice(
+          nvjpeg_handle,
+          nvjpeg_decoder,
+          nvjpeg_decoupled_state,
+          &sw_output_buffer[i],
+          stream);
+      TORCH_CHECK(
+          status == NVJPEG_STATUS_SUCCESS,
+          "Failed to decode jpeg stream: ",
+          status);
+    }
+  }
+
+  cudaStatus = cudaStreamSynchronize(stream);
+  TORCH_CHECK(
+      cudaStatus == cudaSuccess,
+      "Failed to synchronize CUDA stream: ",
+      cudaStatus);
+
+  // prune extraneous channels from single-channel images
+  if (output_format == NVJPEG_OUTPUT_UNCHANGED) {
+    for (std::vector<torch::Tensor>::size_type i = 0;
+         i < output_tensors.size();
+         ++i) {
+      if (channels[i] == 1) {
+        output_tensors[i] = output_tensors[i][0].unsqueeze(0).clone();
+      }
+    }
+  }
+
+  return output_tensors;
+}
+
+} // namespace image
+} // namespace vision
+
+#endif
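The channel pruning at the end of decode_images handles the UNCHANGED mode: grayscale inputs that were decoded as RGB are cut back to a single channel, and the clone keeps the result contiguous rather than a view into the 3-channel buffer. A short sketch of the same slicing on a dummy tensor (illustrative only):

    import torch

    # A grayscale image that was decoded as RGB: keep channel 0, drop the rest.
    rgb = torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8)
    gray = rgb[0].unsqueeze(0).clone()  # (1, 32, 32), contiguous copy
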
diff --git a/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h
new file mode 100644
index 00000000000..2458a103a3a
--- /dev/null
+++ b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h
@@ -0,0 +1,45 @@
+#pragma once
+#include <torch/types.h>
+#include <vector>
+#include "../image_read_mode.h"
+
+#if NVJPEG_FOUND
+#include <c10/cuda/CUDAStream.h>
+#include <nvjpeg.h>
+
+namespace vision {
+namespace image {
+class CUDAJpegDecoder {
+ public:
+  CUDAJpegDecoder(const torch::Device& target_device);
+  ~CUDAJpegDecoder();
+
+  std::vector<torch::Tensor> decode_images(
+      const std::vector<torch::Tensor>& encoded_images,
+      const nvjpegOutputFormat_t& output_format);
+
+  const torch::Device original_device;
+  const torch::Device target_device;
+  const c10::cuda::CUDAStream stream;
+
+ private:
+  std::tuple<
+      std::vector<nvjpegImage_t>,
+      std::vector<torch::Tensor>,
+      std::vector<int>>
+  prepare_buffers(
+      const std::vector<torch::Tensor>& encoded_images,
+      const nvjpegOutputFormat_t& output_format);
+  nvjpegJpegState_t nvjpeg_state;
+  nvjpegJpegState_t nvjpeg_decoupled_state;
+  nvjpegBufferPinned_t pinned_buffers[2];
+  nvjpegBufferDevice_t device_buffer;
+  nvjpegJpegStream_t jpeg_streams[2];
+  nvjpegDecodeParams_t nvjpeg_decode_params;
+  nvjpegJpegDecoder_t nvjpeg_decoder;
+  bool hw_decode_available{false};
+  nvjpegHandle_t nvjpeg_handle;
+};
+} // namespace image
+} // namespace vision
+#endif
diff --git a/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h b/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h
index 7723d11d621..3fdf715b00f 100644
--- a/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h
+++ b/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h
@@ -2,16 +2,55 @@
 
 #include <torch/types.h>
 #include "../image_read_mode.h"
+#include "decode_jpegs_cuda.h"
 #include "encode_jpegs_cuda.h"
 
 namespace vision {
 namespace image {
 
-C10_EXPORT torch::Tensor decode_jpeg_cuda(
-    const torch::Tensor& data,
-    ImageReadMode mode,
+/*
+
+Fast jpeg decoding with CUDA.
+A100+ GPUs have dedicated hardware support for jpeg decoding.
+
+Args:
+  - encoded_images (const std::vector<torch::Tensor>&): a vector of tensors
+    containing the jpeg bitstreams to be decoded. Each tensor must have dtype
+    torch.uint8 and device cpu
+  - mode (ImageReadMode): IMAGE_READ_MODE_UNCHANGED, IMAGE_READ_MODE_GRAY and
+    IMAGE_READ_MODE_RGB are supported
+  - device (torch::Device): The desired CUDA device to run the decoding on and
+    which will contain the output tensors
+
+Returns:
+  - decoded_images (std::vector<torch::Tensor>): a vector of torch::Tensors of
+    dtype torch.uint8 on the specified device containing the decoded images
+
+Notes:
+  - If a single image fails, the whole batch fails.
+  - This function is thread-safe
+*/
+C10_EXPORT std::vector<torch::Tensor> decode_jpegs_cuda(
+    const std::vector<torch::Tensor>& encoded_images,
+    vision::image::ImageReadMode mode,
     torch::Device device);
 
+/*
+Fast jpeg encoding with CUDA.
+
+Args:
+  - decoded_images (const std::vector<torch::Tensor>&): a vector of contiguous
+    CUDA tensors of dtype torch.uint8 to be encoded.
+  - quality (int64_t): 0-100, 75 is the default
+
+Returns:
+  - encoded_images (std::vector<torch::Tensor>): a vector of CUDA
+    torch::Tensors of dtype torch.uint8 containing the encoded images
+
+Notes:
+  - If a single image fails, the whole batch fails.
+  - This function is thread-safe
+*/
 C10_EXPORT std::vector<torch::Tensor> encode_jpegs_cuda(
     const std::vector<torch::Tensor>& decoded_images,
     const int64_t quality);
diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp
index 1b412c8d84d..8ca2f814996 100644
--- a/torchvision/csrc/io/image/image.cpp
+++ b/torchvision/csrc/io/image/image.cpp
@@ -27,7 +27,7 @@ static auto registry =
         .op("image::write_file", &write_file)
         .op("image::decode_image(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor",
             &decode_image)
-        .op("image::decode_jpeg_cuda", &decode_jpeg_cuda)
+        .op("image::decode_jpegs_cuda", &decode_jpegs_cuda)
         .op("image::encode_jpegs_cuda", &encode_jpegs_cuda)
         .op("image::_jpeg_version", &_jpeg_version)
         .op("image::_is_compiled_against_turbo", &_is_compiled_against_turbo);
diff --git a/torchvision/csrc/ops/autocast/nms_kernel.cpp b/torchvision/csrc/ops/autocast/nms_kernel.cpp
index 2acd0f5d0dc..39482ceadbf 100644
--- a/torchvision/csrc/ops/autocast/nms_kernel.cpp
+++ b/torchvision/csrc/ops/autocast/nms_kernel.cpp
@@ -38,5 +38,12 @@ TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
           (nms_autocast<c10::DispatchKey::AutocastCPU, c10::DeviceType::CPU>)));
 }
 
+TORCH_LIBRARY_IMPL(torchvision, AutocastXPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::nms"),
+      TORCH_FN(
+          (nms_autocast<c10::DispatchKey::AutocastXPU, c10::DeviceType::XPU>)));
+}
+
 } // namespace ops
 } // namespace vision
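These AutocastXPU registrations (for nms above, and for roi_align just below) give the ops the same autocast behavior on XPU that they already have on CUDA and CPU: low-precision inputs are cast to float32 before dispatch. A sketch of what that enables from Python, shown on CUDA since XPU hardware is less common (tensors are illustrative):

    import torch
    from torchvision.ops import nms

    boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0]], device="cuda")
    scores = torch.tensor([0.9, 0.8], device="cuda")
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        # half inputs are accepted; the autocast wrapper casts back to fp32
        keep = nms(boxes.half(), scores.half(), iou_threshold=0.5)
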
diff --git a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
index 919393a5ef0..3eb8443b54d 100644
--- a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
@@ -48,5 +48,13 @@ TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
               c10::DeviceType::CPU>)));
 }
 
+TORCH_LIBRARY_IMPL(torchvision, AutocastXPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      TORCH_FN((roi_align_autocast<
+                c10::DispatchKey::AutocastXPU,
+                c10::DeviceType::XPU>)));
+}
+
 } // namespace ops
 } // namespace vision
diff --git a/torchvision/datasets/eurosat.py b/torchvision/datasets/eurosat.py
index 3f490b11902..c6571d2abab 100644
--- a/torchvision/datasets/eurosat.py
+++ b/torchvision/datasets/eurosat.py
@@ -9,6 +9,9 @@ class EuroSAT(ImageFolder):
     """RGB version of the `EuroSAT <https://github.com/phelber/eurosat>`_ Dataset.
 
+    For the MS version of the dataset, see
+    `TorchGeo <https://github.com/microsoft/torchgeo>`__.
+
     Args:
         root (str or ``pathlib.Path``): Root directory of dataset where ``root/eurosat`` exists.
         transform (callable, optional): A function/transform that takes in a PIL image
@@ -53,7 +56,7 @@ def download(self) -> None:
         os.makedirs(self._base_folder, exist_ok=True)
 
         download_and_extract_archive(
-            "https://madm.dfki.de/files/sentinel/EuroSAT.zip",
+            "https://huggingface.co/datasets/torchgeo/eurosat/resolve/c877bcd43f099cd0196738f714544e355477f3fd/EuroSAT.zip",
            download_root=self._base_folder,
             md5="c8fa014336c82ac7804f0398fcb19387",
         )
diff --git a/torchvision/datasets/kinetics.py b/torchvision/datasets/kinetics.py
index 42d32533953..868c08e2c30 100644
--- a/torchvision/datasets/kinetics.py
+++ b/torchvision/datasets/kinetics.py
@@ -16,7 +16,7 @@
 from .vision import VisionDataset
 
 
-def _dl_wrap(tarpath: str, videopath: str, line: str) -> None:
+def _dl_wrap(tarpath: Union[str, Path], videopath: Union[str, Path], line: str) -> None:
     download_and_extract_archive(line, tarpath, videopath)
diff --git a/torchvision/datasets/sbd.py b/torchvision/datasets/sbd.py
index 3012c0cd974..4b9ccb75eb9 100644
--- a/torchvision/datasets/sbd.py
+++ b/torchvision/datasets/sbd.py
@@ -46,7 +46,7 @@ class SBDataset(VisionDataset):
     md5 = "82b4d87ceb2ed10f6038a1cba92111cb"
     filename = "benchmark.tgz"
 
-    voc_train_url = "http://home.bharathh.info/pubs/codes/SBD/train_noval.txt"
+    voc_train_url = "https://www.cs.cornell.edu/~bharathh/train_noval.txt"
     voc_split_filename = "train_noval.txt"
     voc_split_md5 = "79bff800c5f0b1ec6b21080a3c066722"
diff --git a/torchvision/datasets/utils.py b/torchvision/datasets/utils.py
index 344056d67db..f65eb535459 100644
--- a/torchvision/datasets/utils.py
+++ b/torchvision/datasets/utils.py
@@ -26,7 +26,7 @@
 def _urlretrieve(url: str, filename: Union[str, pathlib.Path], chunk_size: int = 1024 * 32) -> None:
     with urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": USER_AGENT})) as response:
-        with open(filename, "wb") as fh, tqdm(total=response.length) as pbar:
+        with open(filename, "wb") as fh, tqdm(total=response.length, unit="B", unit_scale=True) as pbar:
             while chunk := response.read(chunk_size):
                 fh.write(chunk)
                 pbar.update(len(chunk))
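The tqdm change only affects how the download counter is rendered: with unit="B" and unit_scale=True the bar shows human-readable byte sizes instead of raw integers. A small self-contained illustration:

    from tqdm import tqdm

    # Renders progress as e.g. "250kB/1.00MB" rather than "250000/1000000".
    with tqdm(total=1_000_000, unit="B", unit_scale=True) as pbar:
        pbar.update(250_000)
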
diff --git a/torchvision/io/image.py b/torchvision/io/image.py
index b2b478c8e7d..58047d1a419 100644
--- a/torchvision/io/image.py
+++ b/torchvision/io/image.py
@@ -143,22 +143,28 @@ def write_png(input: torch.Tensor, filename: str, compression_level: int = 6):
 
 
 def decode_jpeg(
-    input: torch.Tensor,
+    input: Union[torch.Tensor, List[torch.Tensor]],
     mode: ImageReadMode = ImageReadMode.UNCHANGED,
-    device: str = "cpu",
+    device: Union[str, torch.device] = "cpu",
     apply_exif_orientation: bool = False,
-) -> torch.Tensor:
+) -> Union[torch.Tensor, List[torch.Tensor]]:
     """
-    Decodes a JPEG image into a 3 dimensional RGB or grayscale Tensor.
+    Decode JPEG image(s) into 3 dimensional RGB or grayscale Tensor(s).
 
     The values of the output tensor are uint8 between 0 and 255.
 
+    .. note::
+        When using a CUDA device, passing a list of tensors is more efficient than repeated individual calls to ``decode_jpeg``.
+        When using CPU the performance is equivalent.
+        The CUDA version of this function is explicitly designed to be thread-safe.
+        This function does not return partial results in case of an error.
+
     Args:
-        input (Tensor[1]): a one dimensional uint8 tensor containing
-            the raw bytes of the JPEG image. This tensor must be on CPU,
+        input (Tensor[1] or list[Tensor[1]]): a (list of) one dimensional uint8 tensor(s) containing
+            the raw bytes of the JPEG image. The tensor(s) must be on CPU,
             regardless of the ``device`` parameter.
         mode (ImageReadMode): the read mode used for optionally
-            converting the image. The supported modes are: ``ImageReadMode.UNCHANGED``,
+            converting the image(s). The supported modes are: ``ImageReadMode.UNCHANGED``,
             ``ImageReadMode.GRAY`` and ``ImageReadMode.RGB``
             Default: ``ImageReadMode.UNCHANGED``.
             See ``ImageReadMode`` class for more information on various
@@ -177,16 +183,36 @@ def decode_jpeg(
             Default: False. Only implemented for JPEG format on CPU.
 
     Returns:
-        output (Tensor[image_channels, image_height, image_width])
+        output (Tensor[image_channels, image_height, image_width] or list[Tensor[image_channels, image_height, image_width]]):
+            The values of the output tensor(s) are uint8 between 0 and 255.
+            ``output.device`` will be set to the specified ``device``
     """
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(decode_jpeg)
-    device = torch.device(device)
-    if device.type == "cuda":
-        output = torch.ops.image.decode_jpeg_cuda(input, mode.value, device)
-    else:
-        output = torch.ops.image.decode_jpeg(input, mode.value, apply_exif_orientation)
-    return output
+    if isinstance(device, str):
+        device = torch.device(device)
+
+    if isinstance(input, list):
+        if len(input) == 0:
+            raise ValueError("Input list must contain at least one element")
+        if not all(isinstance(t, torch.Tensor) for t in input):
+            raise ValueError("All elements of the input list must be tensors.")
+        if not all(t.device.type == "cpu" for t in input):
+            raise ValueError("Input list must contain tensors on CPU.")
+        if device.type == "cuda":
+            return torch.ops.image.decode_jpegs_cuda(input, mode.value, device)
+        else:
+            return [torch.ops.image.decode_jpeg(img, mode.value, apply_exif_orientation) for img in input]
+
+    else:  # input is tensor
+        if input.device.type != "cpu":
+            raise ValueError("Input tensor must be a CPU tensor")
+        if device.type == "cuda":
+            return torch.ops.image.decode_jpegs_cuda([input], mode.value, device)[0]
+        else:
+            return torch.ops.image.decode_jpeg(input, mode.value, apply_exif_orientation)
 
 
 def encode_jpeg(
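The list-input path above is what enables batched GPU decoding from Python. A usage sketch, assuming a CUDA build of torchvision and two illustrative file paths:

    import torch
    from torchvision.io import read_file, decode_jpeg

    # Encoded bytes stay on CPU; decoded outputs land on the CUDA device.
    encoded = [read_file(p) for p in ["img0.jpg", "img1.jpg"]]  # illustrative paths
    decoded = decode_jpeg(encoded, device="cuda")  # list of uint8 CHW CUDA tensors
    print([img.shape for img in decoded])
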
diff --git a/torchvision/io/video.py b/torchvision/io/video.py
index 9b2eacbab11..c8f7d2ebde2 100644
--- a/torchvision/io/video.py
+++ b/torchvision/io/video.py
@@ -80,7 +80,7 @@ def write_video(
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(write_video)
     _check_av_available()
-    video_array = torch.as_tensor(video_array, dtype=torch.uint8).numpy()
+    video_array = torch.as_tensor(video_array, dtype=torch.uint8).numpy(force=True)
 
     # PyAV does not support floating point numbers with decimal point
     # and will throw OverflowException in case this is not the case
@@ -115,7 +115,7 @@ def write_video(
             audio_sample_fmt = container.streams.audio[0].format.name
             format_dtype = np.dtype(audio_format_dtypes[audio_sample_fmt])
-            audio_array = torch.as_tensor(audio_array).numpy().astype(format_dtype)
+            audio_array = torch.as_tensor(audio_array).numpy(force=True).astype(format_dtype)
 
             frame = av.AudioFrame.from_ndarray(audio_array, format=audio_sample_fmt, layout=audio_layout)
diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py
index 69f7f1e016b..c00723a4534 100644
--- a/torchvision/io/video_reader.py
+++ b/torchvision/io/video_reader.py
@@ -251,7 +251,7 @@ def get_metadata(self) -> Dict[str, Any]:
                 rate_n = "framerate"
                 metadata[stream.type] = {rate_n: [], "duration": []}
 
-            rate = stream.average_rate if stream.average_rate is not None else stream.sample_rate
+            rate = getattr(stream, "average_rate", None) or stream.sample_rate
             metadata[stream.type]["duration"].append(float(stream.duration * stream.time_base))
             metadata[stream.type][rate_n].append(float(rate))
diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py
index 0494a7329ee..5d7ff0ea433 100644
--- a/torchvision/models/detection/keypoint_rcnn.py
+++ b/torchvision/models/detection/keypoint_rcnn.py
@@ -157,7 +157,6 @@ class KeypointRCNN(FasterRCNN):
         >>>                          box_roi_pool=roi_pooler,
         >>>                          keypoint_roi_pool=keypoint_roi_pooler)
         >>> model.eval()
-        >>> model.eval()
         >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
         >>> predictions = model(x)
     """
diff --git a/torchvision/transforms/_presets.py b/torchvision/transforms/_presets.py
index d7f88bdb992..fb6f4ad5ca5 100644
--- a/torchvision/transforms/_presets.py
+++ b/torchvision/transforms/_presets.py
@@ -2,7 +2,7 @@
 This file is part of the private API. Please do not use directly these classes as they will be modified on
 future versions without warning. The classes should be accessed only via the transforms argument of Weights.
 """
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 import torch
 from torch import nn, Tensor
@@ -87,7 +87,7 @@ def __init__(
         self,
         *,
         crop_size: Tuple[int, int],
-        resize_size: Tuple[int, int],
+        resize_size: Union[Tuple[int], Tuple[int, int]],
         mean: Tuple[float, ...] = (0.43216, 0.394666, 0.37645),
         std: Tuple[float, ...] = (0.22803, 0.22145, 0.216989),
         interpolation: InterpolationMode = InterpolationMode.BILINEAR,
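Both write_video call sites now use Tensor.numpy(force=True), which falls back to a host copy (detach().cpu()) when a zero-copy conversion is impossible, so CUDA inputs no longer raise. A small sketch of the difference, assuming a CUDA device:

    import torch

    t = torch.arange(6, device="cuda").reshape(2, 3)
    arr = t.numpy(force=True)  # implicit copy to host memory
    # t.numpy() would raise TypeError here, since t is not a CPU tensor
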
diff --git a/torchvision/transforms/v2/functional/_augment.py b/torchvision/transforms/v2/functional/_augment.py
index 60b49099fc5..a904d8d7cbd 100644
--- a/torchvision/transforms/v2/functional/_augment.py
+++ b/torchvision/transforms/v2/functional/_augment.py
@@ -80,9 +80,12 @@ def jpeg_image(image: torch.Tensor, quality: int) -> torch.Tensor:
 
     images = []
     for i in range(image.shape[0]):
+        # isinstance checks are needed for torchscript.
         encoded_image = encode_jpeg(image[i], quality=quality)
-        assert isinstance(encoded_image, torch.Tensor)  # For torchscript
-        images.append(decode_jpeg(encoded_image))
+        assert isinstance(encoded_image, torch.Tensor)
+        decoded_image = decode_jpeg(encoded_image)
+        assert isinstance(decoded_image, torch.Tensor)
+        images.append(decoded_image)
 
     images = torch.stack(images, dim=0).view(original_shape)
     return images
diff --git a/torchvision/transforms/v2/functional/_misc.py b/torchvision/transforms/v2/functional/_misc.py
index 8b20473e6e7..f40bf117753 100644
--- a/torchvision/transforms/v2/functional/_misc.py
+++ b/torchvision/transforms/v2/functional/_misc.py
@@ -84,9 +84,9 @@ def gaussian_blur(inpt: torch.Tensor, kernel_size: List[int], sigma: Optional[Li
 
 
 def _get_gaussian_kernel1d(kernel_size: int, sigma: float, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-    lim = (kernel_size - 1) / (2.0 * math.sqrt(2.0) * sigma)
+    lim = (kernel_size - 1) / (2.0 * math.sqrt(2.0))
     x = torch.linspace(-lim, lim, steps=kernel_size, dtype=dtype, device=device)
-    kernel1d = torch.softmax(x.pow_(2).neg_(), dim=0)
+    kernel1d = torch.softmax(x.div(sigma).pow(2).neg(), dim=0)
     return kernel1d
 
 
@@ -119,7 +119,7 @@ def gaussian_blur_image(
     if isinstance(sigma, (list, tuple)):
         length = len(sigma)
         if length == 1:
-            s = float(sigma[0])
+            s = sigma[0]
             sigma = [s, s]
         elif length != 2:
             raise ValueError(f"If sigma is a sequence, its length should be 2. Got {length}")
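The _get_gaussian_kernel1d refactor moves the division by sigma out of the linspace limits and into the softmax argument; for scalar sigma the two formulations produce identical kernels, while the new one no longer requires sigma to be a plain float. A quick numerical check of that equivalence (values are illustrative):

    import math
    import torch

    kernel_size, sigma = 5, 1.7

    lim_old = (kernel_size - 1) / (2.0 * math.sqrt(2.0) * sigma)
    x_old = torch.linspace(-lim_old, lim_old, steps=kernel_size)
    old = torch.softmax(x_old.pow(2).neg(), dim=0)

    lim_new = (kernel_size - 1) / (2.0 * math.sqrt(2.0))
    x_new = torch.linspace(-lim_new, lim_new, steps=kernel_size)
    new = torch.softmax(x_new.div(sigma).pow(2).neg(), dim=0)

    assert torch.allclose(old, new)  # same kernel, up to float rounding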