From 9b3ca3673190d1fff6e55fe6142cb1128d45426a Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Mon, 24 Jun 2024 11:57:44 +0100
Subject: [PATCH] Deprecate functions for reading VCF

---
 docs/api.rst                  |  3 +++
 docs/vcf.rst                  |  3 +++
 sgkit/io/vcf/vcf_partition.py | 11 +++++++++
 sgkit/io/vcf/vcf_reader.py    | 45 +++++++++++++++++++++++++++++++++++
 sgkit/io/vcfzarr_reader.py    |  9 +++++++
 5 files changed, 71 insertions(+)

diff --git a/docs/api.rst b/docs/api.rst
index ce6c29cc3..ba0605482 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -35,6 +35,9 @@ PLINK
 VCF (reading)
 -------------
 
+.. deprecated:: 0.9.0
+   Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
 .. currentmodule:: sgkit.io.vcf
 .. autosummary::
    :toctree: generated/
diff --git a/docs/vcf.rst b/docs/vcf.rst
index 1773cd8e5..bbeeab10c 100644
--- a/docs/vcf.rst
+++ b/docs/vcf.rst
@@ -3,6 +3,9 @@
 Reading VCF
 ===========
 
+.. deprecated:: 0.9.0
+   Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
 .. contents:: Table of contents:
    :local:
 
diff --git a/sgkit/io/vcf/vcf_partition.py b/sgkit/io/vcf/vcf_partition.py
index cc979cb25..c64e5c088 100644
--- a/sgkit/io/vcf/vcf_partition.py
+++ b/sgkit/io/vcf/vcf_partition.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Dict, Optional, Sequence, Union
 
 import dask
@@ -78,6 +79,9 @@ def partition_into_regions(
     """
     Calculate genomic region strings to partition a compressed VCF or BCF file into roughly equal parts.
 
+    .. deprecated:: 0.9.0
+       Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
     A ``.tbi`` or ``.csi`` file is used to find BGZF boundaries in the compressed VCF file, which are then
     used to divide the file into parts.
 
@@ -118,6 +122,13 @@ def partition_into_regions(
     ValueError
         If either of ``num_parts`` or ``target_part_size`` is not a positive integer.
     """
+
+    warnings.warn(
+        "Functions for reading VCF are deprecated, please use the bio2zarr package.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
     if num_parts is None and target_part_size is None:
         raise ValueError("One of num_parts or target_part_size must be specified")
 
diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py
index e9dfc88e9..5de4f44f2 100644
--- a/sgkit/io/vcf/vcf_reader.py
+++ b/sgkit/io/vcf/vcf_reader.py
@@ -679,6 +679,9 @@ def vcf_to_zarrs(
 ) -> Sequence[str]:
     """Convert VCF files to multiple Zarr on-disk stores, one per region.
 
+    .. deprecated:: 0.9.0
+       Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
     Parameters
     ----------
     input
@@ -754,6 +757,12 @@ def vcf_to_zarrs(
     A list of URLs to the Zarr outputs.
     """
 
+    warnings.warn(
+        "Functions for reading VCF are deprecated, please use the bio2zarr package.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
     output_storage_options = output_storage_options or {}
 
     tasks = []
@@ -798,6 +807,9 @@ def concat_zarrs(
 ) -> None:
     """Concatenate multiple Zarr stores into a single Zarr store.
 
+    .. deprecated:: 0.9.0
+       Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
     The Zarr stores are concatenated and rechunked to produce a single combined store.
 
     Parameters
@@ -814,6 +826,12 @@ def concat_zarrs(
        the chunk length of the first input Zarr store is used.
     """
 
+    warnings.warn(
+        "Functions for reading VCF are deprecated, please use the bio2zarr package.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
     vars_to_rechunk = []
     vars_to_copy = []
     storage_options = storage_options or {}
@@ -856,6 +874,9 @@ def vcf_to_zarr(
 ) -> None:
     """Convert VCF files to a single Zarr on-disk store.
 
+    .. deprecated:: 0.9.0
+       Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
     By default, the conversion is carried out in parallel, by writing the output for each
     part to a separate, intermediate Zarr store in ``tempdir``. Then, in a second step
     the intermediate outputs are concatenated and rechunked into the final output Zarr
@@ -955,6 +976,12 @@ def vcf_to_zarr(
         so for large VCF files this can be slow.
     """
 
+    warnings.warn(
+        "Functions for reading VCF are deprecated, please use the bio2zarr package.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
     if temp_chunk_length is not None:
         if chunk_length % temp_chunk_length != 0:
             raise ValueError(
@@ -1039,6 +1066,9 @@ def read_vcf(
 ) -> xr.Dataset:
     """Read VCF dataset.
 
+    .. deprecated:: 0.9.0
+       Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
     A convenience for :func:`vcf_to_zarr` followed by :func:`sgkit.load_dataset`.
     Note that the output Zarr store in ``tempdir`` is not deleted after this function
     returns, so must be deleted manually by the user.
@@ -1119,6 +1149,12 @@ def read_vcf(
 
     """
 
+    warnings.warn(
+        "Functions for reading VCF are deprecated, please use the bio2zarr package.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
     # Need to retain zarr file backing the returned dataset
     with temporary_directory(
         prefix="read_vcf_",
@@ -1166,6 +1202,9 @@ def zarr_array_sizes(
 ) -> Dict[str, Any]:
     """Make a pass through a VCF/BCF file to determine sizes for storage in Zarr.
 
+    .. deprecated:: 0.9.0
+       Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
     By default, the input is processed in parts in parallel. However, if the input
     is a single file, ``target_part_size`` is None, and ``regions`` is None,
     then the operation will be carried out sequentially.
@@ -1188,6 +1227,12 @@ def zarr_array_sizes(
         are not None.
     """
 
+    warnings.warn(
+        "Functions for reading VCF are deprecated, please use the bio2zarr package.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
     return process_vcfs(
         input,
         zarr_array_sizes_sequential,
diff --git a/sgkit/io/vcfzarr_reader.py b/sgkit/io/vcfzarr_reader.py
index cff8353f8..66426ceee 100644
--- a/sgkit/io/vcfzarr_reader.py
+++ b/sgkit/io/vcfzarr_reader.py
@@ -49,6 +49,9 @@ def read_scikit_allel_vcfzarr(
 ) -> xr.Dataset:
     """Read a VCF Zarr file created using scikit-allel.
 
+    .. deprecated:: 0.9.0
+       Functions for reading VCF are deprecated; please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package instead.
+
     Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
     from a Zarr file created using scikit-allel's ``vcf_to_zarr`` function.
 
@@ -90,6 +93,12 @@ def read_scikit_allel_vcfzarr(
     - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy)
     """
 
+    warnings.warn(
+        "Functions for reading VCF are deprecated, please use the bio2zarr package.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
     vcfzarr = zarr.open_group(str(path), mode="r")
 
     # don't fix strings since it requires a pass over the whole dataset