From 9b3ca3673190d1fff6e55fe6142cb1128d45426a Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 24 Jun 2024 11:57:44 +0100 Subject: [PATCH] Deprecate functions for reading VCF --- docs/api.rst | 3 +++ docs/vcf.rst | 3 +++ sgkit/io/vcf/vcf_partition.py | 11 +++++++++ sgkit/io/vcf/vcf_reader.py | 45 +++++++++++++++++++++++++++++++++++ sgkit/io/vcfzarr_reader.py | 9 +++++++ 5 files changed, 71 insertions(+) diff --git a/docs/api.rst b/docs/api.rst index ce6c29cc3..ba0605482 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -35,6 +35,9 @@ PLINK VCF (reading) ------------- +.. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + .. currentmodule:: sgkit.io.vcf .. autosummary:: :toctree: generated/ diff --git a/docs/vcf.rst b/docs/vcf.rst index 1773cd8e5..bbeeab10c 100644 --- a/docs/vcf.rst +++ b/docs/vcf.rst @@ -3,6 +3,9 @@ Reading VCF =========== +.. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + .. contents:: Table of contents: :local: diff --git a/sgkit/io/vcf/vcf_partition.py b/sgkit/io/vcf/vcf_partition.py index cc979cb25..c64e5c088 100644 --- a/sgkit/io/vcf/vcf_partition.py +++ b/sgkit/io/vcf/vcf_partition.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, Optional, Sequence, Union import dask @@ -78,6 +79,9 @@ def partition_into_regions( """ Calculate genomic region strings to partition a compressed VCF or BCF file into roughly equal parts. + .. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + A ``.tbi`` or ``.csi`` file is used to find BGZF boundaries in the compressed VCF file, which are then used to divide the file into parts. @@ -118,6 +122,13 @@ def partition_into_regions( ValueError If either of ``num_parts`` or ``target_part_size`` is not a positive integer. 
""" + + warnings.warn( + "Functions for reading VCF are deprecated, please use the bio2zarr package.", + DeprecationWarning, + stacklevel=2, + ) + if num_parts is None and target_part_size is None: raise ValueError("One of num_parts or target_part_size must be specified") diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py index e9dfc88e9..5de4f44f2 100644 --- a/sgkit/io/vcf/vcf_reader.py +++ b/sgkit/io/vcf/vcf_reader.py @@ -679,6 +679,9 @@ def vcf_to_zarrs( ) -> Sequence[str]: """Convert VCF files to multiple Zarr on-disk stores, one per region. + .. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + Parameters ---------- input @@ -754,6 +757,12 @@ def vcf_to_zarrs( A list of URLs to the Zarr outputs. """ + warnings.warn( + "Functions for reading VCF are deprecated, please use the bio2zarr package.", + DeprecationWarning, + stacklevel=2, + ) + output_storage_options = output_storage_options or {} tasks = [] @@ -798,6 +807,9 @@ def concat_zarrs( ) -> None: """Concatenate multiple Zarr stores into a single Zarr store. + .. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + The Zarr stores are concatenated and rechunked to produce a single combined store. Parameters @@ -814,6 +826,12 @@ def concat_zarrs( the chunk length of the first input Zarr store is used. """ + warnings.warn( + "Functions for reading VCF are deprecated, please use the bio2zarr package.", + DeprecationWarning, + stacklevel=2, + ) + vars_to_rechunk = [] vars_to_copy = [] storage_options = storage_options or {} @@ -856,6 +874,9 @@ def vcf_to_zarr( ) -> None: """Convert VCF files to a single Zarr on-disk store. + .. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + By default, the conversion is carried out in parallel, by writing the output for each part to a separate, intermediate Zarr store in ``tempdir``. 
Then, in a second step the intermediate outputs are concatenated and rechunked into the final output Zarr @@ -955,6 +976,12 @@ def vcf_to_zarr( so for large VCF files this can be slow. """ + warnings.warn( + "Functions for reading VCF are deprecated, please use the bio2zarr package.", + DeprecationWarning, + stacklevel=2, + ) + if temp_chunk_length is not None: if chunk_length % temp_chunk_length != 0: raise ValueError( @@ -1039,6 +1066,9 @@ def read_vcf( ) -> xr.Dataset: """Read VCF dataset. + .. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + A convenience for :func:`vcf_to_zarr` followed by :func:`sgkit.load_dataset`. Note that the output Zarr store in ``tempdir`` is not deleted after this function returns, so must be deleted manually by the user. @@ -1119,6 +1149,12 @@ def read_vcf( """ + warnings.warn( + "Functions for reading VCF are deprecated, please use the bio2zarr package.", + DeprecationWarning, + stacklevel=2, + ) + # Need to retain zarr file backing the returned dataset with temporary_directory( prefix="read_vcf_", @@ -1166,6 +1202,9 @@ def zarr_array_sizes( ) -> Dict[str, Any]: """Make a pass through a VCF/BCF file to determine sizes for storage in Zarr. + .. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + By default, the input is processed in parts in parallel. However, if the input is a single file, ``target_part_size`` is None, and ``regions`` is None, then the operation will be carried out sequentially. @@ -1188,6 +1227,12 @@ def zarr_array_sizes( are not None. 
""" + warnings.warn( + "Functions for reading VCF are deprecated, please use the bio2zarr package.", + DeprecationWarning, + stacklevel=2, + ) + return process_vcfs( input, zarr_array_sizes_sequential, diff --git a/sgkit/io/vcfzarr_reader.py b/sgkit/io/vcfzarr_reader.py index cff8353f8..66426ceee 100644 --- a/sgkit/io/vcfzarr_reader.py +++ b/sgkit/io/vcfzarr_reader.py @@ -49,6 +49,9 @@ def read_scikit_allel_vcfzarr( ) -> xr.Dataset: """Read a VCF Zarr file created using scikit-allel. + .. deprecated:: 0.9.0 + Functions for reading VCF are deprecated, please use the `bio2zarr <https://github.com/sgkit-dev/bio2zarr>`_ package. + Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset from a Zarr file created using scikit-allel's ``vcf_to_zarr`` function. @@ -90,6 +93,12 @@ def read_scikit_allel_vcfzarr( - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy) """ + warnings.warn( + "Functions for reading VCF are deprecated, please use the bio2zarr package.", + DeprecationWarning, + stacklevel=2, + ) + vcfzarr = zarr.open_group(str(path), mode="r") # don't fix strings since it requires a pass over the whole dataset