From a24f81da8087e545891bebc9345e65a8dfb1024d Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 10 Sep 2024 16:02:37 +0200 Subject: [PATCH] Revert "[python] Follow spec-0 (#1189)" This reverts commit fc7aefe743cce95eb406b7e33b592018d05418b8. --- .github/workflows/py-dependency-check.yml | 4 +- .github/workflows/py-unittests.yml | 4 +- api/python/cellxgene_census/pyproject.toml | 16 ++--- .../src/cellxgene_census/_get_anndata.py | 49 +++++++------ .../src/cellxgene_census/_open.py | 18 ++--- .../cellxgene_census/_release_directory.py | 37 +++++----- .../experimental/_embedding.py | 8 +-- .../experimental/_embedding_search.py | 17 +++-- .../experimental/ml/encoders.py | 9 +-- .../ml/huggingface/cell_dataset_builder.py | 13 ++-- .../ml/huggingface/geneformer_tokenizer.py | 21 +++--- .../experimental/ml/pytorch.py | 71 +++++++++---------- .../experimental/pp/_highly_variable_genes.py | 4 +- .../experimental/pp/_online.py | 12 ++-- .../experimental/pp/_stats.py | 3 +- .../experimental/util/_eager_iter.py | 11 ++- .../tests/experimental/ml/test_pytorch.py | 40 ++++++----- .../tests/experimental/pp/test_stats.py | 8 +-- .../experimental/test_embeddings_search.py | 8 +-- .../cellxgene_census/tests/test_acceptance.py | 25 ++++--- .../tests/test_get_anndata.py | 20 +++--- .../cellxgene_census/tests/test_lts_compat.py | 14 ++-- .../cellxgene_census/tests/test_open.py | 4 +- .../cellxgene_census/tests/test_user_agent.py | 3 +- api/python/notebooks/README.md | 2 +- docs/cellxgene_census_docsite_installation.md | 2 +- 26 files changed, 213 insertions(+), 210 deletions(-) diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml index 010409042..3afc6a06a 100644 --- a/.github/workflows/py-dependency-check.yml +++ b/.github/workflows/py-dependency-check.yml @@ -25,10 +25,10 @@ jobs: fail-fast: false # don't fail-fast, as errors are often specific to a single cell in the matrix matrix: os: [sc-dev-64g-runner, macos-latest] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11"] exclude: - os: macos-latest - python-version: "3.12" + python-version: "3.8" runs-on: ${{matrix.os}} diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml index e5ba6f0eb..f224f4560 100644 --- a/.github/workflows/py-unittests.yml +++ b/.github/workflows/py-unittests.yml @@ -21,10 +21,10 @@ jobs: fail-fast: false # Don't stop the workflow if one of the jobs fails matrix: os: [sc-dev-64g-runner, macos-latest] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11"] exclude: - os: macos-latest - python-version: "3.12" + python-version: "3.8" runs-on: ${{matrix.os}} diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index e9aa1979d..f050bef75 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] license = { text = "MIT" } readme = "README.md" -requires-python = ">= 3.10, < 3.13" +requires-python = ">= 3.8, < 3.12" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -22,17 +22,18 @@ classifiers = [ "Topic :: Scientific/Engineering :: Bio-Informatics", "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", ] dependencies= [ # NOTE: the tiledbsoma version must be >= to the version used in the Census builder, to # ensure that the assets are readable (tiledbsoma supports backward compatible reading). # Make sure this version does not fall behind the builder's tiledbsoma version. - "tiledbsoma>=1.12.3", + "tiledbsoma~=1.12.3", "anndata", - "numpy>=1.23,<2.0", + "numpy>=1.21,<2.0", "requests", "typing_extensions", "s3fs>=2021.06.1", @@ -42,8 +43,9 @@ dependencies= [ experimental = [ "torch", "torchdata~=0.7", - "scikit-learn>=1.2", + "scikit-learn~=1.0", "scikit-misc>=0.2,<0.4", # scikit-misc 0.3 dropped Python 3.8 support, and 0.4 doesn't have MacOS/ARM wheels + "psutil~=5.0", "datasets~=2.0", "tdigest~=0.5", # choose newest version of tiledb-vector-search that doesn't need a newer version of tiledb @@ -79,7 +81,7 @@ root = "../../.." [tool.ruff] line-length = 120 src = ["api/python/cellxgene_census/src"] -target-version = "py310" +target-version = "py38" [tool.ruff.lint] select = [ @@ -127,8 +129,6 @@ ignore = [ "D205", # Prefer absolute imports over relative imports from parent modules TODO: enable "TID252", - # It's okay to use zip without the strict kwarg. In fact, numba doesn't like it when you use it - "B905", ] [tool.ruff.lint.pydocstyle] diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 9d7a5c41b..e37337184 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -7,8 +7,7 @@ Methods to retrieve slices of the census as AnnData objects. """ -from collections.abc import Sequence -from typing import Literal +from typing import Literal, Optional, Sequence from warnings import warn import anndata @@ -28,20 +27,20 @@ def get_anndata( organism: str, measurement_name: str = "RNA", X_name: str = "raw", - X_layers: Sequence[str] | None = (), - obsm_layers: Sequence[str] | None = (), - obsp_layers: Sequence[str] | None = (), - varm_layers: Sequence[str] | None = (), - varp_layers: Sequence[str] | None = (), - obs_value_filter: str | None = None, - obs_coords: SparseDFCoord | None = None, - var_value_filter: str | None = None, - var_coords: SparseDFCoord | None = None, - column_names: soma.AxisColumnNames | None = None, - obs_embeddings: Sequence[str] | None = (), - var_embeddings: Sequence[str] | None = (), - obs_column_names: Sequence[str] | None = None, - var_column_names: Sequence[str] | None = None, + X_layers: Optional[Sequence[str]] = (), + obsm_layers: Optional[Sequence[str]] = (), + obsp_layers: Optional[Sequence[str]] = (), + varm_layers: Optional[Sequence[str]] = (), + varp_layers: Optional[Sequence[str]] = (), + obs_value_filter: Optional[str] = None, + obs_coords: Optional[SparseDFCoord] = None, + var_value_filter: Optional[str] = None, + var_coords: Optional[SparseDFCoord] = None, + column_names: Optional[soma.AxisColumnNames] = None, + obs_embeddings: Optional[Sequence[str]] = (), + var_embeddings: Optional[Sequence[str]] = (), + obs_column_names: Optional[Sequence[str]] = None, + var_column_names: Optional[Sequence[str]] = None, ) -> anndata.AnnData: """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, and return it as an :class:`anndata.AnnData` object. @@ -177,9 +176,9 @@ def _get_axis_metadata( axis: Literal["obs", "var"], organism: str, *, - value_filter: str | None = None, - coords: SparseDFCoord | None = slice(None), - column_names: Sequence[str] | None = None, + value_filter: Optional[str] = None, + coords: Optional[SparseDFCoord] = slice(None), + column_names: Optional[Sequence[str]] = None, ) -> pd.DataFrame: exp = _get_experiment(census, organism) coords = (slice(None),) if coords is None else (coords,) @@ -199,9 +198,9 @@ def get_obs( census: soma.Collection, organism: str, *, - value_filter: str | None = None, - coords: SparseDFCoord | None = slice(None), - column_names: Sequence[str] | None = None, + value_filter: Optional[str] = None, + coords: Optional[SparseDFCoord] = slice(None), + column_names: Optional[Sequence[str]] = None, ) -> pd.DataFrame: """Get the observation metadata for a query on the census. @@ -231,9 +230,9 @@ def get_var( census: soma.Collection, organism: str, *, - value_filter: str | None = None, - coords: SparseDFCoord | None = slice(None), - column_names: Sequence[str] | None = None, + value_filter: Optional[str] = None, + coords: Optional[SparseDFCoord] = slice(None), + column_names: Optional[Sequence[str]] = None, ) -> pd.DataFrame: """Get the variable metadata for a query on the census. diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py index 5af0fea0b..642e6fbb6 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_open.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_open.py @@ -10,7 +10,7 @@ import logging import os.path import urllib.parse -from typing import Any, get_args +from typing import Any, Dict, Optional, get_args import s3fs import tiledbsoma as soma @@ -32,7 +32,7 @@ "anon": True, "cache_regions": True, } -DEFAULT_TILEDB_CONFIGURATION: dict[str, Any] = { +DEFAULT_TILEDB_CONFIGURATION: Dict[str, Any] = { # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters "py.init_buffer_bytes": 1 * 1024**3, "soma.init_buffer_bytes": 1 * 1024**3, @@ -71,7 +71,7 @@ def _resolve_census_locator(locator: CensusLocator, mirror: CensusMirror) -> Res def _open_soma( locator: ResolvedCensusLocator, - context: soma.options.SOMATileDBContext | None = None, + context: Optional[soma.options.SOMATileDBContext] = None, ) -> soma.Collection: """Private. Merge config defaults and return open census as a soma Collection/context.""" # if no user-defined context, cellxgene_census defaults take precedence over SOMA defaults @@ -85,7 +85,7 @@ def _open_soma( return soma.open(locator["uri"], mode="r", soma_type=soma.Collection, context=context) -def get_default_soma_context(tiledb_config: dict[str, Any] | None = None) -> soma.options.SOMATileDBContext: +def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> soma.options.SOMATileDBContext: """Return a :class:`tiledbsoma.SOMATileDBContext` with sensible defaults that can be further customized by the user. The customized context can then be passed to :func:`cellxgene_census.open_soma` with the ``context`` argument or to :meth:`somacore.SOMAObject.open` with the ``context`` argument, such as @@ -132,11 +132,11 @@ def get_default_soma_context(tiledb_config: dict[str, Any] | None = None) -> som def open_soma( *, - census_version: str | None = DEFAULT_CENSUS_VERSION, - mirror: str | None = None, - uri: str | None = None, - tiledb_config: dict[str, Any] | None = None, - context: soma.options.SOMATileDBContext | None = None, + census_version: Optional[str] = DEFAULT_CENSUS_VERSION, + mirror: Optional[str] = None, + uri: Optional[str] = None, + tiledb_config: Optional[Dict[str, Any]] = None, + context: Optional[soma.options.SOMATileDBContext] = None, ) -> soma.Collection: """Open the Census by version or URI. diff --git a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py index 177094e90..644a9df5b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py @@ -7,8 +7,9 @@ Methods to retrieve information about versions of the publicly hosted Census object. """ +import typing from collections import OrderedDict -from typing import Any, Literal, cast +from typing import Any, Dict, Literal, Optional, Union, cast import requests from typing_extensions import NotRequired, TypedDict @@ -36,7 +37,7 @@ class CensusLocator(TypedDict): uri: str relative_uri: str - s3_region: str | None + s3_region: Optional[str] class CensusVersionRetraction(TypedDict): @@ -54,13 +55,13 @@ class CensusVersionRetraction(TypedDict): """ date: str - reason: str | None - info_url: str | None - replaced_by: str | None + reason: Optional[str] + info_url: Optional[str] + replaced_by: Optional[str] ReleaseFlag = Literal["lts", "retracted"] -ReleaseFlags = dict[ReleaseFlag, bool] +ReleaseFlags = Dict[ReleaseFlag, bool] class CensusVersionDescription(TypedDict): @@ -81,7 +82,7 @@ class CensusVersionDescription(TypedDict): If retracted, details of the retraction. """ - release_date: str | None + release_date: Optional[str] release_build: str soma: CensusLocator h5ads: CensusLocator @@ -89,7 +90,7 @@ class CensusVersionDescription(TypedDict): retraction: NotRequired[CensusVersionRetraction] -CensusDirectory = dict[CensusVersionName, CensusVersionName | CensusVersionDescription] +CensusDirectory = Dict[CensusVersionName, Union[CensusVersionName, CensusVersionDescription]] """ A provider identifies a storage medium for the Census, which can either be a cloud provider or a local file. @@ -131,11 +132,11 @@ class CensusMirror(TypedDict): provider: Provider base_uri: str - region: str | None + region: Optional[str] embeddings_base_uri: str -CensusMirrors = dict[CensusMirrorName, CensusMirrorName | CensusMirror] +CensusMirrors = Dict[CensusMirrorName, Union[CensusMirrorName, CensusMirror]] class ResolvedCensusLocator(TypedDict): @@ -154,7 +155,7 @@ class ResolvedCensusLocator(TypedDict): """ uri: str - region: str | None + region: Optional[str] provider: str @@ -199,8 +200,8 @@ def get_census_version_description(census_version: str) -> CensusVersionDescript def get_census_version_directory( - *, lts: bool | None = None, retracted: bool | None = False -) -> dict[CensusVersionName, CensusVersionDescription]: + *, lts: Optional[bool] = None, retracted: Optional[bool] = False +) -> Dict[CensusVersionName, CensusVersionDescription]: """Get the directory of Census versions currently available, optionally filtering by specified flags. If a filtering flag is not specified, Census versions will not be filtered by that flag. Defaults to including both "long-term stable" (LTS) and weekly Census versions, and excluding @@ -355,9 +356,9 @@ def get_census_version_directory( response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL, headers={"User-Agent": _user_agent()}) response.raise_for_status() - directory: dict[str, str | dict[str, Any]] = response.json() + directory: dict[str, Union[str, dict[str, Any]]] = response.json() directory_out: CensusDirectory = {} - aliases: set[CensusVersionName] = set() + aliases: typing.Set[CensusVersionName] = set() # Resolve all aliases for easier use for census_version_name in list(directory.keys()): @@ -400,7 +401,7 @@ def get_census_version_directory( directory_out[census_version_name] = census_version_description.copy() # Cast is safe, as we have removed all aliases - unordered_directory = cast(dict[CensusVersionName, CensusVersionDescription], directory_out) + unordered_directory = cast(Dict[CensusVersionName, CensusVersionDescription], directory_out) # Sort by aliases and release date, descending aliased_releases = [(k, v) for k, v in unordered_directory.items() if k in aliases] @@ -416,7 +417,7 @@ def get_census_version_directory( return ordered_directory -def get_census_mirror_directory() -> dict[CensusMirrorName, CensusMirror]: +def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]: """Get the directory of Census mirrors currently available. Returns: @@ -428,7 +429,7 @@ def get_census_mirror_directory() -> dict[CensusMirrorName, CensusMirror]: """ mirrors = _get_census_mirrors() del mirrors["default"] - return cast(dict[CensusMirrorName, CensusMirror], mirrors) + return cast(Dict[CensusMirrorName, CensusMirror], mirrors) def _get_census_mirrors() -> CensusMirrors: diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 34d93ef42..4baba8e06 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -8,7 +8,7 @@ import json import warnings -from typing import Any, cast +from typing import Any, Dict, cast import numpy as np import numpy.typing as npt @@ -55,7 +55,7 @@ def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBC embedding_metadata = json.loads(E.metadata["CxG_embedding_info"]) assert isinstance(embedding_metadata, dict) - return cast(dict[str, Any], embedding_metadata) + return cast(Dict[str, Any], embedding_metadata) def _get_embedding( @@ -67,7 +67,7 @@ def _get_embedding( context: soma.options.SOMATileDBContext | None = None, ) -> npt.NDArray[np.float32]: """Private. Like get_embedding, but accepts a Census object and a Census directory.""" - if isinstance(obs_soma_joinids, pa.Array | pa.ChunkedArray | pd.Series): + if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)): obs_soma_joinids = obs_soma_joinids.to_numpy() assert isinstance(obs_soma_joinids, np.ndarray) if obs_soma_joinids.dtype != np.int64: @@ -194,7 +194,7 @@ def get_embedding_metadata_by_name( response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, headers={"User-Agent": _user_agent()}) response.raise_for_status() - manifest = cast(dict[str, dict[str, Any]], response.json()) + manifest = cast(Dict[str, Dict[str, Any]], response.json()) embeddings = [] for _, obj in manifest.items(): if ( diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py index de09e2060..179fa1d6d 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding_search.py @@ -1,8 +1,7 @@ """Nearest-neighbor search based on vector index of Census embeddings.""" -from collections.abc import Sequence from contextlib import ExitStack -from typing import Any, NamedTuple, cast +from typing import Any, Dict, NamedTuple, Optional, Sequence, Tuple, cast import anndata as ad import numpy as np @@ -44,9 +43,9 @@ def find_nearest_obs( k: int = 10, nprobe: int = 100, memory_GiB: int = 4, - mirror: str | None = None, - embedding_metadata: dict[str, Any] | None = None, - **kwargs: dict[str, Any], + mirror: Optional[str] = None, + embedding_metadata: Optional[Dict[str, Any]] = None, + **kwargs: Dict[str, Any], ) -> NeighborObs: """Search Census for similar obs (cells) based on nearest neighbors in embedding space. @@ -98,9 +97,9 @@ def find_nearest_obs( def _resolve_embedding_index( - embedding_metadata: dict[str, Any], - mirror: str | None = None, -) -> tuple[str, str] | None: + embedding_metadata: Dict[str, Any], + mirror: Optional[str] = None, +) -> Optional[Tuple[str, str]]: index_metadata = embedding_metadata.get("indexes", None) if not index_metadata: return None @@ -118,7 +117,7 @@ def predict_obs_metadata( census_version: str, neighbors: NeighborObs, column_names: Sequence[str], - experiment: soma.Experiment | None = None, + experiment: Optional[soma.Experiment] = None, ) -> pd.DataFrame: """Predict obs metadata attributes for the query cells based on the embedding nearest neighbors. diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py index 0be576ef6..3d4fc4dc5 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/encoders.py @@ -1,5 +1,6 @@ import abc import functools +from typing import List import numpy.typing as npt import pandas as pd @@ -46,7 +47,7 @@ def name(self) -> str: @property @abc.abstractmethod - def columns(self) -> list[str]: + def columns(self) -> List[str]: """Columns in ``obs`` that the encoder will be applied to.""" pass @@ -76,7 +77,7 @@ def name(self) -> str: return self.col @property - def columns(self) -> list[str]: + def columns(self) -> List[str]: """Columns in ``obs`` that the encoder will be applied to.""" return [self.col] @@ -89,7 +90,7 @@ def classes_(self): # type: ignore class BatchEncoder(Encoder): """An encoder that concatenates and encodes several ``obs`` columns.""" - def __init__(self, cols: list[str], name: str = "batch"): + def __init__(self, cols: List[str], name: str = "batch"): self.cols = cols from sklearn.preprocessing import LabelEncoder @@ -114,7 +115,7 @@ def fit(self, obs: pd.DataFrame) -> None: self._encoder.fit(arr.unique()) @property - def columns(self) -> list[str]: + def columns(self) -> List[str]: """Columns in ``obs`` that the encoder will be applied to.""" return self.cols diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py index 5a9c2d626..07d2212c8 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py @@ -1,7 +1,6 @@ import uuid from abc import ABC, abstractmethod -from collections.abc import Generator -from typing import Any +from typing import Any, Dict, Generator, Optional import scipy.sparse from datasets import Dataset @@ -38,7 +37,7 @@ def __init__( measurement_name: str = "RNA", layer_name: str = "raw", *, - block_size: int | None = None, + block_size: Optional[int] = None, **kwargs: Any, ): """Initialize the CellDatasetBuilder to process the results of a Census @@ -56,13 +55,13 @@ def __init__( self.layer_name = layer_name self.block_size = block_size - def build(self, from_generator_kwargs: dict[str, Any] | None = None) -> Dataset: + def build(self, from_generator_kwargs: Optional[Dict[str, Any]] = None) -> Dataset: """Build the dataset from query results. - `from_generator_kwargs`: kwargs passed through to `Dataset.from_generator()` """ - def gen() -> Generator[dict[str, Any], None, None]: + def gen() -> Generator[Dict[str, Any], None, None]: for Xblock, (block_cell_joinids, _) in ( self.X(self.layer_name).blockwise(axis=0, reindex_disable_on_axis=[1], size=self.block_size).scipy() ): @@ -74,7 +73,7 @@ def gen() -> Generator[dict[str, Any], None, None]: return Dataset.from_generator(_DatasetGeneratorPickleHack(gen), **(from_generator_kwargs or {})) @abstractmethod - def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> dict[str, Any]: + def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: """Abstract method to process the X row for one cell into a Dataset item. - `cell_joinid`: The cell `soma_joinid`. @@ -87,7 +86,7 @@ def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> dict[str class _DatasetGeneratorPickleHack: """SEE: https://github.com/huggingface/datasets/issues/6194.""" - def __init__(self, generator: Any, generator_id: str | None = None) -> None: + def __init__(self, generator: Any, generator_id: Optional[str] = None) -> None: self.generator = generator self.generator_id = generator_id if generator_id is not None else str(uuid.uuid4()) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py index 7303d3bbf..48ea8fdea 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py @@ -1,6 +1,5 @@ import pickle -from collections.abc import Sequence -from typing import Any +from typing import Any, Dict, List, Optional, Sequence, Set import numpy as np import numpy.typing as npt @@ -43,7 +42,7 @@ class GeneformerTokenizer(CellDatasetBuilder): - and the specified `obs_column_names` (cell metadata from the experiment obs dataframe) """ - obs_column_names: set[str] + obs_column_names: Set[str] max_input_tokens: int special_token: bool @@ -55,15 +54,15 @@ class GeneformerTokenizer(CellDatasetBuilder): model_gene_map: scipy.sparse.coo_matrix model_gene_tokens: npt.NDArray[np.int64] # Geneformer token for each column of model_gene_map model_gene_medians: npt.NDArray[np.float64] # float for each column of model_gene_map - model_cls_token: np.int64 | None = None - model_eos_token: np.int64 | None = None + model_cls_token: Optional[np.int64] = None + model_eos_token: Optional[np.int64] = None def __init__( self, experiment: tiledbsoma.Experiment, *, - obs_column_names: Sequence[str] | None = None, - obs_attributes: Sequence[str] | None = None, + obs_column_names: Optional[Sequence[str]] = None, + obs_attributes: Optional[Sequence[str]] = None, max_input_tokens: int = 2048, special_token: bool = False, token_dictionary_file: str = "", @@ -148,10 +147,10 @@ def _load_geneformer_data( map_data = [] map_i = [] map_j = [] - model_gene_id_by_ensg: dict[str, int] = {} + model_gene_id_by_ensg: Dict[str, int] = {} model_gene_count = 0 - model_gene_tokens: list[np.int64] = [] - model_gene_medians: list[np.float64] = [] + model_gene_tokens: List[np.int64] = [] + model_gene_medians: List[np.float64] = [] for gene_id, row in genes_df.iterrows(): ensg = row["feature_id"] # ENSG... gene id, which keys Geneformer's dicts if gene_mapping is not None: @@ -199,7 +198,7 @@ def __enter__(self) -> "GeneformerTokenizer": self.obs_df = self.obs(column_names=obs_column_names).concat().to_pandas().set_index("soma_joinid") return self - def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> dict[str, Any]: + def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: """Given the expression vector for one cell, compute the Dataset item providing the Geneformer inputs (token sequence and metadata). """ diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py index 2e12f68b6..5bef673c9 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py @@ -3,12 +3,11 @@ import logging import os import typing -from collections.abc import Iterator, Sequence from contextlib import contextmanager from datetime import timedelta from math import ceil from time import time -from typing import Any, TypeAlias +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union import numpy as np import numpy.typing as npt @@ -33,13 +32,13 @@ pytorch_logger = logging.getLogger("cellxgene_census.experimental.pytorch") # TODO: Rename to reflect the correct order of the Tensors within the tuple: (X, obs) -ObsAndXDatum = tuple[Tensor, Tensor] +ObsAndXDatum = Tuple[Tensor, Tensor] """Return type of ``ExperimentDataPipe`` that pairs a Tensor of ``obs`` row(s) with a Tensor of ``X`` matrix row(s). The Tensors are rank 1 if ``batch_size`` is 1, otherwise the Tensors are rank 2.""" # "Chunk" of X data, returned by each `Method` above -ChunkX: TypeAlias = npt.NDArray[Any] | sparse.csr_matrix +ChunkX = Union[npt.NDArray[Any], sparse.csr_matrix] @define @@ -59,7 +58,7 @@ def __len__(self) -> int: return len(self.obs) -Encoders = dict[str, Encoder] +Encoders = Dict[str, Encoder] """A dictionary of ``Encoder``s keyed by the ``obs`` column name.""" @@ -98,7 +97,7 @@ def __add__(self, other: "Stats") -> "Stats": @contextmanager def _open_experiment( uri: str, - aws_region: str | None = None, + aws_region: Optional[str] = None, ) -> soma.Experiment: """Internal method for opening a SOMA ``Experiment`` as a context manager.""" context = get_default_soma_context().replace(tiledb_config={"vfs.s3.region": aws_region} if aws_region else {}) @@ -108,8 +107,8 @@ def _open_experiment( def _tables_to_np( - tables: Iterator[tuple[Table, Any]], shape: tuple[int, int] -) -> typing.Generator[tuple[npt.NDArray[Any], Any, int], None, None]: + tables: Iterator[Tuple[Table, Any]], shape: Tuple[int, int] +) -> typing.Generator[Tuple[npt.NDArray[Any], Any, int], None, None]: for tbl, indices in tables: row_indices, col_indices, data = (x.to_numpy() for x in tbl.columns) nnz = len(data) @@ -136,10 +135,10 @@ def __init__( obs: soma.DataFrame, X: soma.SparseNDArray, obs_column_names: Sequence[str], - obs_joinids_chunked: list[npt.NDArray[np.int64]], + obs_joinids_chunked: List[npt.NDArray[np.int64]], var_joinids: npt.NDArray[np.int64], - shuffle_chunk_count: int | None = None, - shuffle_rng: Generator | None = None, + shuffle_chunk_count: Optional[int] = None, + shuffle_rng: Optional[Generator] = None, return_sparse_X: bool = False, ): self.obs = obs @@ -222,7 +221,7 @@ def __next__(self) -> _SOMAChunk: return _SOMAChunk(obs=obs_batch, X=X_batch, stats=stats) -def list_split(arr_list: list[Any], sublist_len: int) -> list[list[Any]]: +def list_split(arr_list: List[Any], sublist_len: int) -> List[List[Any]]: """Splits a python list into a list of sublists where each sublist is of size `sublist_len`. TODO: Replace with `itertools.batched` when Python 3.12 becomes the minimum supported version. """ @@ -239,7 +238,7 @@ def list_split(arr_list: list[Any], sublist_len: int) -> list[list[Any]]: return result -def run_gc() -> tuple[tuple[Any, Any, Any], tuple[Any, Any, Any], float]: # noqa: D103 +def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any], float]: # noqa: D103 proc = psutil.Process(os.getpid()) pre_gc = proc.memory_full_info(), psutil.virtual_memory(), psutil.swap_memory() @@ -267,7 +266,7 @@ class _ObsAndXIterator(Iterator[ObsAndXDatum]): soma_chunk_iter: Iterator[_SOMAChunk] """The iterator for SOMA chunks of paired obs and X data""" - soma_chunk: _SOMAChunk | None + soma_chunk: Optional[_SOMAChunk] """The current SOMA chunk of obs and X data""" i: int = -1 @@ -278,15 +277,15 @@ def __init__( obs: soma.DataFrame, X: soma.SparseNDArray, obs_column_names: Sequence[str], - obs_joinids_chunked: list[npt.NDArray[np.int64]], + obs_joinids_chunked: List[npt.NDArray[np.int64]], var_joinids: npt.NDArray[np.int64], batch_size: int, - encoders: list[Encoder], + encoders: List[Encoder], stats: Stats, return_sparse_X: bool, use_eager_fetch: bool, - shuffle_chunk_count: int | None = None, - shuffle_rng: Generator | None = None, + shuffle_chunk_count: Optional[int] = None, + shuffle_rng: Optional[Generator] = None, ) -> None: self.soma_chunk_iter = _ObsAndXSOMAIterator( obs, @@ -363,7 +362,7 @@ def __next__(self) -> ObsAndXDatum: return X_tensor, obs_tensor - def _read_partial_torch_batch(self, batch_size: int) -> tuple[pd.DataFrame, ChunkX]: + def _read_partial_torch_batch(self, batch_size: int) -> Tuple[pd.DataFrame, ChunkX]: """Reads a torch-size batch of data from the current SOMA chunk, returning a torch-size batch whose size may contain fewer rows than the requested ``batch_size``. This can happen when the remaining rows in the current SOMA chunk are fewer than the requested ``batch_size``. @@ -444,15 +443,15 @@ class ExperimentDataPipe(pipes.IterDataPipe[Dataset[ObsAndXDatum]]): # type: ig _initialized: bool - _obs_joinids: npt.NDArray[np.int64] | None + _obs_joinids: Optional[npt.NDArray[np.int64]] - _var_joinids: npt.NDArray[np.int64] | None + _var_joinids: Optional[npt.NDArray[np.int64]] - _encoders: list[Encoder] + _encoders: List[Encoder] _stats: Stats - _shuffle_rng: Generator | None + _shuffle_rng: Optional[Generator] # TODO: Consider adding another convenience method wrapper to construct this object whose signature is more closely # aligned with get_anndata() params (i.e. "exploded" AxisQuery params). @@ -461,17 +460,17 @@ def __init__( experiment: soma.Experiment, measurement_name: str = "RNA", X_name: str = "raw", - obs_query: soma.AxisQuery | None = None, - var_query: soma.AxisQuery | None = None, + obs_query: Optional[soma.AxisQuery] = None, + var_query: Optional[soma.AxisQuery] = None, obs_column_names: Sequence[str] = (), batch_size: int = 1, shuffle: bool = True, - seed: int | None = None, + seed: Optional[int] = None, return_sparse_X: bool = False, - soma_chunk_size: int | None = 64, + soma_chunk_size: Optional[int] = 64, use_eager_fetch: bool = True, - shuffle_chunk_count: int | None = 2000, - encoders: list[Encoder] | None = None, + encoders: Optional[List[Encoder]] = None, + shuffle_chunk_count: Optional[int] = 2000, ) -> None: r"""Construct a new ``ExperimentDataPipe``. @@ -597,10 +596,10 @@ def _init(self) -> None: @staticmethod def _subset_ids_to_partition( - ids_chunked: list[npt.NDArray[np.int64]], + ids_chunked: List[npt.NDArray[np.int64]], partition_index: int, num_partitions: int, - ) -> list[npt.NDArray[np.int64]]: + ) -> List[npt.NDArray[np.int64]]: """Returns a single partition of the obs_joinids_chunked (a 2D ndarray), based upon the current process's distributed rank and world size. """ @@ -623,7 +622,7 @@ def _compute_partitions( loader_partitions: int, dist_partition: int, num_dist_partitions: int, - ) -> tuple[int, int]: + ) -> Tuple[int, int]: # NOTE: Can alternately use a `worker_init_fn` to split among workers split workload total_partitions = num_dist_partitions * loader_partitions partition = dist_partition * loader_partitions + loader_partition @@ -666,7 +665,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]: dist_partition=dist.get_rank() if dist.is_initialized() else 0, num_dist_partitions=dist.get_world_size() if dist.is_initialized() else 1, ) - obs_joinids_chunked_partition: list[npt.NDArray[np.int64]] = self._subset_ids_to_partition( + obs_joinids_chunked_partition: List[npt.NDArray[np.int64]] = self._subset_ids_to_partition( obs_joinids_chunked, partition, partitions ) @@ -694,7 +693,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]: ) @staticmethod - def _chunk_ids(ids: npt.NDArray[np.int64], chunk_size: int) -> list[npt.NDArray[np.int64]]: + def _chunk_ids(ids: npt.NDArray[np.int64], chunk_size: int) -> List[npt.NDArray[np.int64]]: num_chunks = max(1, ceil(len(ids) / chunk_size)) pytorch_logger.debug(f"Shuffling {len(ids)} obs joinids into {num_chunks} chunks of {chunk_size}") return np.array_split(ids, num_chunks) @@ -709,7 +708,7 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> ObsAndXDatum: raise NotImplementedError("IterDataPipe can only be iterated") - def _build_obs_encoders(self, query: soma.ExperimentAxisQuery) -> list[Encoder]: + def _build_obs_encoders(self, query: soma.ExperimentAxisQuery) -> List[Encoder]: pytorch_logger.debug("Initializing encoders") encoders = [] @@ -749,7 +748,7 @@ def stats(self) -> Stats: return self._stats @property - def shape(self) -> tuple[int, int]: + def shape(self) -> Tuple[int, int]: """Get the shape of the data that will be returned by this :class:`cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe`. This is the number of obs (cell) and var (feature) counts in the returned data. If used in multiprocessing mode (i.e. :class:`torch.utils.data.DataLoader` instantiated with num_workers > 0), the obs (cell) count will reflect diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py index 188513c65..c47ad9f1e 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py @@ -1,14 +1,14 @@ from __future__ import annotations import os -from collections.abc import Callable, Sequence from concurrent import futures -from typing import Any, Literal, cast +from typing import Any, Callable, Sequence, cast import numpy as np import pandas as pd import tiledbsoma as soma from somacore.options import SparseDFCoord +from typing_extensions import Literal from ..._experiment import _get_experiment from ..util._eager_iter import _EagerIterator diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py index 13bbe76c5..2eaf71a7b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + import numba import numpy as np import numpy.typing as npt @@ -41,7 +43,7 @@ def update( self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32], - batch_vec: npt.NDArray[np.int64] | None = None, + batch_vec: Optional[npt.NDArray[np.int64]] = None, ) -> None: if self.n_batches == 1: assert batch_vec is None @@ -52,7 +54,7 @@ def update( def finalize( self, - ) -> tuple[ + ) -> Tuple[ npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], @@ -123,7 +125,7 @@ def update( self, var_vec: npt.NDArray[np.int64], val_vec: npt.NDArray[np.float32], - batch_vec: npt.NDArray[np.int64] | None = None, + batch_vec: Optional[npt.NDArray[np.int64]] = None, ) -> None: if self.n_batches == 1: assert batch_vec is None @@ -145,7 +147,7 @@ def update( self.clip_val, ) - def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: + def finalize(self) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: return self.counts_sum, self.squared_counts_sum @@ -280,7 +282,7 @@ def _mbomv_combine_batches( n_samples: npt.NDArray[np.int64], u: npt.NDArray[np.float64], M2: npt.NDArray[np.float64], -) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: +) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: """Combine all batches using Chan's parallel adaptation of Welford's. Returns tuple of (u, M2). diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py index 8cef3e24e..a5930525a 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py @@ -1,8 +1,7 @@ from __future__ import annotations -from collections.abc import Generator from concurrent import futures -from typing import Any +from typing import Any, Generator import numpy as np import numpy.typing as npt diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py index 9c229b889..6ee5db37a 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/util/_eager_iter.py @@ -1,10 +1,9 @@ import logging import threading from collections import deque -from collections.abc import Iterator from concurrent import futures from concurrent.futures import Future -from typing import TypeVar +from typing import Deque, Iterator, Optional, TypeVar util_logger = logging.getLogger("cellxgene_census.experimental.util") @@ -15,13 +14,13 @@ class _EagerIterator(Iterator[_T]): def __init__( self, iterator: Iterator[_T], - pool: futures.Executor | None = None, + pool: Optional[futures.Executor] = None, ): super().__init__() self.iterator = iterator self._pool = pool or futures.ThreadPoolExecutor() self._own_pool = pool is None - self._future: Future[_T] | None = None + self._future: Optional[Future[_T]] = None self._begin_next() def _begin_next(self) -> None: @@ -57,14 +56,14 @@ def __init__( self, iterator: Iterator[_T], max_pending: int = 1, - pool: futures.Executor | None = None, + pool: Optional[futures.Executor] = None, ): super().__init__() self.iterator = iterator self.max_pending = max_pending self._pool = pool or futures.ThreadPoolExecutor() self._own_pool = pool is None - self._pending_results: deque[futures.Future[_T]] = deque() + self._pending_results: Deque[futures.Future[_T]] = deque() self._lock = threading.Lock() self._begin_next() diff --git a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py index e76b87508..3490c4bbd 100644 --- a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py +++ b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py @@ -1,5 +1,6 @@ import pathlib -from collections.abc import Callable, Sequence +import sys +from typing import Callable, List, Optional, Sequence, Union from unittest.mock import patch import numpy as np @@ -49,17 +50,17 @@ def pytorch_seq_x_value_gen(obs_range: range, var_range: range) -> spmatrix: @pytest.fixture -def X_layer_names() -> list[str]: +def X_layer_names() -> List[str]: return ["raw"] @pytest.fixture -def obsp_layer_names() -> list[str] | None: +def obsp_layer_names() -> Optional[List[str]]: return None @pytest.fixture -def varp_layer_names() -> list[str] | None: +def varp_layer_names() -> Optional[List[str]]: return None @@ -101,8 +102,8 @@ def add_sparse_array( @pytest.fixture(scope="function") def soma_experiment( tmp_path: pathlib.Path, - obs_range: int | range, - var_range: int | range, + obs_range: Union[int, range], + var_range: Union[int, range], X_value_gen: Callable[[range, range], sparse.spmatrix], obsp_layer_names: Sequence[str], varp_layer_names: Sequence[str], @@ -484,6 +485,10 @@ def test_custom_encoders_fail_if_columns_defined(soma_experiment: Experiment) -> @pytest.mark.experimental +@pytest.mark.skipif( + (sys.version_info.major, sys.version_info.minor) == (3, 9), + reason="fails intermittently with OOM error for 3.9", +) # noinspection PyTestParametrized @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(6, 3, pytorch_x_value_gen)]) def test_multiprocessing__returns_full_result(soma_experiment: Experiment) -> None: @@ -515,11 +520,11 @@ def test_distributed__returns_data_partition_for_rank( """Tests pytorch._partition_obs_joinids() behavior in a simulated PyTorch distributed processing mode, using mocks to avoid having to do real PyTorch distributed setup.""" - with ( - patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, - patch("cellxgene_census.experimental.ml.pytorch.dist.get_rank") as mock_dist_get_rank, - patch("cellxgene_census.experimental.ml.pytorch.dist.get_world_size") as mock_dist_get_world_size, - ): + with patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, patch( + "cellxgene_census.experimental.ml.pytorch.dist.get_rank" + ) as mock_dist_get_rank, patch( + "cellxgene_census.experimental.ml.pytorch.dist.get_world_size" + ) as mock_dist_get_world_size: mock_dist_is_initialized.return_value = True mock_dist_get_rank.return_value = 1 mock_dist_get_world_size.return_value = 3 @@ -551,12 +556,13 @@ def test_distributed_and_multiprocessing__returns_data_partition_for_rank( DataLoader multiprocessing mode, using mocks to avoid having to do distributed pytorch setup or real DataLoader multiprocessing.""" - with ( - patch("torch.utils.data.get_worker_info") as mock_get_worker_info, - patch("cellxgene_census.experimental.ml.pytorch.dist.is_initialized") as mock_dist_is_initialized, - patch("cellxgene_census.experimental.ml.pytorch.dist.get_rank") as mock_dist_get_rank, - patch("cellxgene_census.experimental.ml.pytorch.dist.get_world_size") as mock_dist_get_world_size, - ): + with patch("torch.utils.data.get_worker_info") as mock_get_worker_info, patch( + "cellxgene_census.experimental.ml.pytorch.dist.is_initialized" + ) as mock_dist_is_initialized, patch( + "cellxgene_census.experimental.ml.pytorch.dist.get_rank" + ) as mock_dist_get_rank, patch( + "cellxgene_census.experimental.ml.pytorch.dist.get_world_size" + ) as mock_dist_get_world_size: mock_get_worker_info.return_value = WorkerInfo(id=1, num_workers=2, seed=1234) mock_dist_is_initialized.return_value = True mock_dist_get_rank.return_value = 1 diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py index ecc410d0e..3c113ea07 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Tuple, Union import numpy as np import numpy.ma as ma @@ -11,7 +11,7 @@ from cellxgene_census.experimental import pp -def var(X: sparse.csc_matrix | sparse.csr_matrix, axis: int = 0, ddof: int = 1) -> Any: +def var(X: Union[sparse.csc_matrix, sparse.csr_matrix], axis: int = 0, ddof: int = 1) -> Any: """ Variance of a sparse matrix calculated as mean(X**2) - mean(X)**2 with Bessel's correction applied for unbiased estimate @@ -52,7 +52,7 @@ def test_mean_variance( calc_mean: bool, calc_variance: bool, small_mem_context: soma.SOMATileDBContext, - obs_coords: tuple[None, slice], + obs_coords: Tuple[None, slice], ) -> None: with cellxgene_census.open_soma(census_version="latest", context=small_mem_context) as census: with census["census_data"][experiment_name].axis_query( @@ -119,7 +119,7 @@ def test_mean_variance_nnz_only( calc_mean: bool, calc_variance: bool, small_mem_context: soma.SOMATileDBContext, - obs_coords: tuple[None, slice], + obs_coords: Tuple[None, slice], ) -> None: # Note: since this test requires materializing the matrix in memory to compute the mean/variance, # we're going to use a coord slice based approach. This will ensure the matrix can fit in memory. diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py b/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py index a58a31628..b3c0f6f77 100644 --- a/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py +++ b/api/python/cellxgene_census/tests/experimental/test_embeddings_search.py @@ -1,5 +1,5 @@ import json -from typing import Any +from typing import Any, Dict, List import anndata as ad import numpy as np @@ -15,7 +15,7 @@ @pytest.mark.experimental @pytest.mark.live_corpus -def test_embeddings_search(true_neighbors: dict[str, Any], query_result: NeighborObs) -> None: +def test_embeddings_search(true_neighbors: Dict[str, Any], query_result: NeighborObs) -> None: # check result shapes rslt = query_result assert isinstance(rslt.neighbor_ids, np.ndarray) @@ -96,7 +96,7 @@ def test_predict_obs_metadata(query_anndata: ad.AnnData, query_result: NeighborO @pytest.fixture(scope="module") -def true_neighbors() -> dict[int, list[dict[str, Any]]]: +def true_neighbors() -> Dict[int, List[Dict[str, Any]]]: ans = {} for line in TRUE_NEAREST_NEIGHBORS_JSON.strip().split("\n"): example = json.loads(line) @@ -105,7 +105,7 @@ def true_neighbors() -> dict[int, list[dict[str, Any]]]: @pytest.fixture(scope="module") -def query_anndata(true_neighbors: dict[str, Any]) -> ad.AnnData: +def query_anndata(true_neighbors: Dict[str, Any]) -> ad.AnnData: with cellxgene_census.open_soma(census_version=TRUE_NEAREST_NEIGHBORS_CENSUS_VERSION) as census: return cellxgene_census.get_anndata( census, diff --git a/api/python/cellxgene_census/tests/test_acceptance.py b/api/python/cellxgene_census/tests/test_acceptance.py index bd01b840b..d4587e03e 100644 --- a/api/python/cellxgene_census/tests/test_acceptance.py +++ b/api/python/cellxgene_census/tests/test_acceptance.py @@ -11,8 +11,7 @@ See README.md for historical data. """ -from collections.abc import Iterator -from typing import Any +from typing import Any, Dict, Iterator, Optional, Tuple import pyarrow as pa import pytest @@ -22,7 +21,7 @@ from cellxgene_census._open import DEFAULT_TILEDB_CONFIGURATION -def make_context(census_version: str, config: dict[str, Any] | None = None) -> soma.SOMATileDBContext: +def make_context(census_version: str, config: Optional[Dict[str, Any]] = None) -> soma.SOMATileDBContext: config = config or {} version = cellxgene_census.get_census_version_description(census_version) s3_region = version["soma"].get("s3_region", "us-west-2") @@ -52,7 +51,7 @@ def test_load_axes(organism: str) -> None: del var_df -def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: int | None = 2) -> bool: +def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: Optional[int] = 2) -> bool: """ Utility that verifies that the value is an iterator of pa.Table. @@ -79,7 +78,7 @@ def table_iter_is_ok(tbl_iter: Iterator[pa.Table], stop_after: int | None = 2) - pytest.param(None, DEFAULT_TILEDB_CONFIGURATION, marks=pytest.mark.expensive), ], ) -def test_incremental_read_obs(organism: str, stop_after: int | None, ctx_config: dict[str, Any] | None) -> None: +def test_incremental_read_obs(organism: str, stop_after: Optional[int], ctx_config: Optional[Dict[str, Any]]) -> None: """Verify that obs, var and X[raw] can be read incrementally, i.e., in chunks""" # ctx_config=None open census with a small (default) TileDB buffer size, which reduces @@ -102,7 +101,7 @@ def test_incremental_read_obs(organism: str, stop_after: int | None, ctx_config: pytest.param(None, DEFAULT_TILEDB_CONFIGURATION, marks=pytest.mark.expensive), ], ) -def test_incremental_read_var(organism: str, stop_after: int | None, ctx_config: dict[str, Any] | None) -> None: +def test_incremental_read_var(organism: str, stop_after: Optional[int], ctx_config: Optional[Dict[str, Any]]) -> None: """Verify that var can be read incrementally, i.e., in chunks""" # ctx_config=None open census with a small (default) TileDB buffer size, which reduces @@ -144,9 +143,9 @@ def test_incremental_read_var(organism: str, stop_after: int | None, ctx_config: ) def test_incremental_read_X( organism: str, - stop_after: int | None, - ctx_config: dict[str, Any] | None, - coords: tuple[slice, slice] | None, + stop_after: Optional[int], + ctx_config: Optional[Dict[str, Any]], + coords: Optional[Tuple[slice, slice]], ) -> None: """Verify that obs, var and X[raw] can be read incrementally, i.e., in chunks""" @@ -166,7 +165,7 @@ def test_incremental_read_X( ["tissue=='aorta'", pytest.param("tissue=='brain'", marks=pytest.mark.expensive)], ) @pytest.mark.parametrize("stop_after", [2, pytest.param(None, marks=pytest.mark.expensive)]) -def test_incremental_query(organism: str, obs_value_filter: str, stop_after: int | None) -> None: +def test_incremental_query(organism: str, obs_value_filter: str, stop_after: Optional[int]) -> None: """Verify incremental read of query result.""" # use default TileDB configuration with cellxgene_census.open_soma(census_version="latest") as census: @@ -261,9 +260,9 @@ def test_incremental_query(organism: str, obs_value_filter: str, stop_after: int ) def test_get_anndata( organism: str, - obs_value_filter: str | None, - obs_coords: slice | None, - ctx_config: dict[str, Any] | None, + obs_value_filter: Optional[str], + obs_coords: Optional[slice], + ctx_config: Optional[Dict[str, Any]], ) -> None: """Verify query and read into AnnData""" ctx_config = ctx_config or {} diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index f3161cca8..fb3375f97 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -1,4 +1,4 @@ -from typing import Any, Literal +from typing import Any, Dict, List, Literal import numpy as np import pandas as pd @@ -83,7 +83,7 @@ def test_get_anndata_x_layer(census: soma.Collection, layer: str) -> None: @pytest.mark.live_corpus @pytest.mark.parametrize("layers", [["raw", "normalized"], ["normalized", "raw"]]) -def test_get_anndata_two_layers(census: soma.Collection, layers: list[str]) -> None: +def test_get_anndata_two_layers(census: soma.Collection, layers: List[str]) -> None: ad_primary_layer_in_X = cellxgene_census.get_anndata( census, organism="Homo sapiens", @@ -165,7 +165,7 @@ def test_get_anndata_obsm_one_layer(dec_lts_census: soma.Collection, obsm_layer: @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layers", [["scvi", "geneformer"]]) -def test_get_anndata_obsm_two_layers(dec_lts_census: soma.Collection, obsm_layers: list[str]) -> None: +def test_get_anndata_obsm_two_layers(dec_lts_census: soma.Collection, obsm_layers: List[str]) -> None: # NOTE: This test only works on the 2023-12-15 LTS Census, since in newer releases # the embeddings aren't distributed via the `obsm_layer` parameter. ad = cellxgene_census.get_anndata( @@ -184,10 +184,8 @@ def test_get_anndata_obsm_two_layers(dec_lts_census: soma.Collection, obsm_layer @pytest.mark.live_corpus -@pytest.mark.parametrize("obs_embeddings", [["scvi", "geneformer"]]) -def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: list[str]) -> None: - # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, - # so this test could require adjustments. +@pytest.mark.parametrize("obs_embeddings", [["scvi", "scgpt"]]) +def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: List[str]) -> None: ad = cellxgene_census.get_anndata( lts_census, organism="Homo sapiens", @@ -206,7 +204,7 @@ def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: @pytest.mark.live_corpus @pytest.mark.parametrize("var_embeddings", [["nmf"]]) -def test_get_anndata_var_embeddings(dec_lts_census: soma.Collection, var_embeddings: list[str]) -> None: +def test_get_anndata_var_embeddings(dec_lts_census: soma.Collection, var_embeddings: List[str]) -> None: # NOTE: this test only works on the 2023-12-15 LTS Census, since var embeddings # aren't available in the newer releases. @@ -301,7 +299,7 @@ def test_deprecated_column_api(census: soma.Collection) -> None: pd.testing.assert_frame_equal(ad_curr.var, ad_prev.var) -def _map_to_get_anndata_args(query: dict[str, Any], axis: Literal["obs", "var"]) -> dict[str, Any]: +def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) -> Dict[str, Any]: """Helper to map arguments of get_obs/ get_var to get_anndata.""" result = {} if "coords" in query: @@ -334,7 +332,7 @@ def _map_to_get_anndata_args(query: dict[str, Any], axis: Literal["obs", "var"]) pytest.param({"value_filter": "tissue_general == 'vasculature'"}, id="value_filter"), ], ) -def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None: +def test_get_obs(lts_census: soma.Collection, query: Dict[str, Any]) -> None: adata_obs = cellxgene_census.get_anndata( lts_census, organism="Mus musculus", **_map_to_get_anndata_args(query, "obs") ).obs @@ -360,7 +358,7 @@ def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None: pytest.param({"value_filter": "feature_name in ['Gm53058', '0610010K14Rik']"}, id="value_filter"), ], ) -def test_get_var(lts_census: soma.Collection, query: dict[str, Any]) -> None: +def test_get_var(lts_census: soma.Collection, query: Dict[str, Any]) -> None: adata_var = cellxgene_census.get_anndata( lts_census, organism="Mus musculus", obs_coords=slice(0), **_map_to_get_anndata_args(query, "var") ).var diff --git a/api/python/cellxgene_census/tests/test_lts_compat.py b/api/python/cellxgene_census/tests/test_lts_compat.py index 2c486d541..dbe646cdd 100644 --- a/api/python/cellxgene_census/tests/test_lts_compat.py +++ b/api/python/cellxgene_census/tests/test_lts_compat.py @@ -9,8 +9,7 @@ from __future__ import annotations from collections import deque -from collections.abc import Iterator, Sequence -from typing import Literal, TypeAlias, get_args +from typing import Iterator, Literal, Sequence, Union, get_args import pyarrow as pa import pytest @@ -28,9 +27,14 @@ ] CollectionTypeNames = ["SOMACollection", "SOMAExperiment", "SOMAMeasurement"] -SOMATypes: TypeAlias = ( - soma.Collection | soma.DataFrame | soma.SparseNDArray | soma.DenseNDArray | soma.Experiment | soma.Measurement -) +SOMATypes = Union[ + soma.Collection, + soma.DataFrame, + soma.SparseNDArray, + soma.DenseNDArray, + soma.Experiment, + soma.Measurement, +] def walk_census( diff --git a/api/python/cellxgene_census/tests/test_open.py b/api/python/cellxgene_census/tests/test_open.py index 5945ea9e4..df20b3337 100644 --- a/api/python/cellxgene_census/tests/test_open.py +++ b/api/python/cellxgene_census/tests/test_open.py @@ -442,8 +442,8 @@ def test_opening_census_without_anon_access_fails_with_bogus_creds() -> None: os.environ["AWS_SECRET_ACCESS_KEY"] = "fake_key" # Passing an empty context with pytest.raises( - (tiledb.TileDBError, soma.DoesNotExistError), - match=r"does not exist", + tiledb.TileDBError, + match=r"The AWS Access Key Id you provided does not exist in our records", ): cellxgene_census.open_soma(census_version="latest", context=soma.SOMATileDBContext()) diff --git a/api/python/cellxgene_census/tests/test_user_agent.py b/api/python/cellxgene_census/tests/test_user_agent.py index 41612c649..dc410df9a 100644 --- a/api/python/cellxgene_census/tests/test_user_agent.py +++ b/api/python/cellxgene_census/tests/test_user_agent.py @@ -3,10 +3,9 @@ import json import os -from collections.abc import Callable from functools import partial from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Callable import numpy as np import proxy diff --git a/api/python/notebooks/README.md b/api/python/notebooks/README.md index cdf89656d..2b1c683ef 100644 --- a/api/python/notebooks/README.md +++ b/api/python/notebooks/README.md @@ -9,7 +9,7 @@ Demonstration notebooks for the CZ CELLxGENE Discover Census. There are two kind You must be on a Linux or MacOS system, with the following installed: -* Python 3.10 to 3.12 +* Python 3.8 to 3.11 * Jupyter or some other means of running notebooks (e.g., vscode) For now, it is recommended that you do all this on a host with sufficient memory, diff --git a/docs/cellxgene_census_docsite_installation.md b/docs/cellxgene_census_docsite_installation.md index 0cfbd969b..4654eb37a 100644 --- a/docs/cellxgene_census_docsite_installation.md +++ b/docs/cellxgene_census_docsite_installation.md @@ -4,7 +4,7 @@ The Census API requires a Linux or MacOS system with: -- Python 3.10 to Python 3.12. Or R, supported versions TBD. +- Python 3.8 to Python 3.11. Or R, supported versions TBD. - Recommended: >16 GB of memory. - Recommended: >5 Mbps internet connection. - Recommended: for increased performance use the API through a AWS-EC2 instance from the region `us-west-2`. The Census data builds are hosted in a AWS-S3 bucket in that region.