From 7be0fecd1343702f3b5b60806a7a0ae4e87a292f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 18 Oct 2024 13:04:09 -0700 Subject: [PATCH] Update cellxgene schema version to 5.2.0 (#1300) * Update cellxgene schema version to 5.2.0 * specify architecture for docker to build image --- docs/cellxgene_census_schema.md | 28 +++++++++---------- tools/cellxgene_census_builder/Makefile | 2 +- tools/cellxgene_census_builder/pyproject.toml | 2 +- .../build_soma/globals.py | 2 +- .../tests/anndata/test_anndata.py | 6 ++-- .../tests/conftest.py | 2 +- .../tests/test_manifest.py | 10 +++---- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/cellxgene_census_schema.md b/docs/cellxgene_census_schema.md index 511899ec0..f1eb9ece1 100644 --- a/docs/cellxgene_census_schema.md +++ b/docs/cellxgene_census_schema.md @@ -10,14 +10,14 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S The CZ CELLxGENE Discover Census, hereafter referred as Census, is a versioned data object and API for most of the single-cell data hosted at [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/). To learn more about the Census visit the `chanzuckerberg/cellxgene-census` [github repository](https://github.com/chanzuckerberg/cellxgene-census) -To better understand this document the reader should be familiar with the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md) and [SOMA](https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md). +To better understand this document the reader should be familiar with the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md) and [SOMA](https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md). 
## Definitions The following terms are used throughout this document: * adata – generic variable name that refers to an [`AnnData`](https://anndata.readthedocs.io/) object. -* CELLxGENE dataset schema – the data schema for h5ad files served by CELLxGENE Discover, for this Census schema: [CELLxGENE dataset schema version is 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md) +* CELLxGENE dataset schema – the data schema for h5ad files served by CELLxGENE Discover, for this Census schema: [CELLxGENE dataset schema version is 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md) * census\_obj – the Census root object, a SOMACollection. * Census data release – a versioned Census object deposited in a public bucket and accessible by APIs. * tissue – original tissue annotation. @@ -44,17 +44,17 @@ Census data releases are versioned separately from the schema. ### Data included -All datasets included in the Census MUST be of [CELLxGENE dataset schema version 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md). The following data constraints are imposed on top of the CELLxGENE dataset schema. +All datasets included in the Census MUST be of [CELLxGENE dataset schema version 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md). The following data constraints are imposed on top of the CELLxGENE dataset schema. #### Species -The Census MUST only contain observations (cells) with an [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#organism_ontology_term_id) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included. 
+The Census MUST only contain observations (cells) with an [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#organism_ontology_term_id) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens*. -The Census MUST only contain features (genes) with a [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_reference) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included +The Census MUST only contain features (genes) with a [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#feature_reference) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens*. #### Multi-species data constraints -Per the CELLxGENE dataset schema, [multi-species datasets MAY contain observations (cells) of a given organism and features (genes) of a different one](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#general-requirements), as defined in [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#organism_ontology_term_id) and [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_reference) respectively.
+Per the CELLxGENE dataset schema, [multi-species datasets MAY contain observations (cells) of a given organism and features (genes) of a different one](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#general-requirements), as defined in [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#organism_ontology_term_id) and [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#feature_reference) respectively. For any given multi-species dataset, observation and features from the dataset are included in the Census as defined by the following: @@ -114,7 +114,7 @@ The table below shows all possible combinations of organisms for both observatio #### Assays -Assays are defined in the CELLxGENE dataset schema in [`assay_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#assay_ontology_term_id). +Assays are defined in the CELLxGENE dataset schema in [`assay_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#assay_ontology_term_id). The Census MUST include all cells from the list of [accepted assays](./census_accepted_assays.csv). @@ -143,15 +143,15 @@ These data need to be normalized by gene length for downstream analysis. #### Data matrix types -Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers). Author-normalized data layers [as defined in the CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers) MUST NOT be included in the Census. 
+Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#x-matrix-layers). Author-normalized data layers [as defined in the CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#x-matrix-layers) MUST NOT be included in the Census. #### Sample types -Only observations (cells) from primary tissue MUST be included in the Census. Thus, ONLY those observations with a [`tissue_type`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#tissue_type) value equal to "tissue" MUST be included; other values of `tissue_type` MUST NOT be included. +Only observations (cells) from primary tissue MUST be included in the Census. Thus, ONLY those observations with a [`tissue_type`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#tissue_type) value equal to "tissue" MUST be included; other values of `tissue_type` MUST NOT be included. #### Repeated data -When a cell is represented multiple times in CELLxGENE Discover, only one is marked as the primary cell. This is defined in the CELLxGENE dataset schema under [`is_primary_data`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#is_primary_data). This information MUST be included in the Census cell metadata to enable queries that retrieve datasets (see cell metadata below), and all cells MUST be included in the Census. +When a cell is represented multiple times in CELLxGENE Discover, only one is marked as the primary cell. This is defined in the CELLxGENE dataset schema under [`is_primary_data`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#is_primary_data). 
This information MUST be included in the Census cell metadata to enable queries that retrieve datasets (see cell metadata below), and all cells MUST be included in the Census. ### Data encoding and organization @@ -660,7 +660,7 @@ For each organism the `SOMAExperiment` MUST contain the following: #### Matrix Data, count (raw) matrix – `census_obj["census_data"][organism].ms["RNA"].X["raw"]` – `SOMASparseNDArray` -Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers). These counts MUST be encoded as `float32` in this `SOMASparseNDArray` with a fill value of zero (0), and no explicitly stored zero values. +Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#x-matrix-layers). These counts MUST be encoded as `float32` in this `SOMASparseNDArray` with a fill value of zero (0), and no explicitly stored zero values. #### Matrix Data, normalized count matrix – `census_obj["census_data"][organism].ms["RNA"].X["normalized"]` – `SOMASparseNDArray` @@ -674,9 +674,9 @@ as `normalized[i,j] = X[i,j] / sum(X[i, ])`. #### Feature metadata – `census_obj["census_data"][organism].ms["RNA"].var` – `SOMADataFrame` -The Census MUST only contain features with a [`feature_biotype`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_biotype) value of "gene". +The Census MUST only contain features with a [`feature_biotype`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#feature_biotype) value of "gene". -The [gene references are pinned](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#required-gene-annotations) as defined in the CELLxGENE dataset schema. 
+The [gene references are pinned](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#required-gene-annotations) as defined in the CELLxGENE dataset schema. The following columns MUST be included: @@ -876,7 +876,7 @@ Cell metadata MUST be encoded as a `SOMADataFrame` with the following columns: ### Version 2.1.0 -* Update to require [CELLxGENE schema version 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md) +* Update to require [CELLxGENE schema version 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md) * Adds `collection_doi_label` to "Census table of CELLxGENE Discover datasets – `census_obj["census_info"]["datasets"]`" ### Version 2.0.1 diff --git a/tools/cellxgene_census_builder/Makefile b/tools/cellxgene_census_builder/Makefile index 5c597a400..ed12053bf 100644 --- a/tools/cellxgene_census_builder/Makefile +++ b/tools/cellxgene_census_builder/Makefile @@ -9,7 +9,7 @@ .PHONY: image image: clean python3 -m build . - docker build --build-arg=COMMIT_SHA="$$(git describe)" -t cellxgene-census-builder . + docker build --platform linux/amd64 --build-arg=COMMIT_SHA="$$(git describe)" -t cellxgene-census-builder . 
# Clean Python build .PHONY: clean diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml index ce22ff228..51c704a15 100644 --- a/tools/cellxgene_census_builder/pyproject.toml +++ b/tools/cellxgene_census_builder/pyproject.toml @@ -36,7 +36,7 @@ dependencies= [ # https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md "tiledbsoma==1.11.4", "cellxgene-census==1.15.0", - "cellxgene-ontology-guide==1.0.0", + "cellxgene-ontology-guide==1.2.0", "scipy==1.12.0", "fsspec[http]==2024.3.1", "s3fs==2024.3.1", diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py index ec044a267..a0717aa1e 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py @@ -13,7 +13,7 @@ CENSUS_SCHEMA_VERSION = "2.1.0" -CXG_SCHEMA_VERSION = "5.1.0" # the CELLxGENE schema version supported +CXG_SCHEMA_VERSION = "5.2.0" # the CELLxGENE schema version supported # Columns expected in the census_datasets dataframe CENSUS_DATASETS_TABLE_SPEC = TableSpec.create( diff --git a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py index dc2f1e999..b940f342a 100644 --- a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py +++ b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py @@ -265,7 +265,7 @@ def test_empty_estimated_density(tmp_path: pathlib.Path) -> None: adata = anndata.AnnData( obs=pd.DataFrame(), var=pd.DataFrame({"feature_id": [0, 1, 2]}), X=sparse.csr_matrix((0, 3), dtype=np.float32) ) - adata.uns["schema_version"] = "5.1.0" + adata.uns["schema_version"] = "5.2.0" adata.write_h5ad(path) with open_anndata(path) as ad: @@ -297,7 +297,7 @@ def test_open_anndata_raw_X(tmp_path: pathlib.Path) -> None: 
var=pd.DataFrame({"feature_id": [0, 1, 2]}), X=sparse.csr_matrix((2, 3), dtype=np.float32), raw={"X": sparse.csr_matrix((2, 4), dtype=np.float32)}, - uns={"schema_version": "5.1.0"}, + uns={"schema_version": "5.2.0"}, ) adata.write_h5ad(path) @@ -410,7 +410,7 @@ def test_multi_species_filter( index=[f"feature_{i}" for i in range(n_vars)], ), X=sparse.random(n_obs, n_vars, format="csr", dtype=np.float32), - uns={"schema_version": "5.1.0"}, + uns={"schema_version": "5.2.0"}, ) path = (tmp_path / "species.h5ad").as_posix() adata.write_h5ad(path) diff --git a/tools/cellxgene_census_builder/tests/conftest.py b/tools/cellxgene_census_builder/tests/conftest.py index adccea725..c03027cf4 100644 --- a/tools/cellxgene_census_builder/tests/conftest.py +++ b/tools/cellxgene_census_builder/tests/conftest.py @@ -116,7 +116,7 @@ def get_anndata( uns["batch_condition"] = np.array(["a", "b"], dtype="object") # Need to carefully set the corpora schema versions in order for tests to pass. - uns["schema_version"] = "5.1.0" # type: ignore + uns["schema_version"] = "5.2.0" # type: ignore return anndata.AnnData(X=X, obs=obs, var=var, obsm=obsm, uns=uns) diff --git a/tools/cellxgene_census_builder/tests/test_manifest.py b/tools/cellxgene_census_builder/tests/test_manifest.py index fb9098cea..1228696ac 100644 --- a/tools/cellxgene_census_builder/tests/test_manifest.py +++ b/tools/cellxgene_census_builder/tests/test_manifest.py @@ -65,7 +65,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None: "collection_doi_label": "Publication 1", "citation": "citation", "title": "dataset #1", - "schema_version": "5.1.0", + "schema_version": "5.2.0", "assets": [ { "filesize": 123, @@ -90,7 +90,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None: "collection_doi_label": "Publication 2", "citation": "citation", "title": "dataset #2", - "schema_version": "5.1.0", + "schema_version": "5.2.0", "assets": [{"filesize": 456, "filetype": "H5AD", "url": 
"https://fake.url/dataset_id_2.h5ad"}], "dataset_version_id": "dataset_id_2", "cell_count": 11, @@ -122,7 +122,7 @@ def test_load_manifest_from_cxg_errors_on_datasets_with_old_schema( "collection_doi_label": "Publication 1", "citation": "citation", "title": "dataset #1", - "schema_version": "5.1.0", + "schema_version": "5.2.0", "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}], "dataset_version_id": "dataset_id_1", "cell_count": 10, @@ -166,7 +166,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets( "collection_doi": None, "citation": "citation", "title": "dataset #1", - "schema_version": "5.1.0", + "schema_version": "5.2.0", "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}], "dataset_version_id": "dataset_id_1", "cell_count": 10, @@ -179,7 +179,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets( "collection_doi": None, "citation": "citation", "title": "dataset #2", - "schema_version": "5.1.0", + "schema_version": "5.2.0", "assets": [], "dataset_version_id": "dataset_id_2", "cell_count": 10,