From 7be0fecd1343702f3b5b60806a7a0ae4e87a292f Mon Sep 17 00:00:00 2001
From: Isaac Virshup <ivirshup@gmail.com>
Date: Fri, 18 Oct 2024 13:04:09 -0700
Subject: [PATCH] Update cellxgene schema version to 5.2.0 (#1300)

* Update cellxgene schema version to 5.2.0

* specify architecture for docker to build image
---
 docs/cellxgene_census_schema.md               | 28 +++++++++----------
 tools/cellxgene_census_builder/Makefile       |  2 +-
 tools/cellxgene_census_builder/pyproject.toml |  2 +-
 .../build_soma/globals.py                     |  2 +-
 .../tests/anndata/test_anndata.py             |  6 ++--
 .../tests/conftest.py                         |  2 +-
 .../tests/test_manifest.py                    | 10 +++----
 7 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/docs/cellxgene_census_schema.md b/docs/cellxgene_census_schema.md
index 511899ec0..f1eb9ece1 100644
--- a/docs/cellxgene_census_schema.md
+++ b/docs/cellxgene_census_schema.md
@@ -10,14 +10,14 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S
 
 The CZ CELLxGENE Discover Census, hereafter referred as Census, is a versioned data object and API for most of the single-cell data hosted at [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/). To learn more about the Census visit the `chanzuckerberg/cellxgene-census` [github repository](https://github.com/chanzuckerberg/cellxgene-census)
 
-To better understand this document the reader should be familiar with the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md) and [SOMA](https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md).
+To better understand this document the reader should be familiar with the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md) and [SOMA](https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md).
 
 ## Definitions
 
 The following terms are used throughout this document:
 
 * adata – generic variable name that refers to an [`AnnData`](https://anndata.readthedocs.io/) object.
-* CELLxGENE dataset schema – the data schema for h5ad files served by CELLxGENE Discover, for this Census schema: [CELLxGENE dataset schema version is 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md)
+* CELLxGENE dataset schema – the data schema for h5ad files served by CELLxGENE Discover, for this Census schema: [CELLxGENE dataset schema version is 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md)
 * census\_obj – the Census root object, a SOMACollection.
 * Census data release – a versioned Census object deposited in a public bucket and accessible by APIs.
 * tissue – original tissue annotation.
@@ -44,17 +44,17 @@ Census data releases are versioned separately from the schema.
 
 ### Data included
 
-All datasets included in the Census MUST be of [CELLxGENE dataset schema version 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md). The following data constraints are imposed on top of the CELLxGENE dataset schema.
+All datasets included in the Census MUST be of [CELLxGENE dataset schema version 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md). The following data constraints are imposed on top of the CELLxGENE dataset schema.
 
 #### Species
 
-The Census MUST only contain observations (cells) with an  [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#organism_ontology_term_id) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included.
+The Census MUST only contain observations (cells) with an  [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#organism_ontology_term_id) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included.
 
-The Census MUST only contain features (genes) with a [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_reference) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included
+The Census MUST only contain features (genes) with a [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#feature_reference) value of either "NCBITaxon:10090" for *Mus musculus* or "NCBITaxon:9606" for *Homo sapiens* MUST be included
 
 #### Multi-species data constraints
 
-Per the CELLxGENE dataset schema, [multi-species datasets MAY contain observations (cells) of a given organism and features (genes) of a different one](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#general-requirements), as defined in [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#organism_ontology_term_id) and [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_reference) respectively.
+Per the CELLxGENE dataset schema, [multi-species datasets MAY contain observations (cells) of a given organism and features (genes) of a different one](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#general-requirements), as defined in [`organism_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#organism_ontology_term_id) and [`feature_reference`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#feature_reference) respectively.
 
 For any given multi-species dataset, observation and features from the dataset are included in the Census as defined by the following:
 
@@ -114,7 +114,7 @@ The table below shows all possible combinations of organisms for both observatio
 
 #### Assays
 
-Assays are defined in the CELLxGENE dataset schema in [`assay_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#assay_ontology_term_id).
+Assays are defined in the CELLxGENE dataset schema in [`assay_ontology_term_id`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#assay_ontology_term_id).
 
 The Census MUST include all cells from the list of [accepted assays](./census_accepted_assays.csv).
 
@@ -143,15 +143,15 @@ These data need to be normalized by gene length for downstream analysis.
 
 #### Data matrix types
 
-Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers). Author-normalized data layers [as defined in the CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers) MUST NOT be included in the Census.
+Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#x-matrix-layers). Author-normalized data layers [as defined in the CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#x-matrix-layers) MUST NOT be included in the Census.
 
 #### Sample types
 
-Only observations (cells) from primary tissue MUST be included in the Census. Thus, ONLY those observations with a [`tissue_type`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#tissue_type) value equal to "tissue" MUST be included; other values of `tissue_type` MUST NOT be included.
+Only observations (cells) from primary tissue MUST be included in the Census. Thus, ONLY those observations with a [`tissue_type`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#tissue_type) value equal to "tissue" MUST be included; other values of `tissue_type` MUST NOT be included.
 
 #### Repeated data
 
-When a cell is represented multiple times in CELLxGENE Discover, only one is marked as the primary cell. This is defined in the CELLxGENE dataset schema under [`is_primary_data`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#is_primary_data). This information MUST be included in the Census cell metadata to enable queries that retrieve datasets (see cell metadata below), and all cells MUST be included in the Census.
+When a cell is represented multiple times in CELLxGENE Discover, only one is marked as the primary cell. This is defined in the CELLxGENE dataset schema under [`is_primary_data`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#is_primary_data). This information MUST be included in the Census cell metadata to enable queries that retrieve datasets (see cell metadata below), and all cells MUST be included in the Census.
 
 ### Data encoding and organization
 
@@ -660,7 +660,7 @@ For each organism the `SOMAExperiment` MUST contain the following:
 
 #### Matrix Data, count (raw) matrix – `census_obj["census_data"][organism].ms["RNA"].X["raw"]` – `SOMASparseNDArray`
 
-Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#x-matrix-layers). These counts MUST be encoded as `float32` in this `SOMASparseNDArray` with a fill value of zero (0), and no explicitly stored zero values.
+Per the CELLxGENE dataset schema, [all RNA assays MUST include UMI or read counts](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#x-matrix-layers). These counts MUST be encoded as `float32` in this `SOMASparseNDArray` with a fill value of zero (0), and no explicitly stored zero values.
 
 #### Matrix Data, normalized count matrix – `census_obj["census_data"][organism].ms["RNA"].X["normalized"]` – `SOMASparseNDArray`
 
@@ -674,9 +674,9 @@ as `normalized[i,j] = X[i,j] / sum(X[i, ])`.
 
 #### Feature metadata – `census_obj["census_data"][organism].ms["RNA"].var` – `SOMADataFrame`
 
-The Census MUST only contain features with a [`feature_biotype`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#feature_biotype) value of "gene".
+The Census MUST only contain features with a [`feature_biotype`](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#feature_biotype) value of "gene".
 
-The [gene references are pinned](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md#required-gene-annotations) as defined in the CELLxGENE dataset schema.
+The [gene references are pinned](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md#required-gene-annotations) as defined in the CELLxGENE dataset schema.
 
 The following columns MUST be included:
 
@@ -876,7 +876,7 @@ Cell metadata MUST be encoded as a `SOMADataFrame` with the following columns:
 
 ### Version 2.1.0
 
-* Update to require [CELLxGENE schema version 5.1.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.1.0/schema.md)
+* Update to require [CELLxGENE schema version 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md)
 * Adds `collection_doi_label` to "Census table of CELLxGENE Discover datasets – `census_obj["census_info"]["datasets"]`"
 
 ### Version 2.0.1
diff --git a/tools/cellxgene_census_builder/Makefile b/tools/cellxgene_census_builder/Makefile
index 5c597a400..ed12053bf 100644
--- a/tools/cellxgene_census_builder/Makefile
+++ b/tools/cellxgene_census_builder/Makefile
@@ -9,7 +9,7 @@
 .PHONY: image
 image: clean
 	python3 -m build .
-	docker build --build-arg=COMMIT_SHA="$$(git describe)" -t cellxgene-census-builder .
+	docker build --platform linux/amd64 --build-arg=COMMIT_SHA="$$(git describe)" -t cellxgene-census-builder .
 
 # Clean Python build
 .PHONY: clean
diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml
index ce22ff228..51c704a15 100644
--- a/tools/cellxgene_census_builder/pyproject.toml
+++ b/tools/cellxgene_census_builder/pyproject.toml
@@ -36,7 +36,7 @@ dependencies= [
     #    https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md
     "tiledbsoma==1.11.4",
     "cellxgene-census==1.15.0",
-    "cellxgene-ontology-guide==1.0.0",
+    "cellxgene-ontology-guide==1.2.0",
     "scipy==1.12.0",
     "fsspec[http]==2024.3.1",
     "s3fs==2024.3.1",
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
index ec044a267..a0717aa1e 100644
--- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
+++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -13,7 +13,7 @@
 
 CENSUS_SCHEMA_VERSION = "2.1.0"
 
-CXG_SCHEMA_VERSION = "5.1.0"  # the CELLxGENE schema version supported
+CXG_SCHEMA_VERSION = "5.2.0"  # the CELLxGENE schema version supported
 
 # Columns expected in the census_datasets dataframe
 CENSUS_DATASETS_TABLE_SPEC = TableSpec.create(
diff --git a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
index dc2f1e999..b940f342a 100644
--- a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
+++ b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
@@ -265,7 +265,7 @@ def test_empty_estimated_density(tmp_path: pathlib.Path) -> None:
     adata = anndata.AnnData(
         obs=pd.DataFrame(), var=pd.DataFrame({"feature_id": [0, 1, 2]}), X=sparse.csr_matrix((0, 3), dtype=np.float32)
     )
-    adata.uns["schema_version"] = "5.1.0"
+    adata.uns["schema_version"] = "5.2.0"
     adata.write_h5ad(path)
 
     with open_anndata(path) as ad:
@@ -297,7 +297,7 @@ def test_open_anndata_raw_X(tmp_path: pathlib.Path) -> None:
         var=pd.DataFrame({"feature_id": [0, 1, 2]}),
         X=sparse.csr_matrix((2, 3), dtype=np.float32),
         raw={"X": sparse.csr_matrix((2, 4), dtype=np.float32)},
-        uns={"schema_version": "5.1.0"},
+        uns={"schema_version": "5.2.0"},
     )
     adata.write_h5ad(path)
 
@@ -410,7 +410,7 @@ def test_multi_species_filter(
             index=[f"feature_{i}" for i in range(n_vars)],
         ),
         X=sparse.random(n_obs, n_vars, format="csr", dtype=np.float32),
-        uns={"schema_version": "5.1.0"},
+        uns={"schema_version": "5.2.0"},
     )
     path = (tmp_path / "species.h5ad").as_posix()
     adata.write_h5ad(path)
diff --git a/tools/cellxgene_census_builder/tests/conftest.py b/tools/cellxgene_census_builder/tests/conftest.py
index adccea725..c03027cf4 100644
--- a/tools/cellxgene_census_builder/tests/conftest.py
+++ b/tools/cellxgene_census_builder/tests/conftest.py
@@ -116,7 +116,7 @@ def get_anndata(
     uns["batch_condition"] = np.array(["a", "b"], dtype="object")
 
     # Need to carefully set the corpora schema versions in order for tests to pass.
-    uns["schema_version"] = "5.1.0"  # type: ignore
+    uns["schema_version"] = "5.2.0"  # type: ignore
 
     return anndata.AnnData(X=X, obs=obs, var=var, obsm=obsm, uns=uns)
 
diff --git a/tools/cellxgene_census_builder/tests/test_manifest.py b/tools/cellxgene_census_builder/tests/test_manifest.py
index fb9098cea..1228696ac 100644
--- a/tools/cellxgene_census_builder/tests/test_manifest.py
+++ b/tools/cellxgene_census_builder/tests/test_manifest.py
@@ -65,7 +65,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None:
                 "collection_doi_label": "Publication 1",
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.1.0",
+                "schema_version": "5.2.0",
                 "assets": [
                     {
                         "filesize": 123,
@@ -90,7 +90,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None:
                 "collection_doi_label": "Publication 2",
                 "citation": "citation",
                 "title": "dataset #2",
-                "schema_version": "5.1.0",
+                "schema_version": "5.2.0",
                 "assets": [{"filesize": 456, "filetype": "H5AD", "url": "https://fake.url/dataset_id_2.h5ad"}],
                 "dataset_version_id": "dataset_id_2",
                 "cell_count": 11,
@@ -122,7 +122,7 @@ def test_load_manifest_from_cxg_errors_on_datasets_with_old_schema(
                 "collection_doi_label": "Publication 1",
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.1.0",
+                "schema_version": "5.2.0",
                 "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}],
                 "dataset_version_id": "dataset_id_1",
                 "cell_count": 10,
@@ -166,7 +166,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets(
                 "collection_doi": None,
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.1.0",
+                "schema_version": "5.2.0",
                 "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}],
                 "dataset_version_id": "dataset_id_1",
                 "cell_count": 10,
@@ -179,7 +179,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets(
                 "collection_doi": None,
                 "citation": "citation",
                 "title": "dataset #2",
-                "schema_version": "5.1.0",
+                "schema_version": "5.2.0",
                 "assets": [],
                 "dataset_version_id": "dataset_id_2",
                 "cell_count": 10,