comment-block

single-cell-data · Jul 27, 2023 · a53a4dd · a53a4dd
1 parent e5cc6e6
commit a53a4dd
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 4 deletions.
diff --git a/apis/python/src/tiledbsoma/io/registration/__init__.py b/apis/python/src/tiledbsoma/io/registration/__init__.py
@@ -1,4 +1,102 @@
-"""TODO: docstring"""
+"""
+Support for soma_joinid remapping for append-mode ingestion.
+
+This is an internal-use module; none of it is user-facing API.
+
+The SOMA experiment ``obs``, ``var``, ``X``, etc. are indexed by soma_joinid.  Input AnnData/H5AD
+``obs`` and ``var`` are indexed by an index column; ``X`` et al. are indexed zero-up by row numbers
+aligning with ``obs`` and ``var``. This raises the issue: if we have a SOMA experiment created
+by ingesting two or more AnnData/H5AD files, how do we compute soma_joinid values for the full experiment?
+
+Essential ideas:
+
+- The input ``obs`` must have some (user-specified) column which contains values unique across all
+  AnnData/H5AD inputs. Nominally this will be a cell barcode, and nominally this will be different
+  for all input files. If one input has data for 100 cells and another has data for 200 cells,
+  we will expect total input to have data for 300 cells. In particular, as multiple inputs
+  are appended to a SOMA experiment, ``obs`` will grow taller.
+
+- The input ``var`` must also have some (user-specified) column containing string identifiers.
+  Nominally these are Ensembl IDs like ENSG00000142208, or HGNC IDs like AKT1. Nominally
+  these will be the same for all input files, although it's to be expected that one input file
+  may have data for some infrequently expressed genes that don't appear in other input files.
+  In particular, as multiple inputs are appended to a SOMA experiment, ``var`` may gain
+  a few rows here and there.
+
+- Putting the last two together means that ``X`` (which is sparse) will mainly gain new
+  rows (cells) for each input file, perhaps with some columns (genes) that didn't appear for
+  previous input files.
+
+The purpose of the registration mappings is to simply track a mapping from user-specified
+``obs`` and ``var`` ID-column values to soma_joinid values.
+
+Append-mode ingestion has a _registration pass_, which must be sequential and must use all input
+files to compute join-ID mappings, and then an _ingestion pass_, which can be parallelized across
+input files.
+
+There are two kinds of mappings: _ambient mappings_ which contain string-to-int join-ID mappings
+for _all_ inputs, and ID mappings which contain int-to-int offset-to-join-ID mappings for each _single_ input.
+
+Example:
+
+- Input 1 has obs IDs ``["AAAT", "ACTG", "AGAG"]`` numbered 0, 1, 2 within input 1.
+- Input 1 has var IDs ``["AKT1", "APOE", "ESR1", "TP53", "VEGFA"]`` numbered 0, 1, 2, 3, 4 within input 1.
+- Input 2 has obs IDs ``["CAAT", "CCTG", "CGAG"]`` numbered 0, 1, 2 within input 2.
+- Input 2 has var IDs ``["APOE", "EGFR", "TP53", "VEGFA"]`` numbered 0, 1, 2, 3 within input 2.
+
+Then registration produces ambient mappings like this:
+
+- For ``obs``, ``AxisAmbientLabelMapping`` of
+
+    AAAT:0
+    ACTG:1
+    AGAG:2
+    CAAT:3
+    CCTG:4
+    CGAG:5
+
+- For ``var``, ``AxisAmbientLabelMapping`` of
+
+    AKT1:0
+    APOE:1
+    ESR1:2
+    TP53:3
+    VEGFA:4
+    EGFR:5
+
+- ``ExperimentAmbientLabelMapping`` containing these two axes.
+
+This registration data is passed to the ingestor for each input file. Within the ingestion logic
+itself and without user intervention, the ingestor selects out the int-to-int mappings from AnnData 0-up
+offsets to registered SOMA join IDs like this:
+
+- For input 1's ``obs``, ``AxisIDMapping`` of
+
+    0:0 (for AAAT)
+    1:1 (for ACTG)
+    2:2 (for AGAG)
+
+- For input 1's ``var``, ``AxisIDMapping`` of
+
+    0:0 (for AKT1)
+    1:1 (for APOE)
+    2:2 (for ESR1)
+    3:3 (for TP53)
+    4:4 (for VEGFA)
+
+- For input 2's ``obs``, ``AxisIDMapping`` of
+
+    0:3 (for CAAT)
+    1:4 (for CCTG)
+    2:5 (for CGAG)
+
+- For input 2's ``var``, ``AxisIDMapping`` of
+
+    0:1 (for APOE)
+    1:5 (for EGFR)
+    2:3 (for TP53)
+    3:4 (for VEGFA)
+"""
 
 from .ambient_label_mappings import (
     AxisAmbientLabelMapping,

diff --git a/apis/python/src/tiledbsoma/io/registration/ambient_label_mappings.py b/apis/python/src/tiledbsoma/io/registration/ambient_label_mappings.py
@@ -14,7 +14,13 @@
 
 @dataclass
 class AxisAmbientLabelMapping:
-    """TODO: docstring"""
+    """
+    For all the to-be-appended AnnData/H5AD inputs in SOMA multi-file append-mode ingestion, this
+    class tracks the mapping of input-data ``obs`` or ``var`` ID-column values (barcode IDs, gene
+    symbols) to SOMA join IDs for SOMA experiment ``obs`` or ``var``.
+
+    See module-level comments for more information.
+    """
 
     data: Dict[str, int]
     field_name: str
@@ -75,6 +81,12 @@ def fromJSON(cls, s: str) -> Self:
 
 @dataclass
 class ExperimentAmbientLabelMapping:
+    """
+    For all the to-be-appended AnnData/H5AD inputs in SOMA multi-file append-mode ingestion, this
+    class contains an ``AxisAmbientLabelMapping`` for ``obs``, and an ``AxisAmbientLabelMapping``
+    for ``var`` in each measurement.
+    """
+
     obs_axis: AxisAmbientLabelMapping
     var_axes: Dict[str, AxisAmbientLabelMapping]
 

diff --git a/apis/python/src/tiledbsoma/io/registration/id_mappings.py b/apis/python/src/tiledbsoma/io/registration/id_mappings.py
@@ -11,7 +11,13 @@
 
 @dataclass
 class AxisIDMapping:
-    """TODO: docstring"""
+    """
+    For a single to-be-appended AnnData/H5AD input in SOMA multi-file append-mode ingestion, this
+    class tracks the mapping of input-data ``obs`` or ``var`` 0-up offsets to SOMA join ID values
+    for the destination SOMA experiment.
+
+    See module-level comments for more information.
+    """
 
     data: List[int]
 
@@ -23,7 +29,13 @@ def identity(cls, n: int) -> Self:
 
 @dataclass
 class ExperimentIDMapping:
-    """TODO: docstring"""
+    """
+    For a single to-be-appended AnnData/H5AD input in SOMA multi-file append-mode ingestion, this
+    class contains an ``AxisIDMapping`` for ``obs``, and one ``AxisIDMapping`` for
+    ``var`` in each measurement.
+
+    See module-level comments for more information.
+    """
 
     obs_axis: AxisIDMapping
     var_axes: Dict[str, AxisIDMapping]