From 5acaf9e87e057e77b1a80a2cafdf9c067db29fb1 Mon Sep 17 00:00:00 2001
From: Emanuele Bezzi <ebezzi@chanzuckerberg.com>
Date: Tue, 16 Apr 2024 09:20:38 -0700
Subject: [PATCH] [builder] cellxgene_ontology_guide integration (#1094)

---
 tools/cellxgene_census_builder/pyproject.toml |   1 +
 .../build_soma/globals.py                     |   4 -
 .../build_soma/tissue_mapper.py               | 220 ++----------------
 3 files changed, 15 insertions(+), 210 deletions(-)

diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml
index dc436a9d0..4a3af93ba 100644
--- a/tools/cellxgene_census_builder/pyproject.toml
+++ b/tools/cellxgene_census_builder/pyproject.toml
@@ -36,6 +36,7 @@ dependencies= [
     #    https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md
     "tiledbsoma==1.9.3",
     "cellxgene-census==1.12.0",
+    "cellxgene-ontology-guide==0.6.1",
     "scipy==1.12.0",
     "fsspec[http]==2024.3.1",
     "s3fs==2024.3.1",
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
index 6e5d33d14..877f1e545 100644
--- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
+++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -15,10 +15,6 @@
 
 CXG_SCHEMA_VERSION = "5.0.0"  # the CELLxGENE schema version supported
 
-# NOTE: The UBERON ontology URL needs to manually updated if the CXG Dataset Schema is updated. This is a temporary
-# hassle, however, since the TissueMapper, which relies upon this ontology, will eventually be removed from the Builder
-CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2024-01-18/uberon.owl"
-
 # Columns expected in the census_datasets dataframe
 CENSUS_DATASETS_TABLE_SPEC = TableSpec.create(
     [
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py
index 520837b11..1ad625d89 100644
--- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py
+++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py
@@ -1,117 +1,16 @@
-"""NOTE: In the future, this code will be part of an ontology service library."""
+from cellxgene_ontology_guide import curated_ontology_term_lists
+from cellxgene_ontology_guide.entities import CuratedOntologyTermList
+from cellxgene_ontology_guide.ontology_parser import OntologyParser
 
-import owlready2
-
-from .globals import CXG_UBERON_ONTOLOGY_URL
+from .globals import CXG_SCHEMA_VERSION
 
 
 class TissueMapper:
-    # Name of anatomical structure, used to determine the set of ancestors for a given
-    # entity that we"re interested in.
-    ANATOMICAL_STRUCTURE_NAME = "UBERON_0000061"
-
-    # List of high level tissues, ORDER MATTERS. If for a given tissue there are multiple high-level tissues associated
-    # then `self.get_high_level_tissue()` returns the one that appears first in th this list
-    HIGH_LEVEL_TISSUES = [
-        "UBERON_0000178",  # blood
-        "UBERON_0002048",  # lung
-        "UBERON_0002106",  # spleen
-        "UBERON_0002371",  # bone marrow
-        "UBERON_0002107",  # liver
-        "UBERON_0002113",  # kidney
-        "UBERON_0000955",  # brain
-        "UBERON_0002240",  # spinal cord
-        "UBERON_0000310",  # breast
-        "UBERON_0000948",  # heart
-        "UBERON_0002097",  # skin of body
-        "UBERON_0000970",  # eye
-        "UBERON_0001264",  # pancreas
-        "UBERON_0001043",  # esophagus
-        "UBERON_0001155",  # colon
-        "UBERON_0000059",  # large intestine
-        "UBERON_0002108",  # small intestine
-        "UBERON_0000160",  # intestine
-        "UBERON_0000945",  # stomach
-        "UBERON_0001836",  # saliva
-        "UBERON_0001723",  # tongue
-        "UBERON_0001013",  # adipose tissue
-        "UBERON_0000473",  # testis
-        "UBERON_0002367",  # prostate gland
-        "UBERON_0000057",  # urethra
-        "UBERON_0000056",  # ureter
-        "UBERON_0003889",  # fallopian tube
-        "UBERON_0000995",  # uterus
-        "UBERON_0000992",  # ovary
-        "UBERON_0002110",  # gall bladder
-        "UBERON_0001255",  # urinary bladder
-        "UBERON_0018707",  # bladder organ
-        "UBERON_0000922",  # embryo
-        "UBERON_0004023",  # ganglionic eminence --> this a part of the embryo, remove in case generality is desired
-        "UBERON_0001987",  # placenta
-        "UBERON_0007106",  # chorionic villus
-        "UBERON_0002369",  # adrenal gland
-        "UBERON_0002368",  # endocrine gland
-        "UBERON_0002365",  # exocrine gland
-        "UBERON_0000030",  # lamina propria
-        "UBERON_0000029",  # lymph node
-        "UBERON_0004536",  # lymph vasculature
-        "UBERON_0001015",  # musculature
-        "UBERON_0000004",  # nose
-        "UBERON_0003688",  # omentum
-        "UBERON_0000977",  # pleura
-        "UBERON_0002370",  # thymus
-        "UBERON_0002049",  # vasculature
-        "UBERON_0009472",  # axilla
-        "UBERON_0001087",  # pleural fluid
-        "UBERON_0000344",  # mucosa
-        "UBERON_0001434",  # skeletal system
-        "UBERON_0002228",  # rib
-        "UBERON_0003129",  # skull
-        "UBERON_0004537",  # blood vasculature
-        "UBERON_0002405",  # immune system
-        "UBERON_0001009",  # circulatory system
-        "UBERON_0001007",  # digestive system
-        "UBERON_0001017",  # central nervous system
-        "UBERON_0001008",  # renal system
-        "UBERON_0000990",  # reproductive system
-        "UBERON_0001004",  # respiratory system
-        "UBERON_0000010",  # peripheral nervous system
-        "UBERON_0001032",  # sensory system
-        "UBERON_0002046",  # thyroid gland
-        "UBERON_0004535",  # cardiovascular system
-        "UBERON_0000949",  # endocrine system
-        "UBERON_0002330",  # exocrine system
-        "UBERON_0002390",  # hematopoietic system
-        "UBERON_0000383",  # musculature of body
-        "UBERON_0001465",  # knee
-        "UBERON_0001016",  # nervous system
-        "UBERON_0001348",  # brown adipose tissue
-        "UBERON_0015143",  # mesenteric fat pad
-        "UBERON_0000175",  # pleural effusion
-        "UBERON_0001416",  # skin of abdomen
-        "UBERON_0001868",  # skin of chest
-        "UBERON_0001511",  # skin of leg
-        "UBERON_0002190",  # subcutaneous adipose tissue
-        "UBERON_0000014",  # zone of skin
-        "UBERON_0000916",  # abdomen
-    ]
-
-    # Terms to ignore when mapping
-    DENY_LIST = [
-        "BFO_0000004",
-        "CARO_0000000",
-        "CARO_0030000",
-        "CARO_0000003",
-        "NCBITaxon_6072",
-        "Thing",
-        "UBERON_0000465",  # material anatomical entity
-        "UBERON_0001062",  # anatomical entity
-    ]
-
     def __init__(self) -> None:
-        self._cached_tissues: dict[str, str] = {}
-        self._cached_labels: dict[str, str] = {}
-        self._uberon = owlready2.get_ontology(CXG_UBERON_ONTOLOGY_URL).load()
+        self.ontology_parser = OntologyParser(f"v{CXG_SCHEMA_VERSION}")  # built for the supported CXG schema version
+        self.tissues = curated_ontology_term_lists.get_curated_ontology_term_list(
+            CuratedOntologyTermList.TISSUE_GENERAL  # candidate terms passed to get_high_level_terms()
+        )
 
     def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
         """Returns the associated high-level tissue ontology term ID from any other ID.
@@ -123,42 +22,11 @@ def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
             - If the input tissue is not found in the ontology, return the same as input.
                 - This could happen with something like "UBERON:0002048 (cell culture)"
         """
-        tissue_ontology_term_id = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=False)
-
-        if tissue_ontology_term_id in self._cached_tissues:
-            # If we have looked this up already
-            return self._cached_tissues[tissue_ontology_term_id]
-
-        entity = self._get_entity_from_id(tissue_ontology_term_id)
-
-        if not entity:
-            # If not found as an ontology ID return itself
-            result = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=True)
-            self._cached_tissues[tissue_ontology_term_id] = result
-            return result
-
-        # List ancestors for this entity, including itself. Ignore any ancestors that
-        # are not descendents of UBERON_0000061 (anatomical structure).
-        ancestors = [entity.name]
-        branch_ancestors: list[str] = []
-        for is_a in entity.is_a:
-            branch_ancestors = self._list_ancestors(is_a, branch_ancestors)
-
-        # Include this branch of ancestors is under anatomical structure
-        if self.ANATOMICAL_STRUCTURE_NAME in branch_ancestors:
-            ancestors.extend(branch_ancestors)
-
-        # Check if there's at least one top-level entity in the list of ancestors
-        # for this entity
-        selected_tissue = tissue_ontology_term_id
-        for high_level_tissue in self.HIGH_LEVEL_TISSUES:
-            if high_level_tissue in ancestors:
-                selected_tissue = high_level_tissue
-                break
-
-        result = self.reformat_ontology_term_id(selected_tissue, to_writable=True)
-        self._cached_tissues[tissue_ontology_term_id] = result
-        return result
+        try:
+            tissues: list[str] = self.ontology_parser.get_high_level_terms(tissue_ontology_term_id, self.tissues)
+            return tissues[0]  # an empty result raises IndexError, handled below
+        except (IndexError, KeyError, ValueError):
+            return tissue_ontology_term_id  # per the docstring: fall back to the input ID unchanged
 
     def get_label_from_writable_id(self, ontology_term_id: str) -> str:
         """Returns the label from and ontology term id that is in writable form.
@@ -166,17 +34,7 @@ def get_label_from_writable_id(self, ontology_term_id: str) -> str:
         Example: "UBERON:0002048" returns "lung"
         Example: "UBERON_0002048" raises ValueError because the ID is not in writable form
         """
-        if ontology_term_id in self._cached_labels:
-            return self._cached_labels[ontology_term_id]
-
-        entity = self._get_entity_from_id(self.reformat_ontology_term_id(ontology_term_id, to_writable=False))
-        if entity:
-            result: str = entity.label[0]
-        else:
-            result = ontology_term_id
-
-        self._cached_labels[ontology_term_id] = result
-        return result
+        return self.ontology_parser.get_term_label(ontology_term_id)  # type: ignore[no-any-return]
 
     @staticmethod
     def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) -> str:
@@ -193,53 +51,3 @@ def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) -
             if ontology_term_id.count(":") != 1:
                 raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one ':'")
             return ontology_term_id.replace(":", "_")
-
-    def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: list[str] | None = None) -> list[str]:
-        """Recursive function that given an entity of an ontology, it traverses the ontology and returns
-        a list of all ancestors associated with the entity.
-        """
-        if ancestors is None:
-            ancestors = []
-
-        if self._is_restriction(entity):
-            # Entity is a restriction, check for part_of relationship
-
-            prop = entity.property.name
-            if prop != "BFO_0000050":
-                # BFO_0000050 is "part of"
-                return ancestors
-            ancestors.append(entity.value.name.replace("obo.", ""))
-
-            # Check for ancestors of restriction
-            self._list_ancestors(entity.value, ancestors)
-            return ancestors
-
-        elif self._is_entity(entity) and not self._is_and_object(entity):
-            # Entity is a superclass, check for is_a relationships
-
-            if entity.name in self.DENY_LIST:
-                return ancestors
-            ancestors.append(entity.name)
-
-            # Check for ancestors of superclass
-            for super_entity in entity.is_a:
-                self._list_ancestors(super_entity, ancestors)
-            return ancestors
-
-        raise ValueError("Unexpected condition in ontology.")
-
-    def _get_entity_from_id(self, ontology_term_id: str) -> owlready2.entity.ThingClass:
-        """Given a readable ontology term id (e.g. "UBERON_0002048"), it returns the associated ontology entity."""
-        return self._uberon.search_one(iri=f"http://purl.obolibrary.org/obo/{ontology_term_id}")
-
-    @staticmethod
-    def _is_restriction(entity: owlready2.entity.ThingClass) -> bool:
-        return hasattr(entity, "value")
-
-    @staticmethod
-    def _is_entity(entity: owlready2.entity.ThingClass) -> bool:
-        return hasattr(entity, "name")
-
-    @staticmethod
-    def _is_and_object(entity: owlready2.entity.ThingClass) -> bool:
-        return hasattr(entity, "Classes")