From 5acaf9e87e057e77b1a80a2cafdf9c067db29fb1 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Tue, 16 Apr 2024 09:20:38 -0700 Subject: [PATCH] [builder] cellxgene_ontology_guide integration (#1094) --- tools/cellxgene_census_builder/pyproject.toml | 1 + .../build_soma/globals.py | 4 - .../build_soma/tissue_mapper.py | 220 ++---------------- 3 files changed, 15 insertions(+), 210 deletions(-) diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml index dc436a9d0..4a3af93ba 100644 --- a/tools/cellxgene_census_builder/pyproject.toml +++ b/tools/cellxgene_census_builder/pyproject.toml @@ -36,6 +36,7 @@ dependencies= [ # https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md "tiledbsoma==1.9.3", "cellxgene-census==1.12.0", + "cellxgene-ontology-guide==0.6.1", "scipy==1.12.0", "fsspec[http]==2024.3.1", "s3fs==2024.3.1", diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py index 6e5d33d14..877f1e545 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py @@ -15,10 +15,6 @@ CXG_SCHEMA_VERSION = "5.0.0" # the CELLxGENE schema version supported -# NOTE: The UBERON ontology URL needs to manually updated if the CXG Dataset Schema is updated. This is a temporary -# hassle, however, since the TissueMapper, which relies upon this ontology, will eventually be removed from the Builder -CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2024-01-18/uberon.owl" - # Columns expected in the census_datasets dataframe CENSUS_DATASETS_TABLE_SPEC = TableSpec.create( [ diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py index 520837b11..1ad625d89 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py @@ -1,117 +1,16 @@ -"""NOTE: In the future, this code will be part of an ontology service library.""" +from cellxgene_ontology_guide import curated_ontology_term_lists +from cellxgene_ontology_guide.entities import CuratedOntologyTermList +from cellxgene_ontology_guide.ontology_parser import OntologyParser -import owlready2 - -from .globals import CXG_UBERON_ONTOLOGY_URL +from .globals import CXG_SCHEMA_VERSION class TissueMapper: - # Name of anatomical structure, used to determine the set of ancestors for a given - # entity that we"re interested in. - ANATOMICAL_STRUCTURE_NAME = "UBERON_0000061" - - # List of high level tissues, ORDER MATTERS. If for a given tissue there are multiple high-level tissues associated - # then `self.get_high_level_tissue()` returns the one that appears first in th this list - HIGH_LEVEL_TISSUES = [ - "UBERON_0000178", # blood - "UBERON_0002048", # lung - "UBERON_0002106", # spleen - "UBERON_0002371", # bone marrow - "UBERON_0002107", # liver - "UBERON_0002113", # kidney - "UBERON_0000955", # brain - "UBERON_0002240", # spinal cord - "UBERON_0000310", # breast - "UBERON_0000948", # heart - "UBERON_0002097", # skin of body - "UBERON_0000970", # eye - "UBERON_0001264", # pancreas - "UBERON_0001043", # esophagus - "UBERON_0001155", # colon - "UBERON_0000059", # large intestine - "UBERON_0002108", # small intestine - "UBERON_0000160", # intestine - "UBERON_0000945", # stomach - "UBERON_0001836", # saliva - "UBERON_0001723", # tongue - "UBERON_0001013", # adipose tissue - "UBERON_0000473", # testis - "UBERON_0002367", # prostate gland - "UBERON_0000057", # urethra - "UBERON_0000056", # ureter - "UBERON_0003889", # fallopian tube - "UBERON_0000995", # uterus - "UBERON_0000992", # ovary - "UBERON_0002110", # gall bladder - "UBERON_0001255", # urinary bladder - "UBERON_0018707", # bladder organ - "UBERON_0000922", # embryo - "UBERON_0004023", # ganglionic eminence --> this a part of the embryo, remove in case generality is desired - "UBERON_0001987", # placenta - "UBERON_0007106", # chorionic villus - "UBERON_0002369", # adrenal gland - "UBERON_0002368", # endocrine gland - "UBERON_0002365", # exocrine gland - "UBERON_0000030", # lamina propria - "UBERON_0000029", # lymph node - "UBERON_0004536", # lymph vasculature - "UBERON_0001015", # musculature - "UBERON_0000004", # nose - "UBERON_0003688", # omentum - "UBERON_0000977", # pleura - "UBERON_0002370", # thymus - "UBERON_0002049", # vasculature - "UBERON_0009472", # axilla - "UBERON_0001087", # pleural fluid - "UBERON_0000344", # mucosa - "UBERON_0001434", # skeletal system - "UBERON_0002228", # rib - "UBERON_0003129", # skull - "UBERON_0004537", # blood vasculature - "UBERON_0002405", # immune system - "UBERON_0001009", # circulatory system - "UBERON_0001007", # digestive system - "UBERON_0001017", # central nervous system - "UBERON_0001008", # renal system - "UBERON_0000990", # reproductive system - "UBERON_0001004", # respiratory system - "UBERON_0000010", # peripheral nervous system - "UBERON_0001032", # sensory system - "UBERON_0002046", # thyroid gland - "UBERON_0004535", # cardiovascular system - "UBERON_0000949", # endocrine system - "UBERON_0002330", # exocrine system - "UBERON_0002390", # hematopoietic system - "UBERON_0000383", # musculature of body - "UBERON_0001465", # knee - "UBERON_0001016", # nervous system - "UBERON_0001348", # brown adipose tissue - "UBERON_0015143", # mesenteric fat pad - "UBERON_0000175", # pleural effusion - "UBERON_0001416", # skin of abdomen - "UBERON_0001868", # skin of chest - "UBERON_0001511", # skin of leg - "UBERON_0002190", # subcutaneous adipose tissue - "UBERON_0000014", # zone of skin - "UBERON_0000916", # abdomen - ] - - # Terms to ignore when mapping - DENY_LIST = [ - "BFO_0000004", - "CARO_0000000", - "CARO_0030000", - "CARO_0000003", - "NCBITaxon_6072", - "Thing", - "UBERON_0000465", # material anatomical entity - "UBERON_0001062", # anatomical entity - ] - def __init__(self) -> None: - self._cached_tissues: dict[str, str] = {} - self._cached_labels: dict[str, str] = {} - self._uberon = owlready2.get_ontology(CXG_UBERON_ONTOLOGY_URL).load() + self.ontology_parser = OntologyParser(f"v{CXG_SCHEMA_VERSION}") + self.tissues = curated_ontology_term_lists.get_curated_ontology_term_list( + CuratedOntologyTermList.TISSUE_GENERAL + ) def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str: """Returns the associated high-level tissue ontology term ID from any other ID. @@ -123,42 +22,11 @@ def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str: - If the input tissue is not found in the ontology, return the same as input. - This could happen with something like "UBERON:0002048 (cell culture)" """ - tissue_ontology_term_id = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=False) - - if tissue_ontology_term_id in self._cached_tissues: - # If we have looked this up already - return self._cached_tissues[tissue_ontology_term_id] - - entity = self._get_entity_from_id(tissue_ontology_term_id) - - if not entity: - # If not found as an ontology ID return itself - result = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=True) - self._cached_tissues[tissue_ontology_term_id] = result - return result - - # List ancestors for this entity, including itself. Ignore any ancestors that - # are not descendents of UBERON_0000061 (anatomical structure). - ancestors = [entity.name] - branch_ancestors: list[str] = [] - for is_a in entity.is_a: - branch_ancestors = self._list_ancestors(is_a, branch_ancestors) - - # Include this branch of ancestors is under anatomical structure - if self.ANATOMICAL_STRUCTURE_NAME in branch_ancestors: - ancestors.extend(branch_ancestors) - - # Check if there's at least one top-level entity in the list of ancestors - # for this entity - selected_tissue = tissue_ontology_term_id - for high_level_tissue in self.HIGH_LEVEL_TISSUES: - if high_level_tissue in ancestors: - selected_tissue = high_level_tissue - break - - result = self.reformat_ontology_term_id(selected_tissue, to_writable=True) - self._cached_tissues[tissue_ontology_term_id] = result - return result + try: + tissues: list[str] = self.ontology_parser.get_high_level_terms(tissue_ontology_term_id, self.tissues) + return tissues[0] + except (IndexError, KeyError, ValueError): + return tissue_ontology_term_id def get_label_from_writable_id(self, ontology_term_id: str) -> str: """Returns the label from and ontology term id that is in writable form. @@ -166,17 +34,7 @@ def get_label_from_writable_id(self, ontology_term_id: str) -> str: Example: "UBERON:0002048" returns "lung" Example: "UBERON_0002048" raises ValueError because the ID is not in writable form """ - if ontology_term_id in self._cached_labels: - return self._cached_labels[ontology_term_id] - - entity = self._get_entity_from_id(self.reformat_ontology_term_id(ontology_term_id, to_writable=False)) - if entity: - result: str = entity.label[0] - else: - result = ontology_term_id - - self._cached_labels[ontology_term_id] = result - return result + return self.ontology_parser.get_term_label(ontology_term_id) # type: ignore[no-any-return] @staticmethod def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) -> str: @@ -193,53 +51,3 @@ def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) - if ontology_term_id.count(":") != 1: raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one ':'") return ontology_term_id.replace(":", "_") - - def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: list[str] | None = None) -> list[str]: - """Recursive function that given an entity of an ontology, it traverses the ontology and returns - a list of all ancestors associated with the entity. - """ - if ancestors is None: - ancestors = [] - - if self._is_restriction(entity): - # Entity is a restriction, check for part_of relationship - - prop = entity.property.name - if prop != "BFO_0000050": - # BFO_0000050 is "part of" - return ancestors - ancestors.append(entity.value.name.replace("obo.", "")) - - # Check for ancestors of restriction - self._list_ancestors(entity.value, ancestors) - return ancestors - - elif self._is_entity(entity) and not self._is_and_object(entity): - # Entity is a superclass, check for is_a relationships - - if entity.name in self.DENY_LIST: - return ancestors - ancestors.append(entity.name) - - # Check for ancestors of superclass - for super_entity in entity.is_a: - self._list_ancestors(super_entity, ancestors) - return ancestors - - raise ValueError("Unexpected condition in ontology.") - - def _get_entity_from_id(self, ontology_term_id: str) -> owlready2.entity.ThingClass: - """Given a readable ontology term id (e.g. "UBERON_0002048"), it returns the associated ontology entity.""" - return self._uberon.search_one(iri=f"http://purl.obolibrary.org/obo/{ontology_term_id}") - - @staticmethod - def _is_restriction(entity: owlready2.entity.ThingClass) -> bool: - return hasattr(entity, "value") - - @staticmethod - def _is_entity(entity: owlready2.entity.ThingClass) -> bool: - return hasattr(entity, "name") - - @staticmethod - def _is_and_object(entity: owlready2.entity.ThingClass) -> bool: - return hasattr(entity, "Classes")