Skip to content

Commit

Permalink
[builder] cellxgene_ontology_guide integration (#1094)
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzi authored Apr 16, 2024
1 parent 873ba78 commit 5acaf9e
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 210 deletions.
1 change: 1 addition & 0 deletions tools/cellxgene_census_builder/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies= [
# https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md
"tiledbsoma==1.9.3",
"cellxgene-census==1.12.0",
"cellxgene-ontology-guide==0.6.1",
"scipy==1.12.0",
"fsspec[http]==2024.3.1",
"s3fs==2024.3.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@

CXG_SCHEMA_VERSION = "5.0.0" # the CELLxGENE schema version supported

# NOTE: The UBERON ontology URL needs to be manually updated if the CXG Dataset Schema is updated. This is a temporary
# hassle, however, since the TissueMapper, which relies upon this ontology, will eventually be removed from the Builder
CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2024-01-18/uberon.owl"

# Columns expected in the census_datasets dataframe
CENSUS_DATASETS_TABLE_SPEC = TableSpec.create(
[
Expand Down
Original file line number Diff line number Diff line change
@@ -1,117 +1,16 @@
"""NOTE: In the future, this code will be part of an ontology service library."""
from cellxgene_ontology_guide import curated_ontology_term_lists
from cellxgene_ontology_guide.entities import CuratedOntologyTermList
from cellxgene_ontology_guide.ontology_parser import OntologyParser

import owlready2

from .globals import CXG_UBERON_ONTOLOGY_URL
from .globals import CXG_SCHEMA_VERSION


class TissueMapper:
# Name of anatomical structure, used to determine the set of ancestors for a given
# entity that we're interested in.
ANATOMICAL_STRUCTURE_NAME = "UBERON_0000061"

# List of high level tissues, ORDER MATTERS. If for a given tissue there are multiple high-level tissues associated
# then `self.get_high_level_tissue()` returns the one that appears first in this list
HIGH_LEVEL_TISSUES = [
"UBERON_0000178", # blood
"UBERON_0002048", # lung
"UBERON_0002106", # spleen
"UBERON_0002371", # bone marrow
"UBERON_0002107", # liver
"UBERON_0002113", # kidney
"UBERON_0000955", # brain
"UBERON_0002240", # spinal cord
"UBERON_0000310", # breast
"UBERON_0000948", # heart
"UBERON_0002097", # skin of body
"UBERON_0000970", # eye
"UBERON_0001264", # pancreas
"UBERON_0001043", # esophagus
"UBERON_0001155", # colon
"UBERON_0000059", # large intestine
"UBERON_0002108", # small intestine
"UBERON_0000160", # intestine
"UBERON_0000945", # stomach
"UBERON_0001836", # saliva
"UBERON_0001723", # tongue
"UBERON_0001013", # adipose tissue
"UBERON_0000473", # testis
"UBERON_0002367", # prostate gland
"UBERON_0000057", # urethra
"UBERON_0000056", # ureter
"UBERON_0003889", # fallopian tube
"UBERON_0000995", # uterus
"UBERON_0000992", # ovary
"UBERON_0002110", # gall bladder
"UBERON_0001255", # urinary bladder
"UBERON_0018707", # bladder organ
"UBERON_0000922", # embryo
"UBERON_0004023", # ganglionic eminence --> this is a part of the embryo, remove in case generality is desired
"UBERON_0001987", # placenta
"UBERON_0007106", # chorionic villus
"UBERON_0002369", # adrenal gland
"UBERON_0002368", # endocrine gland
"UBERON_0002365", # exocrine gland
"UBERON_0000030", # lamina propria
"UBERON_0000029", # lymph node
"UBERON_0004536", # lymph vasculature
"UBERON_0001015", # musculature
"UBERON_0000004", # nose
"UBERON_0003688", # omentum
"UBERON_0000977", # pleura
"UBERON_0002370", # thymus
"UBERON_0002049", # vasculature
"UBERON_0009472", # axilla
"UBERON_0001087", # pleural fluid
"UBERON_0000344", # mucosa
"UBERON_0001434", # skeletal system
"UBERON_0002228", # rib
"UBERON_0003129", # skull
"UBERON_0004537", # blood vasculature
"UBERON_0002405", # immune system
"UBERON_0001009", # circulatory system
"UBERON_0001007", # digestive system
"UBERON_0001017", # central nervous system
"UBERON_0001008", # renal system
"UBERON_0000990", # reproductive system
"UBERON_0001004", # respiratory system
"UBERON_0000010", # peripheral nervous system
"UBERON_0001032", # sensory system
"UBERON_0002046", # thyroid gland
"UBERON_0004535", # cardiovascular system
"UBERON_0000949", # endocrine system
"UBERON_0002330", # exocrine system
"UBERON_0002390", # hematopoietic system
"UBERON_0000383", # musculature of body
"UBERON_0001465", # knee
"UBERON_0001016", # nervous system
"UBERON_0001348", # brown adipose tissue
"UBERON_0015143", # mesenteric fat pad
"UBERON_0000175", # pleural effusion
"UBERON_0001416", # skin of abdomen
"UBERON_0001868", # skin of chest
"UBERON_0001511", # skin of leg
"UBERON_0002190", # subcutaneous adipose tissue
"UBERON_0000014", # zone of skin
"UBERON_0000916", # abdomen
]

# Terms to ignore when mapping
DENY_LIST = [
"BFO_0000004",
"CARO_0000000",
"CARO_0030000",
"CARO_0000003",
"NCBITaxon_6072",
"Thing",
"UBERON_0000465", # material anatomical entity
"UBERON_0001062", # anatomical entity
]

def __init__(self) -> None:
    """Set up the lookup caches and ontology handles used by the mapper."""
    # Memoized lookups, keyed by ontology term id.
    self._cached_tissues: dict[str, str] = {}
    self._cached_labels: dict[str, str] = {}
    # Load the UBERON ontology from CXG_UBERON_ONTOLOGY_URL through owlready2.
    self._uberon = owlready2.get_ontology(CXG_UBERON_ONTOLOGY_URL).load()
    # Parser selected by the supported CELLxGENE schema version (e.g. "v5.0.0").
    self.ontology_parser = OntologyParser(f"v{CXG_SCHEMA_VERSION}")
    # Curated list of general tissue terms; passed to the parser when resolving high-level tissues.
    self.tissues = curated_ontology_term_lists.get_curated_ontology_term_list(
        CuratedOntologyTermList.TISSUE_GENERAL
    )

def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
"""Returns the associated high-level tissue ontology term ID from any other ID.
Expand All @@ -123,60 +22,19 @@ def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
- If the input tissue is not found in the ontology, return the same as input.
- This could happen with something like "UBERON:0002048 (cell culture)"
"""
tissue_ontology_term_id = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=False)

if tissue_ontology_term_id in self._cached_tissues:
# If we have looked this up already
return self._cached_tissues[tissue_ontology_term_id]

entity = self._get_entity_from_id(tissue_ontology_term_id)

if not entity:
# If not found as an ontology ID return itself
result = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=True)
self._cached_tissues[tissue_ontology_term_id] = result
return result

# List ancestors for this entity, including itself. Ignore any ancestors that
# are not descendents of UBERON_0000061 (anatomical structure).
ancestors = [entity.name]
branch_ancestors: list[str] = []
for is_a in entity.is_a:
branch_ancestors = self._list_ancestors(is_a, branch_ancestors)

# Include this branch of ancestors if it is under anatomical structure
if self.ANATOMICAL_STRUCTURE_NAME in branch_ancestors:
ancestors.extend(branch_ancestors)

# Check if there's at least one top-level entity in the list of ancestors
# for this entity
selected_tissue = tissue_ontology_term_id
for high_level_tissue in self.HIGH_LEVEL_TISSUES:
if high_level_tissue in ancestors:
selected_tissue = high_level_tissue
break

result = self.reformat_ontology_term_id(selected_tissue, to_writable=True)
self._cached_tissues[tissue_ontology_term_id] = result
return result
try:
tissues: list[str] = self.ontology_parser.get_high_level_terms(tissue_ontology_term_id, self.tissues)
return tissues[0]
except (IndexError, KeyError, ValueError):
return tissue_ontology_term_id

def get_label_from_writable_id(self, ontology_term_id: str) -> str:
    """Returns the label from an ontology term id that is in writable form.
    Example: "UBERON:0002048" returns "lung"
    Example: "UBERON_0002048" raises ValueError because the ID is not in writable form
    """
    # Serve repeated lookups from the per-instance label cache.
    if ontology_term_id in self._cached_labels:
        return self._cached_labels[ontology_term_id]

    # Convert the id to its readable form before searching the loaded ontology.
    entity = self._get_entity_from_id(self.reformat_ontology_term_id(ontology_term_id, to_writable=False))
    if entity:
        result: str = entity.label[0]
    else:
        # No matching entity: fall back to returning the id itself as the label.
        result = ontology_term_id

    self._cached_labels[ontology_term_id] = result
    return result
    # NOTE(review): as laid out here the statement below follows an unconditional return and is
    # never reached -- confirm which of the two implementations is intended.
    return self.ontology_parser.get_term_label(ontology_term_id) # type: ignore[no-any-return]

@staticmethod
def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) -> str:
Expand All @@ -193,53 +51,3 @@ def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) -
if ontology_term_id.count(":") != 1:
raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one ':'")
return ontology_term_id.replace(":", "_")

def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: list[str] | None = None) -> list[str]:
    """Recursively gather the names of the ancestors of ``entity``.

    Names are appended to ``ancestors`` in place (a new list is created when it
    is ``None``) and that same list is returned.

    - An entity with a ``value`` attribute is handled as a restriction: it is
      followed only when its property is "part of" (BFO_0000050), in which case
      the name of its value is recorded and traversal continues from that value.
    - A named entity without a ``Classes`` attribute is handled as a superclass:
      unless its name is in ``DENY_LIST``, the name is recorded and traversal
      continues through every item of its ``is_a``.

    Raises:
        ValueError: if ``entity`` matches neither of the shapes above.
    """
    collected = [] if ancestors is None else ancestors

    if self._is_restriction(entity):
        # Only "part of" (BFO_0000050) relationships contribute ancestors.
        if entity.property.name == "BFO_0000050":
            collected.append(entity.value.name.replace("obo.", ""))
            # Keep climbing from the target of the restriction.
            self._list_ancestors(entity.value, collected)
        return collected

    if not self._is_entity(entity) or self._is_and_object(entity):
        raise ValueError("Unexpected condition in ontology.")

    # Denied terms are neither recorded nor traversed further.
    if entity.name not in self.DENY_LIST:
        collected.append(entity.name)
        for parent in entity.is_a:
            self._list_ancestors(parent, collected)
    return collected

def _get_entity_from_id(self, ontology_term_id: str) -> owlready2.entity.ThingClass:
    """Look up the ontology entity for a readable term id such as "UBERON_0002048".

    The id is appended to the OBO PURL prefix and the loaded UBERON ontology is
    searched for that IRI; the search result is returned unchanged.
    """
    iri = f"http://purl.obolibrary.org/obo/{ontology_term_id}"
    return self._uberon.search_one(iri=iri)

@staticmethod
def _is_restriction(entity: owlready2.entity.ThingClass) -> bool:
    """Tell whether ``entity`` has a ``value`` attribute, the marker used here for a restriction."""
    has_value = hasattr(entity, "value")
    return has_value

@staticmethod
def _is_entity(entity: owlready2.entity.ThingClass) -> bool:
    """Tell whether ``entity`` has a ``name`` attribute, the marker used here for a named entity."""
    has_name = hasattr(entity, "name")
    return has_name

@staticmethod
def _is_and_object(entity: owlready2.entity.ThingClass) -> bool:
    """Tell whether ``entity`` has a ``Classes`` attribute.

    NOTE(review): presumably this identifies an owlready2 "and" (intersection)
    construct -- confirm against the owlready2 documentation.
    """
    has_classes = hasattr(entity, "Classes")
    return has_classes

0 comments on commit 5acaf9e

Please sign in to comment.