From 21130dcef2deb8fba2d40d0af604d1aa3c5eb4cc Mon Sep 17 00:00:00 2001
From: Joe Flack
Date: Wed, 3 Aug 2022 16:10:53 -0400
Subject: [PATCH] Slurp pipeline

-

---
 src/ontology/mondo-ingest.Makefile |  23 ++++---
 src/scripts/migrate.py             | 104 +++++++++++++++++------------
 2 files changed, 76 insertions(+), 51 deletions(-)

diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile
index d6a716a8..b91f6f13 100644
--- a/src/ontology/mondo-ingest.Makefile
+++ b/src/ontology/mondo-ingest.Makefile
@@ -281,16 +281,23 @@ lexical_matches: mappings/mondo-sources-all-lexical.sssom.tsv
 slurp/:
 	mkdir -p $@
 
-# Feel free to change the signature. Min ID is the next available Mondo ID.
-slurp/%.tsv: components/%.owl tmp/mondo.sssom.tsv reports/mirror-signature-mondo.tsv | slurp/
+# TODO: Move this to Makefile. how, when I can't edit it here?
+.PHONY: component-download-mondo.owl
+component-download-mondo.owl: | $(TMPDIR)
+	if [ $(MIR) = true ] && [ $(COMP) = true ]; then $(ROBOT) merge -I https://github.com/monarch-initiative/omim/releases/latest/download/omim.ttl \
+	annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) -o $(TMPDIR)/$@.owl; fi
+
+# min-id: the next available Mondo ID
+slurp/%.tsv: components/%.owl tmp/mondo.sssom.tsv reports/mirror_signature-mondo.tsv | slurp/
 	python $(SCRIPTSDIR)/migrate.py \
-		-i $< \
-		--mapping-file tmp/mondo.sssom.tsv \
+		--ontology-path $< \
+		--sssom-map-path tmp/mondo.sssom.tsv \
 		--min-id 123000 \
-		--mondo-terms reports/mirror-signature-mondo.tsv \
-		--output $@
+		--mondo-terms-path reports/mirror_signature-mondo.tsv \
+		--outpath $@
 
 slurp-%: slurp/%.tsv
 
-# TODO: add more ontologies, e.g.: doid, icd10cm, icd10who, ncit, ordo
-slurp: slurp-omim
+# TODO: change to all ontologies when ready
+# slurp-all: slurp-omim slurp-doid slurp-ncit slurp-ordo slurp-icd10cm slurp-icd10who
+slurp-all: slurp-omim
diff --git a/src/scripts/migrate.py b/src/scripts/migrate.py
index 97368979..5cd653b9 100644
--- a/src/scripts/migrate.py
+++ b/src/scripts/migrate.py
@@ -1,54 +1,72 @@
-"""Migration pipeline
+"""Slurp migration pipeline
 
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-
-TODOs:
+TODO's:
 - add CLI: look to makefile for what to include
 """
-import oakliblib
-import pandas
-
+import os
+from argparse import ArgumentParser
+from typing import Dict
 
-#Inputs:
-source_ontology = '' #e.g. omim
-sssom_map = '' # e.g. mondo.sssom.tsv
-min_id = ''
-termlist_mondo = ''
+import oaklib
+import pandas as pd
 
 
-def run(source_ontology = '', sssom_map = '', min_id = '', termlist_mondo = ''):
-    """source_ontology = '' #e.g. omim
-    sssom_map = '' # e.g. mondo.sssom.tsv
-    min_id = ''
-    termlist_mondo = ''"""
-    #Outputs:
+def run(ontology_path: str, sssom_map_path: str, min_id: int, mondo_terms_path: str, outpath: str):
+    """Load slurp inputs. TODO: parse the ontology and port the commented-out migration logic below."""
+    source_ontology = ontology_path
+    for path in (ontology_path, sssom_map_path, mondo_terms_path):
+        if not os.path.exists(path):
+            raise FileNotFoundError(f'Input file not found: {path}')
+    sssom_map = pd.read_csv(sssom_map_path, comment='#', sep='\t')
+    termlist_mondo = pd.read_csv(mondo_terms_path, comment='#', sep='\t')
     data = []
+    # for t in source_ontology:
+    #     if t not in sssom_map['object_id']:
+    #         parents = []
+    #         migrate = True
+    #         for p in oaklib.get_direct_parents(t):
+    #             if p not in sssom_map['object_id']:
+    #                 migrate = False
+    #                 break
+    #             elif sssom_map[sssom_map['object_id']==p]['predicate_id'] == 'skos:exactMatch' \
+    #                 or sssom_map[sssom_map['object_id']==p]['predicate_id'] == 'skos:narrowMatch':
+    #                 # In other words, if the parent is mapped, and the mapping is either exact or narrower
+    #                 parents.append(sssom_map[sssom_map['object_id']==p]['subject_id'])
+    #             else:
+    #                 # Its fine, just continue looking for other parents in this case
+    #         if migrate and parents:
+    #             next_mondo_id = determine_next_available_mondo_id(min_id, termlist_mondo) # starting from min_id, then counting up and checking if it does not already exist.
+    #             label = oaklib.get_label(t)
+    #             definition = oaklib.get_definition(t)
+    #             data.append({'mondo_id':next_mondo_id, 'xref': t, 'label': label, 'definition': definition})
+    #
+    # pd.DataFrame(data).to_csv(outpath, sep="\t")
+    pass
+
 
-    for t in source_ontology:
-        if t not in sssom_map['object_id']:
-            parents = []
-            migrate = True
-            for p in oaklib.get_direct_parents(t):
-                if p not in sssom_map['object_id']:
-                    migrate = False
-                    break
-                elif sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:exactMatch' \
-                    or sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:narrowMatch':
-                    # In other words, if the parent is mapped, and the mapping is either exact or narrower
-                    parents.append(sssom_map[sssom_map['object_id']==p]['subject_id'])
-                else:
-                    # Its fine, just continue looking for other parents in this case
-            if migrate and parents:
-                next_mondo_id = determine_next_available_mondo_id(min_id, termlist_mondo) # satrting from min_id, then counting up and checking if it does not already exist.
-                label = oaklib.get_label(t)
-                definition = oaklib.get_definition(t)
-                data.append({'mondo_id':next_mondo_id, 'xref': t, 'label': label, 'definition': definition})
-
-    pandas.DataFrame(data).to_csv(fn, sep="\t")
+def cli():
+    """Command line interface."""
+    package_description = \
+        'Slurp pipeline: Integrate new terms from other ontologies into Mondo.'
+    parser = ArgumentParser(description=package_description)
+    parser.add_argument(
+        '-o', '--ontology-path', required=True,
+        help='Path to the source ontology file to slurp terms from, e.g. components/omim.owl.')
+    parser.add_argument(
+        '-m', '--sssom-map-path', required=True,
+        help='Path to the SSSOM mappings TSV, e.g. tmp/mondo.sssom.tsv.')
+    parser.add_argument(
+        '-i', '--min-id', required=True, type=int,
+        help='Lowest numeric Mondo ID that may be assigned; the next available ID is searched from here.')
+    parser.add_argument(
+        '-t', '--mondo-terms-path', required=True,
+        help='Path to the TSV listing existing Mondo terms, e.g. reports/mirror_signature-mondo.tsv.')
+    parser.add_argument(
+        '-O', '--outpath', required=True,
+        help='Path of the output TSV to write, e.g. slurp/omim.tsv.')
+    d: Dict = vars(parser.parse_args())
+    return run(**d)
 
 
 if __name__ == '__main__':
-    run()
+    cli()