diff --git a/apiv2/db_import/importers/alignment.py b/apiv2/db_import/importers/alignment.py
index 6c6423613..d975ed334 100644
--- a/apiv2/db_import/importers/alignment.py
+++ b/apiv2/db_import/importers/alignment.py
@@ -46,9 +46,12 @@ class AlignmentItem(ItemDBImporter):
     model_class = models.Alignment
 
     def load_computed_fields(self):
+        if self.model_args["alignment_method"] == "undefined":
+            self.model_args["alignment_method"] = None
         self.model_args["s3_alignment_metadata"] = self.get_s3_url(self.input_data["file"])
         self.model_args["https_alignment_metadata"] = self.get_https_url(self.input_data["file"])
-        self.model_args["tiltseries_id"] = self.config.get_tiltseries_by_path(self.input_data["tiltseries_path"])
+        if self.input_data.get("tiltseries_path"):
+            self.model_args["tiltseries_id"] = self.config.get_tiltseries_by_path(self.input_data["tiltseries_path"])
         self.model_args["run_id"] = self.input_data["run"].id
diff --git a/apiv2/db_import/importers/tomogram.py b/apiv2/db_import/importers/tomogram.py
index 0f8493d0d..fba9d2d05 100644
--- a/apiv2/db_import/importers/tomogram.py
+++ b/apiv2/db_import/importers/tomogram.py
@@ -32,7 +32,7 @@ class TomogramItem(ItemDBImporter):
     }
 
     def normalize_to_unknown_str(self, value: str) -> str:
-        return value if value else "Unknown"
+        return value.replace(" ", "_") if value else "Unknown"
 
     def generate_neuroglancer_data(self, path) -> str:
         config = self.config.load_key_json(path, is_file_required=False)
@@ -55,7 +55,9 @@ def load_computed_fields(self):
             "s3_mrc_file": self.get_s3_url(self.input_data["mrc_file"]),
             "https_mrc_file": self.get_https_url(self.input_data["mrc_file"]),
             # TODO: Add alignment_id once we have an alignment importer.
-            "alignment_id": self.config.get_alignment_by_path(self.get_s3_url(self.input_data["alignment_metadata_path"])),
+            "alignment_id": self.config.get_alignment_by_path(
+                self.get_s3_url(self.input_data["alignment_metadata_path"])
+            ),
             "key_photo_url": None,
             "key_photo_thumbnail_url": None,
             "is_portal_standard": self.input_data.get("is_standardized") or False,
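
The normalize_to_unknown_str change above means non-empty values now have their spaces replaced with underscores, while falsy values still collapse to the literal "Unknown". A standalone sketch of that behavior (the example inputs below are hypothetical, not taken from portal data):

# Standalone mirror of TomogramItem.normalize_to_unknown_str, for illustration only.
def normalize_to_unknown_str(value: str) -> str:
    # Falsy values (None, "") normalize to the literal "Unknown";
    # otherwise spaces become underscores.
    return value.replace(" ", "_") if value else "Unknown"

assert normalize_to_unknown_str("") == "Unknown"
assert normalize_to_unknown_str(None) == "Unknown"
assert normalize_to_unknown_str("Weighted back projection") == "Weighted_back_projection"
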
diff --git a/apiv2/scripts/delete_dataset.py b/apiv2/scripts/delete_dataset.py
new file mode 100644
index 000000000..189bd084e
--- /dev/null
+++ b/apiv2/scripts/delete_dataset.py
@@ -0,0 +1,34 @@
+# Deletes a dataset from a v2 database.
+import os
+
+import click
+import sqlalchemy as sa
+from database import models
+
+from platformics.database.connect import init_sync_db
+
+
+@click.command()
+@click.argument("dataset_id", required=True, type=int)
+@click.option("--i-am-super-sure", help="must be exactly 'yes' to confirm deletion", type=str)
+@click.option("--db-uri", help="Database URI", type=str)
+def delete_dataset(dataset_id: int, i_am_super_sure: str, db_uri: str):
+    if i_am_super_sure != "yes":
+        print("You must specify '--i-am-super-sure yes' to delete a dataset")
+        exit(1)
+    if not db_uri:
+        db_uri = (
+            f"postgresql+psycopg://{os.environ['PLATFORMICS_DATABASE_USER']}:{os.environ['PLATFORMICS_DATABASE_PASSWORD']}@{os.environ['PLATFORMICS_DATABASE_HOST']}:{os.environ['PLATFORMICS_DATABASE_PORT']}/{os.environ['PLATFORMICS_DATABASE_NAME']}"
+        )
+
+    print(f"DELETING DATASET {dataset_id}")
+    db = init_sync_db(db_uri)
+    with db.session() as session:
+        obj = session.scalars(sa.select(models.Dataset).where(models.Dataset.id == dataset_id)).one()
+        session.delete(obj)
+        session.flush()
+        session.commit()
+
+
+if __name__ == "__main__":
+    delete_dataset()
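
Because this script is destructive, the safety check is worth a smoke test. A sketch using click's test runner that exercises only the refusal path, so no database or environment variables are needed; the import path is an assumption about how apiv2/scripts sits on sys.path:

# Sketch: verify delete_dataset refuses to run without --i-am-super-sure yes.
from click.testing import CliRunner

from scripts.delete_dataset import delete_dataset  # assumed import path

runner = CliRunner()
result = runner.invoke(delete_dataset, ["12345"])  # 12345 is a made-up dataset id
assert result.exit_code == 1
assert "--i-am-super-sure" in result.output
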
diff --git a/apiv2/scripts/scrape.py b/apiv2/scripts/scrape.py
index d0e58c29c..d343acd8c 100644
--- a/apiv2/scripts/scrape.py
+++ b/apiv2/scripts/scrape.py
@@ -15,9 +15,6 @@
 
 # cleanup:
 # delete from deposition_type; delete from annotation_author; delete from annotation_method_link; delete from dataset_author; delete from deposition_author; delete from annotation_file ; delete from annotation_shape ; delete from dataset_funding; delete from tomogram_author; delete from tomogram; delete from annotation; delete from annotation; delete from tomogram_voxel_spacing; delete from per_section_alignment_parameters; delete from alignment; delete from frame; delete from tiltseries; delete from gain_file; delete from frame; delete from run; delete from run; delete from dataset; delete from deposition;
 
-# CLIENT_URL="https://graphql-cryoet-api.cryoet.staging.si.czi.technology/v1/graphql"
-CLIENT_URL = "https://graphql.cryoetdataportal.cziscience.com/v1/graphql"
-
 # Adapted from https://github.com/sqlalchemy/sqlalchemy/wiki/UniqueObject
 def get_or_create(session, cls, row_id, filters, data):
@@ -307,7 +304,6 @@ def add(session, model, item, parents):
             "reconstruction_software": remote_item["reconstruction_software"],
             "is_author_submitted": remote_item["is_canonical"],
             "is_portal_standard": False,
-            "is_standardized": False,
             "is_visualization_default": remote_item["is_canonical"],
             "s3_omezarr_dir": remote_item["s3_omezarr_dir"],
             "https_omezarr_dir": remote_item["https_omezarr_dir"],
@@ -356,11 +352,9 @@ def add(session, model, item, parents):
     return item
 
 
-def import_deposition(deposition_id: int):
-    db = init_sync_db(
-        f"postgresql+psycopg://{os.environ['PLATFORMICS_DATABASE_USER']}:{os.environ['PLATFORMICS_DATABASE_PASSWORD']}@{os.environ['PLATFORMICS_DATABASE_HOST']}:{os.environ['PLATFORMICS_DATABASE_PORT']}/{os.environ['PLATFORMICS_DATABASE_NAME']}",
-    )
-    client = cdp.Client(CLIENT_URL)
+def db_import_deposition(client_url, db_uri, deposition_id: int):
+    db = init_sync_db(db_uri)
+    client = cdp.Client(client_url)
     dep = cdp.Deposition.get_by_id(client, deposition_id)
     with db.session() as session:
         print(f"processing {dep.id}")
@@ -375,7 +369,7 @@ def import_deposition(deposition_id: int):
 
 
 # Method links isn't exposed by the v2 client for some reason
-def fetch_method_links(annotation):
+def fetch_method_links(client_url, annotation):
     headers = {
         "Content-type": "application/json",
     }
@@ -390,7 +384,7 @@ def fetch_method_links(annotation):
         % annotation.id
     )
     payload = json.dumps({"query": query, "variables": None, "operationName": "MyQuery"})
-    res = requests.post(CLIENT_URL, headers=headers, data=payload)
+    res = requests.post(client_url, headers=headers, data=payload)
     data = res.json()
     anno = data["data"]["annotations"][0]
     if anno and anno.get("method_links"):
@@ -398,11 +392,9 @@
     return []
 
 
-def import_dataset(dataset_id: int):
-    db = init_sync_db(
-        f"postgresql+psycopg://{os.environ['PLATFORMICS_DATABASE_USER']}:{os.environ['PLATFORMICS_DATABASE_PASSWORD']}@{os.environ['PLATFORMICS_DATABASE_HOST']}:{os.environ['PLATFORMICS_DATABASE_PORT']}/{os.environ['PLATFORMICS_DATABASE_NAME']}",
-    )
-    client = cdp.Client(CLIENT_URL)
+def db_import_dataset(client_url, db_uri, dataset_id: int):
+    db = init_sync_db(db_uri)
+    client = cdp.Client(client_url)
     dataset = cdp.Dataset.get_by_id(client, dataset_id)
     with db.session() as session:
         print(f"processing {dataset_id}")
@@ -432,7 +424,7 @@ def import_dataset(dataset_id: int):
         for anno in cdp.Annotation.find(client, [cdp.Annotation.tomogram_voxel_spacing_id == vs.id]):
             a = add(session, models.Annotation, anno, parents)
             parents["annotation_id"] = a.id
-            for methodlink in fetch_method_links(anno):
+            for methodlink in fetch_method_links(client_url, anno):
                 add(session, models.AnnotationMethodLink, methodlink, parents)
             for annofile in cdp.AnnotationFile.find(client, [cdp.AnnotationFile.annotation_id == anno.id]):
                 add(session, models.AnnotationFile, annofile, parents)
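
A note on the refactor above: CLIENT_URL used to be a module-level constant, and the workers now take client_url and db_uri as explicit parameters instead. Since ProcessPoolExecutor pickles submit() arguments into each worker process, explicit parameters are the reliable way to hand per-run configuration to subprocesses. A minimal sketch of the pattern (the names below are hypothetical stand-ins for db_import_dataset / db_import_deposition):

# Sketch: configuration travels with each submit() call rather than via globals.
from concurrent.futures import ProcessPoolExecutor


def import_one(client_url: str, db_uri: str, item_id: int) -> str:
    # Hypothetical worker; the real ones open a DB session and copy records.
    return f"item {item_id} via {client_url} -> {db_uri}"


if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as pool:
        futures = [
            pool.submit(import_one, "https://example.test/graphql", "postgresql://example", i)
            for i in (1, 2, 3)
        ]
        for fut in futures:
            print(fut.result())
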
@@ -454,21 +446,49 @@
 
 
 @click.command()
-@click.option("--skip-until", help="skip all datasets until and including this one")
-@click.option("--parallelism", help="how many processes to run in parallel", required=True, default=10)
-def do_import(skip_until, parallelism):
-    client = cdp.Client(CLIENT_URL)
+# Either prod or staging
+@click.argument("env", required=True, type=str)
+@click.option("--skip-until", help="skip all datasets until and including this one", type=int)
+@click.option("--import-dataset", help="Import specific datasets (multiple ok)", multiple=True, type=int)
+@click.option("--import-deposition", help="Import specific depositions (multiple ok)", multiple=True, type=int)
+@click.option("--db-uri", help="Database URI")
+@click.option(
+    "--import-all-depositions", is_flag=True, help="Import all depositions", required=False, default=False
+)
+@click.option("--parallelism", help="how many processes to run in parallel", required=True, default=10, type=int)
+def do_import(env, import_dataset, import_deposition, db_uri, import_all_depositions, skip_until, parallelism):
     futures = []
+    if env not in ["prod", "staging"]:
+        print("Env must be 'prod' or 'staging'")
+        exit(1)
+    if env == "staging":
+        client_url = "https://graphql-cryoet-api.cryoet.staging.si.czi.technology/v1/graphql"
+    else:
+        client_url = "https://graphql.cryoetdataportal.cziscience.com/v1/graphql"
+
+    if not db_uri:
+        db_uri = (
+            f"postgresql+psycopg://{os.environ['PLATFORMICS_DATABASE_USER']}:{os.environ['PLATFORMICS_DATABASE_PASSWORD']}@{os.environ['PLATFORMICS_DATABASE_HOST']}:{os.environ['PLATFORMICS_DATABASE_PORT']}/{os.environ['PLATFORMICS_DATABASE_NAME']}"
+        )
+
+    client = cdp.Client(client_url)
+
     with ProcessPoolExecutor(max_workers=parallelism) as workerpool:
-        depositions = cdp.Deposition.find(client)
-        depositions.sort(key=lambda a: a.id)  # Sort datasets by id
-        for dep in depositions:
-            futures.append(
-                workerpool.submit(
-                    import_deposition,
-                    dep.id,
-                ),
-            )
+        if import_all_depositions or import_deposition:
+            depositions = cdp.Deposition.find(client)
+            depositions.sort(key=lambda a: a.id)  # Sort depositions by id
+            for dep in depositions:
+                if import_deposition:
+                    if dep.id not in import_deposition:
+                        continue
+                futures.append(
+                    workerpool.submit(
+                        db_import_deposition,
+                        client_url,
+                        db_uri,
+                        dep.id,
+                    ),
+                )
         datasets = cdp.Dataset.find(client)
         datasets.sort(key=lambda a: a.id)  # Sort datasets by id
         for dataset in datasets:
@@ -476,10 +496,15 @@ def do_import(skip_until, parallelism):
             if skip_until:
                 if dataset.id == int(skip_until):
                     skip_until = None
                continue
+            if import_dataset:
+                if dataset.id not in import_dataset:
+                    continue
             futures.append(
                 workerpool.submit(
-                    import_dataset,
+                    db_import_dataset,
+                    client_url,
+                    db_uri,
                     dataset.id,
                 ),
             )
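
The new flags combine as follows: --skip-until drops every dataset up to and including the given id, then the repeatable --import-dataset options act as an allow-list. A standalone sketch of that selection logic, with hypothetical ids:

# Mirror of do_import's dataset-selection rules, for illustration only.
def select_dataset_ids(ids, skip_until=None, allow_list=()):
    selected = []
    for dataset_id in sorted(ids):
        if skip_until is not None:
            if dataset_id == skip_until:
                skip_until = None  # stop skipping after this id
            continue  # ids up to and including skip_until are dropped
        if allow_list and dataset_id not in allow_list:
            continue
        selected.append(dataset_id)
    return selected


assert select_dataset_ids([1, 2, 3, 4], skip_until=2) == [3, 4]
assert select_dataset_ids([1, 2, 3, 4], allow_list=(4, 2)) == [2, 4]
assert select_dataset_ids([1, 2, 3, 4], skip_until=1, allow_list=(1, 3)) == [3]

With these flags, a typical targeted run might look like `python scripts/scrape.py prod --import-dataset 10000 --import-dataset 10001` (invocation path assumed).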