Skip to content

Commit

Permalink
feat: Updating workflow to support deposition ingestion (#185)
Browse files Browse the repository at this point in the history
* Adding deposition tables

* Adding timestamps to tables

* Adding timestamps to s3

* Adding dir prefixes

* Refactoring method name

* Refactoring unavailable ts placeholder

* Updating name

* Revert timestamp addition for now

* minor refactor

* fix revert permissions

* Updating all entities to use deposition

* Updating existing tests

* Adding tests for depositions

* Updating seed sql files

* Updating the comments text

* Fixing broken seed sql

* Update documentation

* Adding deposition relationship to graphql
  • Loading branch information
manasaV3 authored Aug 6, 2024
1 parent cb3cc15 commit e3b1953
Show file tree
Hide file tree
Showing 40 changed files with 1,260 additions and 1,184 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@ table:
name: annotations
schema: public
object_relationships:
- name: deposition
using:
manual_configuration:
column_mapping:
deposition_id: id
insertion_order: null
remote_table:
name: depositions
schema: public
- name: tomogram_voxel_spacing
using:
foreign_key_constraint_on: tomogram_voxel_spacing_id
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
table:
name: datasets
schema: public
object_relationships:
- name: deposition
using:
manual_configuration:
column_mapping:
deposition_id: id
insertion_order: null
remote_table:
name: depositions
schema: public
array_relationships:
- name: authors
using:
Expand All @@ -27,15 +37,16 @@ select_permissions:
- role: anonymous
permission:
columns:
- cell_component_id
- cell_component_name
- cell_name
- cell_strain_id
- cell_strain_name
- cell_type_id
- cell_component_id
- cell_component_name
- dataset_citations
- dataset_publications
- deposition_date
- deposition_id
- description
- grid_preparation
- https_prefix
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
table:
name: deposition_authors
schema: public
object_relationships:
- name: deposition
using:
foreign_key_constraint_on: deposition_id
select_permissions:
- role: anonymous
permission:
columns:
- affiliation_address
- affiliation_identifier
- orcid
- email
- name
- corresponding_author_status
- primary_author_status
- affiliation_name
- id
- deposition_id
- author_list_order
filter: {}
allow_aggregations: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
table:
name: depositions
schema: public
array_relationships:
- name: authors
using:
foreign_key_constraint_on:
column: deposition_id
table:
name: deposition_authors
schema: public
select_permissions:
- role: anonymous
permission:
columns:
- deposition_date
- deposition_publications
- deposition_types
- description
- https_prefix
- id
- last_modified_date
- related_database_entries
- release_date
- s3_prefix
- title
filter: {}
allow_aggregations: true
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@ table:
name: tiltseries
schema: public
object_relationships:
- name: deposition
using:
manual_configuration:
column_mapping:
deposition_id: id
insertion_order: null
remote_table:
name: depositions
schema: public
- name: run
using:
foreign_key_constraint_on: run_id
Expand All @@ -15,6 +24,7 @@ select_permissions:
- camera_manufacturer
- camera_model
- data_acquisition_software
- deposition_id
- frames_count
- https_alignment_file
- https_angle_list
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@ table:
name: tomograms
schema: public
object_relationships:
- name: deposition
using:
manual_configuration:
column_mapping:
deposition_id: id
insertion_order: null
remote_table:
name: depositions
schema: public
- name: tomogram_type
using:
foreign_key_constraint_on: type
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
- "!include public_dataset_authors.yaml"
- "!include public_dataset_funding.yaml"
- "!include public_datasets.yaml"
- "!include public_deposition_authors.yaml"
- "!include public_depositions.yaml"
- "!include public_runs.yaml"
- "!include public_tiltseries.yaml"
- "!include public_tomogram_authors.yaml"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Down migration: undo the creation of the "depositions" table.
-- The index is dropped explicitly first (IF EXISTS keeps that step repeatable),
-- then the table itself is removed.
-- NOTE(review): DROP TABLE is issued without IF EXISTS or CASCADE, so tables that
-- reference "depositions" by foreign key presumably must be dropped first — confirm
-- the rollback ordering.
DROP INDEX IF EXISTS "public"."depositions_type";
DROP TABLE "public"."depositions";
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Up migration: create the "depositions" table, document every column, and
-- index the deposition type column.
-- title, description, the three date columns and deposition_types are mandatory;
-- related_database_entries, deposition_publications, s3_prefix and https_prefix
-- are nullable.
CREATE TABLE "public"."depositions" (
  "id" serial NOT NULL,
  "title" varchar NOT NULL,
  "description" varchar NOT NULL,
  "deposition_date" date NOT NULL,
  "release_date" date NOT NULL,
  "last_modified_date" date NOT NULL,
  "related_database_entries" varchar,
  "deposition_publications" varchar,
  "deposition_types" varchar NOT NULL,
  "s3_prefix" varchar,
  "https_prefix" varchar,
  PRIMARY KEY ("id") , UNIQUE ("id")
);

-- Table and column comments (the "id" comment previously read "this depositions").
COMMENT ON TABLE "public"."depositions" IS E'Deposition metadata';
COMMENT ON COLUMN public.depositions.id IS 'Numeric identifier for this deposition';
COMMENT ON COLUMN public.depositions.title IS 'Title for the deposition';
COMMENT ON COLUMN public.depositions.description IS 'Description for the deposition';
COMMENT ON COLUMN public.depositions.release_date IS 'The date the deposition was released';
COMMENT ON COLUMN public.depositions.last_modified_date IS 'The date the deposition was last modified';
COMMENT ON COLUMN public.depositions.deposition_date IS 'The date the deposition was deposited';
COMMENT ON COLUMN public.depositions.related_database_entries IS 'The related database entries to this deposition';
COMMENT ON COLUMN public.depositions.deposition_publications IS 'The publications related to this deposition';
COMMENT ON COLUMN public.depositions.deposition_types IS 'The types of data submitted as a part of this deposition';
COMMENT ON COLUMN public.depositions.s3_prefix IS 'The S3 public bucket path where data about this deposition is contained';
COMMENT ON COLUMN public.depositions.https_prefix IS 'The https directory path where data about this deposition is contained';

-- B-tree index on the deposition type column.
CREATE INDEX "depositions_type" on "public"."depositions" using btree ("deposition_types");
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Down migration: remove the "deposition_authors" table.
-- NOTE(review): no IF EXISTS guard, so this presumably errors if the table is
-- already gone — confirm that is the intended rollback behavior.
DROP TABLE "public"."deposition_authors";
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Up migration: create the "deposition_authors" table and document its columns.
-- Each row belongs to exactly one deposition: deposition_id is NOT NULL and
-- references depositions(id).
CREATE TABLE "public"."deposition_authors" (
  "id" serial NOT NULL,
  "name" varchar NOT NULL,
  "orcid" varchar,
  "corresponding_author_status" boolean DEFAULT false,
  "email" varchar,
  "affiliation_name" varchar,
  "affiliation_address" varchar,
  "affiliation_identifier" varchar,
  "deposition_id" integer NOT NULL,
  "primary_author_status" boolean DEFAULT false,
  "author_list_order" integer NOT NULL,
  PRIMARY KEY ("id") ,
  -- RESTRICT on both actions: a referenced deposition row cannot be deleted or
  -- have its id changed while author rows still point at it.
  FOREIGN KEY ("deposition_id") REFERENCES "public"."depositions"("id") ON UPDATE restrict ON DELETE restrict,
  UNIQUE ("id")
);

-- Table and column comments.
COMMENT ON TABLE "public"."deposition_authors" IS E'Authors for a deposition';
COMMENT ON COLUMN public.deposition_authors.id IS 'Numeric identifier for this deposition author (this may change!)';
COMMENT ON COLUMN public.deposition_authors.author_list_order IS 'The order in which the author appears in the publication';
COMMENT ON COLUMN public.deposition_authors.deposition_id IS 'Reference to the deposition this author contributed to';
COMMENT ON COLUMN public.deposition_authors.name IS 'Full name of a deposition author (e.g. Jane Doe).';
COMMENT ON COLUMN public.deposition_authors.orcid IS 'A unique, persistent identifier for researchers, provided by ORCID.';
COMMENT ON COLUMN public.deposition_authors.corresponding_author_status IS 'Indicates whether an author is the corresponding author';
COMMENT ON COLUMN public.deposition_authors.primary_author_status IS 'Indicates whether an author is the main person creating the deposition';
COMMENT ON COLUMN public.deposition_authors.email IS 'Email address for this author';
COMMENT ON COLUMN public.deposition_authors.affiliation_name IS 'Name of the institution an author is affiliated with.';
COMMENT ON COLUMN public.deposition_authors.affiliation_address IS 'Address of the institution an author is affiliated with.';
COMMENT ON COLUMN public.deposition_authors.affiliation_identifier IS 'A unique identifier assigned to the affiliated institution by The Research Organization Registry (ROR).';
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Down migration: remove the deposition link from "datasets".
-- Drops the index on deposition_id (if present), then the column itself.
DROP INDEX IF EXISTS "public"."dataset_deposition_id";
ALTER TABLE "public"."datasets" DROP COLUMN "deposition_id";
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Up migration: add a nullable "deposition_id" column to "datasets" and index it.
-- No foreign-key constraint is declared here; the column is a plain integer.
ALTER TABLE "public"."datasets" ADD COLUMN "deposition_id" INTEGER NULL;
CREATE INDEX "dataset_deposition_id" on "public"."datasets" using btree ("deposition_id");

COMMENT ON COLUMN public.datasets.deposition_id IS 'Reference to the deposition this dataset is associated with';
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Down migration: remove the deposition link from "tiltseries".
-- Drops the index on deposition_id (if present), then the column itself.
DROP INDEX IF EXISTS "public"."tiltseries_deposition_id";
-- DROP COLUMN accepts only the column name; a trailing data type / NULL clause
-- (as used in ADD COLUMN) is a syntax error and would make the rollback fail.
ALTER TABLE "public"."tiltseries" DROP COLUMN "deposition_id";
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Up migration: add a nullable "deposition_id" column to "tiltseries" and index it.
-- No foreign-key constraint is declared here; the column is a plain integer.
ALTER TABLE "public"."tiltseries" ADD COLUMN "deposition_id" INTEGER NULL;
CREATE INDEX "tiltseries_deposition_id" ON "public"."tiltseries" USING btree ("deposition_id");

COMMENT ON COLUMN public.tiltseries.deposition_id IS 'Reference to the deposition this tiltseries is associated with';
8 changes: 4 additions & 4 deletions ingestion_tools/docs/enqueue_runs.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,13 @@ python3 enqueue_runs.py db-import --environment prod --import-annotation-authors
| --import-everything | | If this flag is passed in, the script will attempt to ingest all data specified in the dataset config, (datasets, runs, annotations, etc etc) |

### Other interesting options to be aware of
| Option | Default | Explanation |
| --- | --- | --- |
| Option | Default | Explanation |
|-------------------| --- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| --filter-datasets | null | Supply a regular expression to apply to the list of available dataset ID's, to only run import for certain datasets. This option can be specified multiple times with multiple regular expressions |
| --include-dataset | null | Specify specific datasets to import. This option can be specified multiple times with multiple dataset id's |
| --exclude-dataset | null | Supply a regular expression to exclude a list of dataset IDs from the import. This option can be specified multiple times with multiple regular expressions. |
| --s3-prefix | null | Only look for datasets in a particular subdirectory (this is faster than the filter/include filters) when importing a single dataset |

| --s3-prefix | null | Only look for datasets in a particular subdirectory (this is faster than the filter/include filters) when importing a single dataset |
| --deposition-id | null | To be used with the --import-depositions flag to specify the depositions to be imported |


## S3 File Sync (`sync` subcommand)
Expand Down
36 changes: 36 additions & 0 deletions ingestion_tools/scripts/common/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,40 @@ class Meta:
database = db


class Deposition(BaseModel):
    """ORM model mapped to the ``depositions`` table (deposition metadata)."""

    class Meta:
        table_name = "depositions"

    # Primary key, declared as a plain IntegerField.
    # NOTE(review): presumably the importer supplies the id explicitly rather
    # than relying on the database sequence — confirm against the caller.
    id = IntegerField(primary_key=True)
    # Title and description of the deposition.
    title = CharField()
    description = CharField()
    # Dates the deposition was deposited, released and last modified.
    deposition_date = DateField()
    release_date = DateField()
    last_modified_date = DateField()
    # Optional related database entries and publications.
    related_database_entries = CharField(null=True)
    deposition_publications = CharField(null=True)
    # Types of data submitted as part of the deposition.
    deposition_types = CharField()
    # Storage locations for the deposition's data.
    # NOTE(review): these two fields are non-nullable here, while the CREATE TABLE
    # migration for "depositions" leaves both columns nullable — confirm which
    # side is intended.
    s3_prefix = CharField()
    https_prefix = CharField()


class DepositionAuthor(BaseModel):
    """ORM model mapped to the ``deposition_authors`` table (one author of a deposition)."""

    class Meta:
        table_name = "deposition_authors"

    # Primary key, declared as a plain IntegerField.
    id = IntegerField(primary_key=True)
    # Owning deposition; the backref makes the authors reachable as ``Deposition.authors``.
    deposition_id = ForeignKeyField(Deposition, backref="authors")
    # Optional ORCID identifier of the author.
    orcid = CharField(null=True)
    # Author name (required).
    name = CharField()
    # Author role flags; both default to False.
    corresponding_author_status = BooleanField(default=False)
    primary_author_status = BooleanField(default=False)
    # Optional contact and affiliation details.
    email = CharField(null=True)
    affiliation_name = CharField(null=True)
    affiliation_address = CharField(null=True)
    affiliation_identifier = CharField(null=True)
    # Position of the author in the author list (required).
    author_list_order = IntegerField()


class Dataset(BaseModel):
class Meta:
table_name = "datasets"
Expand Down Expand Up @@ -50,6 +84,7 @@ class Meta:
cell_component_id = CharField(null=True)
key_photo_url = CharField(null=True)
key_photo_thumbnail_url = CharField(null=True)
deposition_id = IntegerField(null=True)


class DatasetAuthor(BaseModel):
Expand Down Expand Up @@ -256,3 +291,4 @@ class Meta:
pixel_spacing = FloatField()
aligned_tiltseries_binning = IntegerField(null=True)
frames_count = IntegerField(null=True)
deposition_id = IntegerField(null=True)
11 changes: 9 additions & 2 deletions ingestion_tools/scripts/common/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
from copy import deepcopy
from datetime import datetime
from typing import Any

from common.formats import tojson
Expand All @@ -15,9 +16,15 @@ def __init__(self, fs: FileSystemApi, deposition_id: str, template: dict[str, An
self.metadata = template
self.deposition_id = deposition_id

def add_defaults(self, metadata: dict[str, Any]) -> dict[str, Any]:
    """Fill in default bookkeeping fields on ``metadata`` and return it.

    The dictionary is modified in place: ``deposition_id`` is set from
    ``self.deposition_id`` only when the existing value is missing or falsy,
    and ``last_updated_at`` is always overwritten with the current time as
    whole seconds since the epoch.

    Args:
        metadata: The metadata dictionary to update.

    Returns:
        The same dictionary object that was passed in.
    """
    # Keep a deposition id already present in the metadata; only fill the gap.
    if not metadata.get("deposition_id"):
        metadata["deposition_id"] = self.deposition_id
    now_epoch = datetime.now().timestamp()
    metadata["last_updated_at"] = int(now_epoch)
    return metadata

def write_metadata(self, filename: str, is_json=True) -> None:
    """Write this object's metadata to ``filename`` through ``self.fs``.

    A deep copy of ``self.metadata`` is passed through ``add_defaults`` so the
    stored template itself is not modified.

    Args:
        filename: Destination path, opened for writing via ``self.fs.open``.
        is_json: When true, the enriched copy is serialized with ``tojson``;
            otherwise ``self.metadata`` is handed to ``write`` unchanged.
    """
    enriched = self.add_defaults(deepcopy(self.metadata))
    with self.fs.open(filename, "w") as out:
        if is_json:
            out.write(tojson(enriched))
        else:
            # NOTE(review): this path writes the raw template, so the defaults
            # added above are not included — confirm this is intended.
            out.write(self.metadata)
Expand All @@ -26,7 +33,7 @@ def write_metadata(self, filename: str, is_json=True) -> None:
class MergedMetadata(BaseMetadata):
    """Metadata writer that merges extra data into the template before writing."""

    def write_metadata(self, filename: str, merge_data: dict[str, Any]) -> None:
        """Merge ``merge_data`` into the template, add defaults, and write JSON.

        Args:
            filename: Destination path, opened for writing via ``self.fs.open``.
            merge_data: Values combined with ``self.metadata`` via ``deep_merge``.
        """
        combined = self.add_defaults(deep_merge(self.metadata, merge_data))
        with self.fs.open(filename, "w") as out:
            serialized = tojson(combined)
            out.write(serialized)

Expand Down
Loading

0 comments on commit e3b1953

Please sign in to comment.