Support depositions of annotations that are associated with existing datasets #34

Closed · wants to merge 14 commits
76 changes: 76 additions & 0 deletions ingestion_tools/dataset_configs/deposition_10301.yaml
@@ -0,0 +1,76 @@
annotations:
  - metadata:
      annotation_object:
        id: GO:0016020
        name: membrane
        description: ~
        state: ~
      dates: &repo-dates
        deposition_date: 2024-02-20
        last_modified_date: 2024-04-10
        release_date: 2024-04-10
      annotation_method: intensity normalization + rescaling + FNET segmentation + skeletonization + DIST segmentation
      method_type: automated
      annotation_publications: 10.1093/micmic/ozad067.485, https://github.com/SMLC-NYSBC/TARDIS
      ground_truth_status: False
      authors: &annotation_authors
        - name: Robert Kiewisz
          ORCID: 0000-0003-2733-4978
          primary_author_status: true
        - name: Gunar Fabig
          ORCID: 0000-0003-3017-0978
        - name: Stefanie Redemann
          ORCID: 0000-0003-2334-7309
        - name: Will Conway
          ORCID: 0000-0001-7532-4331
        - name: Victor Kostyuchenko
          ORCID: 0000-0001-9751-307X
        - name: Jake Johnston
          ORCID: 0000-0003-3060-7738
        - name: Oliver Clarke
        - name: Shee-Mei Lok
          ORCID: 0000-0003-4631-8041
        - name: Thomas Müller-Reichert
          ORCID: 0000-0003-0203-1436
        - name: Tristan Bepler
          ORCID: 0000-0001-5595-9954
          corresponding_author_status: true
      annotation_software: TARDIS-em v0.2.0
      version: '1.0'
      confidence:
        precision: ~
        recall: ~
      is_curator_recommended: False
    sources:
      - file_format: tardis
        binning: 1
        order: xyz
        shape: InstanceSegmentation
        glob_string: "{dataset_name}/{run_name}/{run_name}_instance.csv"
        is_visualization_default: false
      - file_format: mrc
        shape: SemanticSegmentationMask
        glob_string: "{dataset_name}/{run_name}/{run_name}_semantic.mrc"
        is_visualization_default: false
standardization_config:
  deposition_id: 10301
  dataset:
    source:
      source_glob:
        list_glob: '*'
        match_regex: .*
        name_regex: (.*)
  run:
    source:
      source_glob:
        list_glob: '{dataset_name}/*'
        match_regex: .*
        name_regex: (.*)
  tomogram_voxel_spacing:
    source:
      destination_glob:
        list_glob: '{run_output_path}/Tomograms/VoxelSpacing*'
        match_regex: .*
        name_regex: VoxelSpacing(.*)
  destination_prefix: ''
  source_prefix: 'robert_kiewisz_tardis_01_2024'
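For intuition, each finder in `standardization_config` expands a `list_glob` against the source prefix, keeps candidates that match `match_regex`, and extracts the entity name from the first capture group of `name_regex`. Below is a rough sketch of that discovery flow; it uses local `glob`/`re` stand-ins for the portal's S3-backed finders, and `find_entities` is an illustrative helper, not code from this PR:

import glob
import os
import re


def find_entities(root: str, list_glob: str, match_regex: str,
                  name_regex: str, **glob_vars: str) -> dict[str, str]:
    """Expand list_glob under root, keep paths whose relative form matches
    match_regex, and map each name (first name_regex group) to its path."""
    expanded = list_glob.format(**glob_vars)  # e.g. '{dataset_name}/*' -> '10001/*'
    matcher, namer = re.compile(match_regex), re.compile(name_regex)
    found = {}
    for path in glob.glob(os.path.join(root, expanded)):
        rel = os.path.relpath(path, root)
        if not matcher.match(rel):
            continue
        name_match = namer.match(os.path.basename(rel))
        if name_match:
            found[name_match.group(1)] = path
    return found


root = "robert_kiewisz_tardis_01_2024"  # the source_prefix above
for ds_name in find_entities(root, "*", ".*", "(.*)"):            # dataset finder
    runs = find_entities(root, "{dataset_name}/*", ".*", "(.*)",  # run finder
                         dataset_name=ds_name)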
87 changes: 59 additions & 28 deletions ingestion_tools/scripts/common/config.py
@@ -5,6 +5,7 @@
import re
from copy import deepcopy
from typing import TYPE_CHECKING, Any
+from common.finders import DatasetImporterFactory, RunImporterFactory, VSImporterFactory

import yaml

@@ -27,9 +28,10 @@ def __init__(self, run_regex: re.Pattern[str], tiltseries: dict[str, Any] | None
        self.tomograms = tomograms


-class DataImportConfig:
+class DepositionImportConfig:
    https_prefix = os.getenv("DOMAIN_NAME", "https://files.cryoetdataportal.cziscience.com")
    source_prefix: str
+    deposition_id: str
    destination_prefix: str
    fs: FileSystemApi
    run_glob: str
@@ -59,10 +61,14 @@ class DataImportConfig:
    tiltseries_template: dict[str, Any]
    annotation_template: dict[str, Any]
    output_prefix: str
-    input_prefix: str
    overrides_by_run: list[RunOverride] | None = None
    run_data_map: dict[str, Any]
    run_data_map_file: str | None = None
+    # Data Finders
+    #vs_finder_config: DepositionObjectImporterFactory | None = None
+    dataset_finder_config: DatasetImporterFactory | None = None
+    run_finder_config: RunImporterFactory | None = None
+    vs_finder_config: VSImporterFactory | None = None

    def __init__(self, fs: FileSystemApi, config_path: str, output_prefix: str, input_bucket: str):
        self.output_prefix = output_prefix
@@ -71,6 +77,20 @@ def __init__(self, fs: FileSystemApi, config_path: str, output_prefix: str, input_bucket: str):
            dataset_config = yaml.safe_load(conffile)
        config = dataset_config["standardization_config"]

+        # TODO refactor this to not be so literal.
+        config["dataset_finder_config"] = None
+        config["run_finder_config"] = None
+        config["vs_finder_config"] = None
+        if config.get("dataset"):
+            config["dataset_finder_config"] = DatasetImporterFactory(**config["dataset"])
+            del config["dataset"]
+        if config.get("run"):
+            config["run_finder_config"] = RunImporterFactory(**config["run"])
+            del config["run"]
+        if config.get("tomogram_voxel_spacing"):
+            config["vs_finder_config"] = VSImporterFactory(**config["tomogram_voxel_spacing"])
+            del config["tomogram_voxel_spacing"]
+
        for k, v in config.items():
            if "regex" in k:
                v = re.compile(v)
@@ -97,9 +117,13 @@ def __init__(self, fs: FileSystemApi, config_path: str, output_prefix: str, input_bucket: str):
"annotations": "annotation",
}
for config_key, template_key in template_configs.items():
setattr(self, f"{template_key}_template", dataset_config[config_key])
try:
setattr(self, f"{template_key}_template", dataset_config[config_key])
except KeyError:
setattr(self, f"{template_key}_template", {})
self.input_path = f"{input_bucket}/{self.source_prefix}"
self.dataset_root_dir = f"{input_bucket}/{self.source_prefix}"
self.deposition_root_dir = f"{input_bucket}/{self.source_prefix}"

    def load_run_data_map(self) -> None:
        self.run_data_map = self.load_run_metadata_file("run_data_map_file")
@@ -149,28 +173,28 @@ def load_run_ts_map(self) -> None:
        self.run_to_ts_map = self.load_run_csv_file("run_to_ts_map_csv")

    @classmethod
-    def get_run_name(cls, obj: BaseImporter) -> str:
+    def get_dataset_name(cls, obj: BaseImporter) -> str:
        try:
-            run = obj.get_run()
-            if run:
-                return run.run_name
+            ds = obj.get_dataset()
+            if ds:
+                return ds.name
        except ValueError:
            pass
        return ""

    @classmethod
-    def get_run_voxelsize(cls, obj: BaseImporter) -> float:
+    def get_run_name(cls, obj: BaseImporter) -> str:
        try:
            run = obj.get_run()
            if run:
-                return run.voxel_spacing
+                return run.name
        except ValueError:
            pass
-        return 0
+        return ""

    def get_output_path(self, obj: BaseImporter) -> str:
        key = f"{obj.type_key}"
-        return self.resolve_output_path(key, self.get_run_name(obj), self.get_run_voxelsize(obj))
+        return self.resolve_output_path(key, obj)

    def get_run_data_map(self, run_name: str) -> dict[str, Any]:
        if map_vars := self.run_data_map.get(run_name):
@@ -234,27 +258,34 @@ def get_run_override(self, run_name: str) -> RunOverride | None:

    def get_metadata_path(self, obj: BaseImporter) -> str:
        key = f"{obj.type_key}_metadata"
-        return self.resolve_output_path(key, self.get_run_name(obj), self.get_run_voxelsize(obj))
+        return self.resolve_output_path(key, obj)

-    def resolve_output_path(self, key: str, run_name: str, voxelsize: float | str) -> str:
+    def resolve_output_path(self, key: str, obj: BaseImporter) -> str:
        paths = {
-            "tomogram": "{run_name}/Tomograms/VoxelSpacing{voxelsize}/CanonicalTomogram",
-            "key_image": "{run_name}/Tomograms/VoxelSpacing{voxelsize}/KeyPhotos",
-            "tiltseries": "{run_name}/TiltSeries",
-            "frames": "{run_name}/Frames",
-            "annotation": "{run_name}/Tomograms/VoxelSpacing{voxelsize}/Annotations",
-            "annotation_metadata": "{run_name}/Tomograms/VoxelSpacing{voxelsize}/Annotations",
-            "run_metadata": "{run_name}/run_metadata.json",
-            "tomogram_metadata": "{run_name}/Tomograms/VoxelSpacing{voxelsize}/CanonicalTomogram/tomogram_metadata.json",
-            "tiltseries_metadata": "{run_name}/TiltSeries/tiltseries_metadata.json",
-            "dataset_metadata": "dataset_metadata.json",
+            "tomogram": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/CanonicalTomogram",
+            "key_image": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/KeyPhotos",
+            "tiltseries": "{dataset_name}/{run_name}/TiltSeries",
+            "frames": "{dataset_name}/{run_name}/Frames",
+            "annotation": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/Annotations",
+            "annotation_metadata": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/Annotations",
+            "run_metadata": "{dataset_name}/{run_name}/run_metadata.json",
+            "tomogram_metadata": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/CanonicalTomogram/tomogram_metadata.json",
+            "tiltseries_metadata": "{dataset_name}/{run_name}/TiltSeries/tiltseries_metadata.json",
+            "dataset_metadata": "{dataset_name}/dataset_metadata.json",
+            "run": "{dataset_name}/{run_name}",
+            "dataset": "{dataset_name}",
            "dataset_keyphoto": "Images",
-            "neuroglancer": "{run_name}/Tomograms/VoxelSpacing{voxelsize}/CanonicalTomogram/neuroglancer_config.json",
+            "neuroglancer": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/CanonicalTomogram/neuroglancer_config.json",
        }
+        output_prefix = self.output_prefix
+        glob_vars = obj.get_glob_vars()
+        # support older configs that specified the dataset name as the output prefix
+        if self.output_prefix == glob_vars["dataset_name"]:
+            output_prefix = ""
        path = os.path.join(
-            self.output_prefix,
+            output_prefix,
            self.destination_prefix,
-            paths[key].format(run_name=run_name, voxelsize=voxelsize),
+            paths[key].format(**glob_vars),
        )
        self.fs.makedirs(path)
        return path
@@ -263,9 +294,9 @@ def glob_files(self, obj: BaseImporter, globstring: str) -> list[str]:
        run = obj.get_run()
        if not globstring:
            return []
-        globvars = run.get_glob_vars()
+        globvars = obj.get_glob_vars()
        with contextlib.suppress(ValueError):
-            globvars["int_run_name"] = int(run.run_name)
+            globvars["int_run_name"] = int(run.name)
        expanded_glob = os.path.join(self.dataset_root_dir, globstring.format(**globvars))
        results = self.fs.glob(expanded_glob)
        if not results:
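The net effect of the `resolve_output_path` change is that output paths are rendered from per-object glob variables rather than a fixed `(run_name, voxelsize)` pair, which is what lets an annotation-only deposition write into an existing dataset's layout. A minimal sketch of the idea, where the `glob_vars` values and the `"10001"` prefix are made up for illustration:

import os

# Hypothetical values of obj.get_glob_vars() for one annotation source file.
glob_vars = {
    "dataset_name": "10001",
    "run_name": "TS_001",
    "voxel_spacing_name": "13.48",
}

template = "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/Annotations"

# Older configs used the dataset name itself as the output prefix; the new
# code blanks the prefix in that case so the dataset name isn't doubled.
output_prefix = "10001"
if output_prefix == glob_vars["dataset_name"]:
    output_prefix = ""

path = os.path.join(output_prefix, template.format(**glob_vars))
print(path)  # 10001/TS_001/Tomograms/VoxelSpacing13.48/Annotations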