Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Updating paths for Gains and CollectionMetadata #271

Merged
merged 5 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ingestion_tools/scripts/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,10 +226,10 @@ def resolve_output_path(self, key: str, obj: BaseImporter) -> str:
"tomogram": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/CanonicalTomogram",
"key_image": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/KeyPhotos",
"tiltseries": "{dataset_name}/{run_name}/TiltSeries",
"gain": "{dataset_name}/{run_name}/Frames/{run_name}_gain",
"gain": "{dataset_name}/{run_name}/Gains/",
"frame": "{dataset_name}/{run_name}/Frames",
"rawtilt": "{dataset_name}/{run_name}/TiltSeries",
"collection_metadata": "{dataset_name}/{run_name}/TiltSeries",
"collection_metadata": "{dataset_name}/{run_name}/Frames",
"alignment": "{dataset_name}/{run_name}/Alignments",
"annotation": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/Annotations",
"annotation_metadata": "{dataset_name}/{run_name}/Tomograms/VoxelSpacing{voxel_spacing_name}/Annotations",
Expand Down
10 changes: 6 additions & 4 deletions ingestion_tools/scripts/importers/gain.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@ class GainImporter(BaseImporter):
def import_item(self) -> None:
fs = self.config.fs
item = self.path
output_filename = self.get_output_path()
source_file_name = os.path.basename(item)
output_dir = self.get_output_path()
if item.endswith(".dm4"):
dest_file_name = os.path.splitext(source_file_name)[0] + ".mrc"
local_input = fs.localreadable(item)
local_output = fs.localwritable(output_filename + ".mrc")
local_output = fs.localwritable(os.path.join(output_dir, dest_file_name))
subprocess.check_output(["/usr/local/IMOD/bin/dm2mrc", local_input, local_output])
fs.push(local_output)
else:
_, extension = os.path.splitext(item)
fs.copy(item, f"{output_filename}{extension}")
dest_file_path = os.path.join(output_dir, source_file_name)
fs.copy(item, dest_file_path)
5 changes: 5 additions & 0 deletions ingestion_tools/scripts/tests/fixtures/dataset1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ annotations:
file_format: csv
glob_string: particle_lists/{run_name}_fas.csv
shape: Point
collection_metadata:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the code for collection_metadata support not part of this PR?

Copy link
Contributor Author

@manasaV3 manasaV3 Sep 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! The collection_metadata test file was missed and never got tracked.

- sources:
- source_multi_glob:
list_globs:
- metadata/mdocs_modified/foo-{run_name}.mdoc
alignments:
- metadata:
affine_transformation_matrix:
Expand Down
4 changes: 2 additions & 2 deletions ingestion_tools/scripts/tests/s3_import/test_alignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from common.config import DepositionImportConfig
from common.fs import FileSystemApi
from tests.s3_import.util import list_dir
from tests.s3_import.util import get_data_from_s3, list_dir


def get_parents(config: DepositionImportConfig) -> dict[str, BaseImporter]:
Expand Down Expand Up @@ -49,7 +49,7 @@ def validate_dataframe(
s3_client: S3Client,
) -> Callable[[str, str, int], None]:
def get_data_frame(bucket_name: str, path: str) -> pd.DataFrame:
body = s3_client.get_object(Bucket=bucket_name, Key=path)["Body"]
body = get_data_from_s3(s3_client, bucket_name, path)
return pd.read_csv(body, sep=r"\s+")

def validate(prefix: str, filename: str, id_prefix: int) -> None:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os.path
from os.path import basename

from importers.collection_metadata import CollectionMetadataImporter
from mypy_boto3_s3 import S3Client

from common.fs import FileSystemApi
from tests.s3_import.util import create_config, get_data_from_s3, get_dataset_and_run, list_dir


def test_collection_metadata_import(s3_fs: FileSystemApi, test_output_bucket: str, s3_client: S3Client) -> None:
    """Import collection metadata for the default dataset/run and verify the result.

    The test runs every importer returned by ``CollectionMetadataImporter.finder``
    and then checks, for the first item, that a file with the same base name
    exists under the run's ``Frames`` output prefix and that its lines match
    the lines of the source object.
    """
    config = create_config(s3_fs, test_output_bucket)
    dataset, run = get_dataset_and_run(config)
    collection_metadata = list(CollectionMetadataImporter.finder(config, dataset=dataset, run=run))
    # Fail with a readable message instead of an IndexError on the [0] lookups below.
    assert collection_metadata, "finder returned no collection metadata items for the fixture run"
    for item in collection_metadata:
        item.import_item()

    prefix = f"output/{dataset.name}/{run.name}/Frames"
    actual_files = [basename(key) for key in list_dir(s3_client, test_output_bucket, prefix)]
    source_path = collection_metadata[0].path
    source_filename = os.path.basename(source_path)
    assert source_filename in actual_files

    # S3 keys are always "/"-separated, so build the key explicitly rather than
    # relying on the host OS path separator.
    output_key = f"{prefix}/{source_filename}"
    actual = get_data_from_s3(s3_client, test_output_bucket, output_key).readlines()
    # Drop the first path component before reading from the input bucket
    # (presumably the path is "<bucket>/<key>" - confirm against the finder).
    source_key = source_path.partition("/")[2]
    expected = get_data_from_s3(s3_client, "test-public-bucket", source_key).readlines()
    assert actual == expected
36 changes: 15 additions & 21 deletions ingestion_tools/scripts/tests/s3_import/test_gains.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
from unittest.mock import Mock

import pytest
from importers.dataset import DatasetImporter
from importers.gain import GainImporter
from importers.run import RunImporter
from mypy_boto3_s3 import S3Client
from standardize_dirs import IMPORTERS

from common.config import DepositionImportConfig
from common.fs import FileSystemApi
from tests.s3_import.util import list_dir
from tests.s3_import.util import create_config, get_dataset_and_run, list_dir


def create_file_locally(*args, **kwargs):
Expand All @@ -29,40 +27,36 @@ def config(s3_fs: FileSystemApi, test_output_bucket: str) -> DepositionImportCon


def test_non_dm4_gains_import(
config: DepositionImportConfig,
s3_fs: FileSystemApi,
test_output_bucket: str,
s3_client: S3Client,
) -> None:
datasets = list(DatasetImporter.finder(config))
runs = list(RunImporter.finder(config, dataset=datasets[0]))
gains = list(GainImporter.finder(config, dataset=datasets[0], run=runs[0]))
config = create_config(s3_fs, test_output_bucket)
dataset, run = get_dataset_and_run(config)
gains = list(GainImporter.finder(config, dataset=dataset, run=run))
for gain in gains:
gain.import_item()

dataset_name = datasets[0].name
run_name = runs[0].name
prefix = f"output/{dataset_name}/{run_name}/Frames"
run_name = run.name
prefix = f"output/{dataset.name}/{run_name}/Gains"
gain_files = [basename(item) for item in list_dir(s3_client, test_output_bucket, prefix)]
assert f"{run_name}_gain.gain" in gain_files
assert f"CountRef_{run_name}.gain" in gain_files


def test_dm4_gains_import(
config: DepositionImportConfig,
s3_fs: FileSystemApi,
test_output_bucket: str,
s3_client: S3Client,
monkeypatch: pytest.MonkeyPatch,
) -> None:
subprocess_mock = Mock(spec="subprocess.check_output", side_effect=create_file_locally)
monkeypatch.setattr(subprocess, "check_output", subprocess_mock)

datasets = list(DatasetImporter.finder(config))
runs = list(RunImporter.finder(config, dataset=datasets[0]))
gains = list(GainImporter.finder(config, dataset=datasets[0], run=runs[1]))
config = create_config(s3_fs, test_output_bucket)
dataset, run = get_dataset_and_run(config, run_index=1)
gains = list(GainImporter.finder(config, dataset=dataset, run=run))
for gain in gains:
gain.import_item()

dataset_name = datasets[0].name
run_name = runs[1].name
prefix = f"output/{dataset_name}/{run_name}/Frames"
run_name = run.name
prefix = f"output/{dataset.name}/{run_name}/Gains"
gain_files = [basename(item) for item in list_dir(s3_client, test_output_bucket, prefix)]
assert f"{run_name}_gain.mrc" in gain_files
assert f"CountRef_{run_name}.mrc" in gain_files
30 changes: 30 additions & 0 deletions ingestion_tools/scripts/tests/s3_import/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
from typing import List, Optional

from botocore.response import StreamingBody
from importers.dataset import DatasetImporter
from importers.run import RunImporter
from mypy_boto3_s3 import S3Client
from standardize_dirs import IMPORTERS

from common.config import DepositionImportConfig
from common.fs import FileSystemApi


def list_dir(s3_client: S3Client, bucket: str, prefix: str, assert_non_zero_size: bool = False) -> List[str]:
Expand All @@ -9,3 +16,26 @@ def list_dir(s3_client: S3Client, bucket: str, prefix: str, assert_non_zero_size
for item in files["Contents"]:
assert item["Size"] > 0
return [item["Key"] for item in files["Contents"]] if "Contents" in files else []


def create_config(
    s3_fs: FileSystemApi,
    test_output_bucket: str,
    config_path: Optional[str] = None,
) -> DepositionImportConfig:
    """Build a ``DepositionImportConfig`` from a fixture file under ``tests/fixtures``.

    Args:
        s3_fs: Filesystem object handed straight to the config.
        test_output_bucket: Bucket name; the config's output path is
            ``"<test_output_bucket>/output"``.
        config_path: Fixture file name relative to ``tests/fixtures``.
            ``None`` selects ``"dataset1.yaml"``.

    Returns:
        A config that uses ``"test-public-bucket"`` as the input bucket and
        the module-level ``IMPORTERS`` mapping.
    """
    # Compare against None explicitly so only an omitted/None value falls back
    # to the default fixture.
    if config_path is None:
        config_path = "dataset1.yaml"
    return DepositionImportConfig(
        s3_fs,
        f"tests/fixtures/{config_path}",
        f"{test_output_bucket}/output",
        "test-public-bucket",
        IMPORTERS,
    )


def get_dataset_and_run(
    config: DepositionImportConfig,
    dataset_index: int = 0,
    run_index: int = 0,
) -> tuple[DatasetImporter, RunImporter]:
    """Pick one dataset importer and one of its run importers by position.

    Args:
        config: Config passed to both finders.
        dataset_index: Position of the dataset in the list produced by
            ``DatasetImporter.finder``.
        run_index: Position of the run in the list produced by
            ``RunImporter.finder`` for the chosen dataset.

    Returns:
        The ``(dataset, run)`` pair at the requested positions.
    """
    found_datasets = list(DatasetImporter.finder(config))
    chosen_dataset = found_datasets[dataset_index]
    found_runs = list(RunImporter.finder(config, dataset=chosen_dataset))
    return chosen_dataset, found_runs[run_index]


def get_data_from_s3(s3_client: S3Client, bucket_name: str, path: str) -> StreamingBody:
    """Fetch the object stored at key ``path`` in ``bucket_name`` and return its ``"Body"`` entry."""
    response = s3_client.get_object(Bucket=bucket_name, Key=path)
    return response["Body"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
PixelSpacing = 3.3702
Voltage = 300
ImageFile = TS_run1.mrc
ImageSize = 3708 3838
DataMode = 1

[T = SerialEM: Digitized on EMBL Krios 28-Apr-18 11:36:19 ]

[T = Tilt axis angle = 84.7, binning = 1 spot = 6 camera = 0]

[ZValue = 0]
TiltAngle = -0.00499939
StagePosition = 181.087 446.897
StageZ = -26.7037
Magnification = 42000
Intensity = 0.116505
ExposureDose = 2.35936
PixelSpacing = 3.3702
SpotSize = 6
Defocus = -0.279296
ImageShift = -0.05244 -0.0632113
RotationAngle = 174.7
ExposureTime = 1
Binning = 1
CameraIndex = 0
DividedBy2 = 1
MagIndex = 27
CountsPerElectron = 17.25
MinMaxMean = -55 1151 220.207
TargetDefocus = -2
SubFramePath = TS_run1_01.mrc
NumSubFrames = 10
FrameDosesAndNumber = 0.23594 10
DateTime = 28-Apr-18 11:38:54
NavigatorLabel = 112
Loading