Skip to content

Commit

Permalink
Add --preserve-tree option to dandi download
Browse files Browse the repository at this point in the history
  • Loading branch information
jwodder committed Jul 23, 2024
1 parent c0b64d5 commit c583a89
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 23 deletions.
15 changes: 13 additions & 2 deletions dandi/cli/cmd_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@
default="all",
show_default=True,
)
@click.option(
"--preserve-tree",
is_flag=True,
help=(
"When downloading only part of a Dandiset, also download"
" `dandiset.yaml` and do not strip leading directories from asset"
" paths. Implies `--download all`."
),
)
@click.option(
"--sync", is_flag=True, help="Delete local assets that do not exist on the server"
)
Expand Down Expand Up @@ -138,6 +147,7 @@ def download(
sync: bool,
dandi_instance: str,
path_type: PathType,
preserve_tree: bool,
) -> None:
# We need to import the download module rather than the download function
# so that the tests can properly patch the function with a mock.
Expand Down Expand Up @@ -171,8 +181,9 @@ def download(
format=format,
jobs=jobs[0],
jobs_per_zarr=jobs[1],
get_metadata="dandiset.yaml" in download_types,
get_assets="assets" in download_types,
get_metadata="dandiset.yaml" in download_types or preserve_tree,
get_assets="assets" in download_types or preserve_tree,
preserve_tree=preserve_tree,
sync=sync,
path_type=path_type,
# develop_debug=develop_debug
Expand Down
7 changes: 7 additions & 0 deletions dandi/cli/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_download_defaults(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -41,6 +42,7 @@ def test_download_all_types(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -59,6 +61,7 @@ def test_download_metadata_only(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=False,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -77,6 +80,7 @@ def test_download_assets_only(mocker):
jobs_per_zarr=None,
get_metadata=False,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand Down Expand Up @@ -110,6 +114,7 @@ def test_download_gui_instance_in_dandiset(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -135,6 +140,7 @@ def test_download_api_instance_in_dandiset(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand All @@ -160,6 +166,7 @@ def test_download_url_instance_match(mocker):
jobs_per_zarr=None,
get_metadata=True,
get_assets=True,
preserve_tree=False,
sync=False,
path_type=PathType.EXACT,
)
Expand Down
42 changes: 31 additions & 11 deletions dandi/dandiarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,17 @@ def navigate(
yield (client, dandiset, assets)

@abstractmethod
def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
"""
Returns the path (relative to the base download directory) at which the
asset ``asset`` (assumed to have been returned by this object's
`get_assets()` method) should be downloaded
`get_assets()` method) should be downloaded.
If ``preserve_tree`` is `True`, then the download is being performed
with ``--download tree`` option, and the method's return value should
be adjusted accordingly.
:meta private:
"""
Expand Down Expand Up @@ -231,7 +237,9 @@ def get_assets(
assert d is not None
yield from d.get_assets(order=order)

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
return asset.path.lstrip("/")

def is_under_download_path(self, path: str) -> bool:
Expand All @@ -242,13 +250,17 @@ def is_under_download_path(self, path: str) -> bool:
class SingleAssetURL(ParsedDandiURL):
"""Superclass for parsed URLs that refer to a single asset"""

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return posixpath.basename(asset.path.lstrip("/"))
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
path = asset.path.lstrip("/")
if preserve_tree:
return path
else:
return posixpath.basename(path)

def is_under_download_path(self, path: str) -> bool:
raise TypeError(
f"{type(self).__name__}.is_under_download_path() should not be called"
)
return False

Check warning on line 263 in dandi/dandiarchive.py

View check run for this annotation

Codecov / codecov/patch

dandi/dandiarchive.py#L263

Added line #L263 was not covered by tests


@dataclass
Expand All @@ -257,8 +269,14 @@ class MultiAssetURL(ParsedDandiURL):

path: str

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
return multiasset_target(self.path, asset.path.lstrip("/"))
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
path = asset.path.lstrip("/")
if preserve_tree:
return path
else:
return multiasset_target(self.path, path)

def is_under_download_path(self, path: str) -> bool:
prefix = posixpath.dirname(self.path.strip("/"))
Expand Down Expand Up @@ -487,7 +505,9 @@ def get_assets(
if strict and not any_assets:
raise NotFoundError(f"No assets found matching glob {self.path!r}")

def get_asset_download_path(self, asset: BaseRemoteAsset) -> str:
def get_asset_download_path(
self, asset: BaseRemoteAsset, preserve_tree: bool
) -> str:
return asset.path.lstrip("/")

def is_under_download_path(self, path: str) -> bool:
Expand Down
28 changes: 19 additions & 9 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def download(
jobs_per_zarr: int | None = None,
get_metadata: bool = True,
get_assets: bool = True,
preserve_tree: bool = False,
sync: bool = False,
path_type: PathType = PathType.EXACT,
) -> None:
Expand Down Expand Up @@ -141,6 +142,7 @@ def download(
existing=existing,
get_metadata=get_metadata,
get_assets=get_assets,
preserve_tree=preserve_tree,
jobs_per_zarr=jobs_per_zarr,
on_error="yield" if format is DownloadFormat.PYOUT else "raise",
**kw,
Expand Down Expand Up @@ -201,6 +203,7 @@ class Downloader:
existing: DownloadExisting
get_metadata: bool
get_assets: bool
preserve_tree: bool
jobs_per_zarr: int | None
on_error: Literal["raise", "yield"]
#: which will be set .gen to assets. Purpose is to make it possible to get
Expand All @@ -214,19 +217,24 @@ def __post_init__(self, output_dir: str | Path) -> None:
# TODO: if we are ALREADY in a dandiset - we can validate that it is
# the same dandiset and use that dandiset path as the one to download
# under
if isinstance(self.url, DandisetURL):
if isinstance(self.url, DandisetURL) or (
self.preserve_tree and self.url.dandiset_id is not None
):
assert self.url.dandiset_id is not None
self.output_prefix = Path(self.url.dandiset_id)
else:
self.output_prefix = Path()
self.output_path = Path(output_dir, self.output_prefix)

def is_dandiset_yaml(self) -> bool:
return isinstance(self.url, AssetItemURL) and self.url.path == "dandiset.yaml"

def download_generator(self) -> Iterator[dict]:
"""
A generator for downloads of files, folders, or entire dandiset from
DANDI (as identified by URL)
This function is a generator which would yield records on ongoing
This function is a generator which yields records on ongoing
activities. Activities include traversal of the remote resource (DANDI
archive), download of individual assets while yielding records (TODO:
schema) while validating their checksums "on the fly", etc.
Expand All @@ -235,10 +243,8 @@ def download_generator(self) -> Iterator[dict]:
with self.url.navigate(strict=True) as (client, dandiset, assets):
if (
isinstance(self.url, DandisetURL)
or (
isinstance(self.url, AssetItemURL)
and self.url.path == "dandiset.yaml"
)
or self.is_dandiset_yaml()
or self.preserve_tree
) and self.get_metadata:
assert dandiset is not None
for resp in _populate_dandiset_yaml(
Expand All @@ -248,7 +254,7 @@ def download_generator(self) -> Iterator[dict]:
"path": str(self.output_prefix / dandiset_metadata_file),
**resp,
}
if isinstance(self.url, AssetItemURL):
if self.is_dandiset_yaml():
return

# TODO: do analysis of assets for early detection of needed renames
Expand All @@ -262,7 +268,9 @@ def download_generator(self) -> Iterator[dict]:
assets = self.assets_it.feed(assets)
lock = Lock()
for asset in assets:
path = self.url.get_asset_download_path(asset)
path = self.url.get_asset_download_path(
asset, preserve_tree=self.preserve_tree
)
self.asset_download_paths.add(path)
download_path = Path(self.output_path, path)
path = str(self.output_prefix / path)
Expand Down Expand Up @@ -995,7 +1003,9 @@ class DownloadProgress:
@dataclass
class ProgressCombiner:
zarr_size: int
file_qty: int | None = None # set to specific known value whenever full sweep is complete
file_qty: int | None = (
None # set to specific known value whenever full sweep is complete
)
files: dict[str, DownloadProgress] = field(default_factory=dict)
#: Total size of all files that were not skipped and did not error out
#: during download
Expand Down
45 changes: 44 additions & 1 deletion dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,28 @@ def test_download_folder(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
assert (tmp_path / "subdir2" / "coconut.txt").read_text() == "Coconut\n"


def test_download_folder_preserve_tree(
text_dandiset: SampleDandiset, tmp_path: Path
) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/",
tmp_path,
preserve_tree=True,
)
assert list_paths(tmp_path, dirs=True) == [
tmp_path / dandiset_id,
tmp_path / dandiset_id / "dandiset.yaml",
tmp_path / dandiset_id / "subdir2",
tmp_path / dandiset_id / "subdir2" / "banana.txt",
tmp_path / dandiset_id / "subdir2" / "coconut.txt",
]
assert (tmp_path / dandiset_id / "subdir2" / "banana.txt").read_text() == "Banana\n"
assert (
tmp_path / dandiset_id / "subdir2" / "coconut.txt"
).read_text() == "Coconut\n"


def test_download_item(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
Expand All @@ -182,6 +204,26 @@ def test_download_item(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
assert (tmp_path / "coconut.txt").read_text() == "Coconut\n"


def test_download_item_preserve_tree(
text_dandiset: SampleDandiset, tmp_path: Path
) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/coconut.txt",
tmp_path,
preserve_tree=True,
)
assert list_paths(tmp_path, dirs=True) == [
tmp_path / dandiset_id,
tmp_path / dandiset_id / "dandiset.yaml",
tmp_path / dandiset_id / "subdir2",
tmp_path / dandiset_id / "subdir2" / "coconut.txt",
]
assert (
tmp_path / dandiset_id / "subdir2" / "coconut.txt"
).read_text() == "Coconut\n"


def test_download_dandiset_yaml(text_dandiset: SampleDandiset, tmp_path: Path) -> None:
dandiset_id = text_dandiset.dandiset_id
download(
Expand Down Expand Up @@ -330,6 +372,7 @@ def test_download_metadata404(text_dandiset: SampleDandiset, tmp_path: Path) ->
existing=DownloadExisting.ERROR,
get_metadata=True,
get_assets=True,
preserve_tree=False,
jobs_per_zarr=None,
on_error="raise",
).download_generator()
Expand Down Expand Up @@ -985,4 +1028,4 @@ def test_pyouthelper_time_remaining_1339():
# once done, dont print ETA
assert len(done) == 2
else:
assert done[-1] == f"ETA: {10-i} seconds<"
assert done[-1] == f"ETA: {10 - i} seconds<"
6 changes: 6 additions & 0 deletions docs/source/cmdline/download.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ Options

Whether to interpret asset paths in URLs as exact matches or glob patterns

.. option:: --preserve-tree

When downloading only part of a Dandiset, also download
:file:`dandiset.yaml` and do not strip leading directories from asset
paths. Implies ``--download all``.

.. option:: --sync

Delete local assets that do not exist on the server after downloading

0 comments on commit c583a89

Please sign in to comment.