Skip to content

Commit

Permalink
Delete files in Zarrs in batches of 100
Browse files Browse the repository at this point in the history
  • Loading branch information
jwodder committed Feb 15, 2022
1 parent d2f8fac commit 124ee25
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 13 deletions.
3 changes: 3 additions & 0 deletions dandi/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,6 @@ class DandiInstance(NamedTuple):

#: Maximum number of Zarr directory entries to upload at once
ZARR_UPLOAD_BATCH_SIZE = 255

#: Maximum number of Zarr directory entries to delete at once
ZARR_DELETE_BATCH_SIZE = 100
21 changes: 20 additions & 1 deletion dandi/dandiapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
ClassVar,
Dict,
FrozenSet,
Iterable,
Iterator,
List,
Optional,
Expand All @@ -38,6 +39,7 @@
DRAFT,
MAX_CHUNK_SIZE,
RETRY_STATUSES,
ZARR_DELETE_BATCH_SIZE,
DandiInstance,
EmbargoStatus,
known_instances,
Expand All @@ -46,7 +48,13 @@
from .exceptions import NotFoundError, SchemaVersionError
from .keyring import keyring_lookup
from .misctypes import BasePath, Digest
from .utils import USER_AGENT, check_dandi_version, ensure_datetime, is_interactive
from .utils import (
USER_AGENT,
check_dandi_version,
chunked,
ensure_datetime,
is_interactive,
)

lgr = get_logger()

Expand Down Expand Up @@ -1412,6 +1420,17 @@ def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]:
"""
return self.filetree.iterfiles(include_dirs=include_dirs)

def rmfiles(self, files: Iterable["RemoteZarrEntry"]) -> None:
    """Remove the given entries from this Zarr's remote file tree."""
    # No client-side validation that each entry is a file or belongs to
    # this Zarr: invalid entries are rejected by the server anyway, so
    # checking here would just duplicate that work.
    endpoint = f"/zarr/{self.zarr}/files/"
    for batch in chunked(files, ZARR_DELETE_BATCH_SIZE):
        payload = [{"path": str(entry)} for entry in batch]
        self.client.delete(endpoint, json=payload)


class RemoteAsset(BaseRemoteAsset):
"""
Expand Down
19 changes: 7 additions & 12 deletions dandi/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,8 +833,9 @@ def iter_upload(
json={"metadata": metadata, "zarr_id": zarr_id},
)
a = RemoteAsset.from_data(dandiset, r)
assert isinstance(a, RemoteZarrAsset)
to_upload: List[dict] = []
to_delete: List[dict] = []
to_delete: List[RemoteZarrEntry] = []
for p in stat.files:
pdigest = p.get_digest().value
item = {"path": str(p), "etag": pdigest}
Expand All @@ -851,8 +852,7 @@ def iter_upload(
pp,
p,
)
old_zarr_entries.pop(pps)
to_delete.append({"path": pps})
to_delete.append(old_zarr_entries.pop(pps))
break
to_upload.append(item)
else:
Expand All @@ -864,11 +864,9 @@ def iter_upload(
)
for ee in e.iterfiles():
try:
old_zarr_entries.pop(str(ee))
to_delete.append(old_zarr_entries.pop(str(ee)))
except KeyError:
pass
else:
to_delete.append({"path": str(ee)})
to_upload.append(item)
elif pdigest != e.get_digest().value:
lgr.debug(
Expand All @@ -880,7 +878,7 @@ def iter_upload(
else:
lgr.debug("%s: File %s already on server; skipping", asset_path, p)
if to_delete:
client.delete(f"/zarr/{zarr_id}/files/", json=to_delete)
a.rmfiles(to_delete)
yield {"status": "initiating upload"}
lgr.debug("%s: Beginning upload", asset_path)
bytes_uploaded = 0
Expand Down Expand Up @@ -931,17 +929,14 @@ def iter_upload(
lgr.debug("%s: Completing upload of batch #%d", asset_path, i)
client.post(f"/zarr/{zarr_id}/upload/complete/")
lgr.debug("%s: Upload completed", asset_path)
old_zarr_files = [k for k, e in old_zarr_entries.items() if e.is_file()]
old_zarr_files = [e for e in old_zarr_entries.values() if e.is_file()]
if old_zarr_files:
lgr.debug(
"%s: Deleting %s in remote Zarr not present locally",
asset_path,
pluralize(len(old_zarr_files), "file"),
)
client.delete(
f"/zarr/{zarr_id}/files/",
json=[{"path": k} for k in old_zarr_files],
)
a.rmfiles(old_zarr_files)
r = client.get(f"/zarr/{zarr_id}/")
if r["checksum"] != filetag:
raise RuntimeError(
Expand Down

0 comments on commit 124ee25

Please sign in to comment.