diff --git a/dandi/consts.py b/dandi/consts.py
index 535d6e986..b68dff98f 100644
--- a/dandi/consts.py
+++ b/dandi/consts.py
@@ -159,3 +159,6 @@ class DandiInstance(NamedTuple):
 
 #: Maximum number of Zarr directory entries to upload at once
 ZARR_UPLOAD_BATCH_SIZE = 255
+
+#: Maximum number of Zarr directory entries to delete at once
+ZARR_DELETE_BATCH_SIZE = 100
diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py
index 7b7bff7a9..1f6341300 100644
--- a/dandi/dandiapi.py
+++ b/dandi/dandiapi.py
@@ -15,6 +15,7 @@
     ClassVar,
     Dict,
     FrozenSet,
+    Iterable,
     Iterator,
     List,
     Optional,
@@ -38,6 +39,7 @@
     DRAFT,
     MAX_CHUNK_SIZE,
     RETRY_STATUSES,
+    ZARR_DELETE_BATCH_SIZE,
     DandiInstance,
     EmbargoStatus,
     known_instances,
@@ -46,7 +48,13 @@
 from .exceptions import NotFoundError, SchemaVersionError
 from .keyring import keyring_lookup
 from .misctypes import BasePath, Digest
-from .utils import USER_AGENT, check_dandi_version, ensure_datetime, is_interactive
+from .utils import (
+    USER_AGENT,
+    check_dandi_version,
+    chunked,
+    ensure_datetime,
+    is_interactive,
+)
 
 lgr = get_logger()
 
@@ -1412,6 +1420,17 @@ def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]:
         """
         return self.filetree.iterfiles(include_dirs=include_dirs)
 
+    def rmfiles(self, files: Iterable["RemoteZarrEntry"]) -> None:
+        """Delete one or more files from the Zarr"""
+        # Don't bother checking that the entries are actually files or even
+        # belong to this Zarr, as if they're not, the server will return an
+        # error anyway.
+        for entries in chunked(files, ZARR_DELETE_BATCH_SIZE):
+            self.client.delete(
+                f"/zarr/{self.zarr}/files/",
+                json=[{"path": str(e)} for e in entries],
+            )
+
 
 class RemoteAsset(BaseRemoteAsset):
     """
diff --git a/dandi/files.py b/dandi/files.py
index 9cea3fe69..14ae9d123 100644
--- a/dandi/files.py
+++ b/dandi/files.py
@@ -833,8 +833,9 @@ def iter_upload(
             json={"metadata": metadata, "zarr_id": zarr_id},
         )
         a = RemoteAsset.from_data(dandiset, r)
+        assert isinstance(a, RemoteZarrAsset)
         to_upload: List[dict] = []
-        to_delete: List[dict] = []
+        to_delete: List[RemoteZarrEntry] = []
         for p in stat.files:
             pdigest = p.get_digest().value
             item = {"path": str(p), "etag": pdigest}
@@ -851,8 +852,7 @@ def iter_upload(
                             pp,
                             p,
                         )
-                        old_zarr_entries.pop(pps)
-                        to_delete.append({"path": pps})
+                        to_delete.append(old_zarr_entries.pop(pps))
                         break
                 to_upload.append(item)
             else:
@@ -864,11 +864,9 @@ def iter_upload(
                     )
                     for ee in e.iterfiles():
                         try:
-                            old_zarr_entries.pop(str(ee))
+                            to_delete.append(old_zarr_entries.pop(str(ee)))
                         except KeyError:
                             pass
-                        else:
-                            to_delete.append({"path": str(ee)})
                     to_upload.append(item)
                 elif pdigest != e.get_digest().value:
                     lgr.debug(
@@ -880,7 +878,7 @@ def iter_upload(
             else:
                 lgr.debug("%s: File %s already on server; skipping", asset_path, p)
         if to_delete:
-            client.delete(f"/zarr/{zarr_id}/files/", json=to_delete)
+            a.rmfiles(to_delete)
         yield {"status": "initiating upload"}
         lgr.debug("%s: Beginning upload", asset_path)
         bytes_uploaded = 0
@@ -931,17 +929,14 @@ def iter_upload(
             lgr.debug("%s: Completing upload of batch #%d", asset_path, i)
             client.post(f"/zarr/{zarr_id}/upload/complete/")
         lgr.debug("%s: Upload completed", asset_path)
-        old_zarr_files = [k for k, e in old_zarr_entries.items() if e.is_file()]
+        old_zarr_files = [e for e in old_zarr_entries.values() if e.is_file()]
         if old_zarr_files:
             lgr.debug(
                 "%s: Deleting %s in remote Zarr not present locally",
                 asset_path,
                 pluralize(len(old_zarr_files), "file"),
             )
-            client.delete(
-                f"/zarr/{zarr_id}/files/",
-                json=[{"path": k} for k in old_zarr_files],
-            )
+            a.rmfiles(old_zarr_files)
         r = client.get(f"/zarr/{zarr_id}/")
         if r["checksum"] != filetag:
             raise RuntimeError(