Skip to content

Commit

Permalink
fix: notify master for core checkpoint deletes [MD-325] (#9415)
Browse files Browse the repository at this point in the history
  • Loading branch information
azhou-determined committed May 29, 2024
1 parent b96ccba commit d94e299
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
34 changes: 32 additions & 2 deletions harness/determined/core/_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,9 +677,31 @@ def _selector(path: str) -> bool:

def delete(self, storage_id: str) -> None:
"""
Delete a checkpoint from the storage backend.
Delete a checkpoint from the storage backend and notify the master.
"""
self._storage_manager.delete(storage_id, ["**/*"])
resources_deleted = self._storage_manager.delete(storage_id, ["**/*"])
self._report_checkpoint_deleted(storage_id=storage_id, resources=resources_deleted)

def _report_checkpoint_deleted(
self,
storage_id: str,
resources: Optional[Dict[str, int]] = None,
) -> None:
"""
After deleting a checkpoint, report deletion to the master.
"""
deleted_checkpoint = [
bindings.v1PatchCheckpoint(
uuid=storage_id,
resources=bindings.PatchCheckpointOptionalResources(
resources=resources, # type: ignore
),
)
]

bindings.patch_PatchCheckpoints(
self._session, body=bindings.v1PatchCheckpointsRequest(checkpoints=deleted_checkpoint)
)

def _write_metadata_file(self, ckpt_dir: str, metadata: Dict[str, Any]) -> None:
metadata_path = pathlib.Path(ckpt_dir).joinpath("metadata.json")
Expand Down Expand Up @@ -742,6 +764,14 @@ def _report_checkpoint(
# No master to report to; just log the event.
logger.info(f"saved checkpoint {storage_id}")

def _report_checkpoint_deleted(
self,
storage_id: str,
resources: Optional[Dict[str, int]] = None,
) -> None:
# No master to report to; just log the event.
logger.info(f"deleted checkpoint {storage_id}")

def get_metadata(self, storage_id: str) -> Dict[str, Any]:
# TODO: when the StorageManager supports downloading with a file filter, we should attempt
# to download metadata.json from the checkpoint and read it here.
Expand Down
10 changes: 10 additions & 0 deletions harness/tests/core/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def restore_path(
storage_manager.store_path = mock.MagicMock(side_effect=store_path)
storage_manager.restore_path = mock.MagicMock(side_effect=restore_path)
storage_manager._list_directory = mock.MagicMock(return_value={"one": 1, "two": 2})
storage_manager.delete = mock.MagicMock()

return storage_manager

Expand Down Expand Up @@ -145,6 +146,15 @@ def do_test() -> None:
storage_manager.restore_path.assert_called_once()
storage_manager.restore_path.reset_mock()

# Test delete.
if pex.distributed.rank == 0:
checkpoint_context.delete("ckpt-uuid")
if not dummy:
session._do_request.assert_called_once()
session._do_request.reset_mock()
storage_manager.delete.assert_called_once()
storage_manager.delete.reset_mock()


@pytest.mark.parametrize(
"resources,expected_merged,expected_conflicts",
Expand Down

0 comments on commit d94e299

Please sign in to comment.