From 0be233f27d0b3d2572c156244dbfc3defbeac98c Mon Sep 17 00:00:00 2001 From: Jonathan Langlois Date: Thu, 24 Aug 2023 09:24:05 +0900 Subject: [PATCH 1/2] fix: avoid copying files with same name prefix --- gcsfs/core.py | 15 +++------------ gcsfs/tests/test_core.py | 7 ++++--- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index e8098bf3..a8364f91 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -835,18 +835,6 @@ async def _info(self, path, generation=None, **kwargs): else: raise FileNotFoundError(path) - async def _glob(self, path, prefix="", **kwargs): - if not prefix: - # Identify pattern prefixes. Ripped from fsspec.spec.AbstractFileSystem.glob and matches - # the glob.has_magic patterns. - indstar = path.find("*") if path.find("*") >= 0 else len(path) - indques = path.find("?") if path.find("?") >= 0 else len(path) - indbrace = path.find("[") if path.find("[") >= 0 else len(path) - - ind = min(indstar, indques, indbrace) - prefix = path[:ind].split("/")[-1] - return await super()._glob(path, prefix=prefix, **kwargs) - async def _ls(self, path, detail=False, prefix="", versions=False, **kwargs): """List objects under the given '/{bucket}/{prefix} path.""" path = self._strip_protocol(path).rstrip("/") @@ -1252,6 +1240,9 @@ async def _find( else: _prefix = key + if _prefix != "" and await self._isdir(f"{bucket}/{_prefix}"): + _prefix = _prefix.rstrip("/") + "/" + objects, _ = await self._do_list_objects( bucket, delimiter="", prefix=_prefix, versions=versions ) diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 51956e80..5b0e6761 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -285,10 +285,11 @@ def test_gcs_glob(gcs): for f in gcs.glob(TEST_BUCKET + "/nested/*") if gcs.isfile(f) ) + # the following is no longer true since the glob method list the root path # Ensure the glob only fetches prefixed folders - gcs.dircache.clear() - gcs.glob(TEST_BUCKET + "/nested**1") - assert all(d.startswith(TEST_BUCKET + "/nested") for d in gcs.dircache) + # gcs.dircache.clear() + # gcs.glob(TEST_BUCKET + "/nested**1") + # assert all(d.startswith(TEST_BUCKET + "/nested") for d in gcs.dircache) # the following is no longer true as of #437 # gcs.glob(TEST_BUCKET + "/test*") # assert TEST_BUCKET + "/test" in gcs.dircache From d1ff25f9870a5a09f287a68bd52183a699c41dfd Mon Sep 17 00:00:00 2001 From: Jonathan Langlois Date: Thu, 31 Aug 2023 08:38:58 +0900 Subject: [PATCH 2/2] feat: avoid calling isdir before fetching objects --- gcsfs/core.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 09ba37b9..97ecb697 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -1387,24 +1387,27 @@ async def _find( **kwargs, ): path = self._strip_protocol(path) - bucket, key, generation = self.split_path(path) if maxdepth is not None and maxdepth < 1: raise ValueError("maxdepth must be at least 1") - if prefix: - _path = "" if not key else key.rstrip("/") + "/" - _prefix = f"{_path}{prefix}" - else: - _prefix = key - - if _prefix != "" and await self._isdir(f"{bucket}/{_prefix}"): - _prefix = _prefix.rstrip("/") + "/" - + # Fetch objects as if the path is a directory objects, _ = await self._do_list_objects( - bucket, delimiter="", prefix=_prefix, versions=versions + path, delimiter="", prefix=prefix, versions=versions ) + if not objects: + # Fetch objects as if the path is a file + bucket, key, _ = self.split_path(path) + if prefix: + _path = "" if not key else key.rstrip("/") + "/" + _prefix = f"{_path}{prefix}" + else: + _prefix = key + objects, _ = await self._do_list_objects( + bucket, delimiter="", prefix=_prefix, versions=versions + ) + dirs = {} cache_entries = {}