Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] GH-89727: Fix shutil.rmtree() recursion error on deep trees. #103164

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
17f02bb
GH-89727: Fix `shutil.rmtree()` recursion error on deep trees.
barneygale Apr 1, 2023
48dc5db
Delay urllib import
barneygale Apr 1, 2023
6e92b27
Fix up filenames in exceptions
barneygale Apr 1, 2023
c042220
Restore `shutil._rmtree_islink()`
barneygale Apr 1, 2023
e268e89
Tidy up exception handling
barneygale Apr 1, 2023
3f40cb0
Better compatibility with shutil tests.
barneygale Apr 1, 2023
b59bda8
Move main implementation into pathlib
barneygale Apr 1, 2023
c174376
Make `_fwalk()` private, tidy up naming.
barneygale Apr 1, 2023
0e8d8e6
Fix tests
barneygale Apr 1, 2023
fea58ab
Fix missing filename in exception, tidy up more naming.
barneygale Apr 1, 2023
bff0f5e
More cleanup
barneygale Apr 1, 2023
3b21684
Even more tidy-up
barneygale Apr 1, 2023
5abee55
Reduce diff a bit
barneygale Apr 2, 2023
57173d9
Performance improvements
barneygale Apr 2, 2023
58070d9
Simplify rmtree (h/t eryksun)
barneygale Apr 2, 2023
d61baa1
More performance tweaks
barneygale Apr 2, 2023
e983dd7
Make `error._func` private, as it's only intended for `shutil.rmtree()`
barneygale Apr 3, 2023
5387d32
Improve tests
barneygale Apr 3, 2023
75541fb
Improve performance
barneygale Apr 3, 2023
698db60
Merge branch 'main' into pathlib-fwalk
barneygale Apr 3, 2023
74cdd6c
Supply relative paths to `rmdir()` in fd-based walk.
barneygale Apr 7, 2023
c17d07c
Handle rmdir() and unlink() not accepting dir_fd
barneygale Apr 7, 2023
b4f120c
Move rmtree() implementation back into shutil.
barneygale Apr 12, 2023
e856ccb
Restore missing dir_fd check
barneygale Apr 12, 2023
d1e349c
Remove unnecessary change to test
barneygale Apr 12, 2023
63e55e1
Reduce memory usage a bit.
barneygale Apr 12, 2023
f7c5352
Reduce diff
barneygale Apr 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 97 additions & 42 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from _collections_abc import Sequence
from errno import ENOENT, ENOTDIR, EBADF, ELOOP
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO
from urllib.parse import quote_from_bytes as urlquote_from_bytes


__all__ = [
Expand Down Expand Up @@ -203,6 +202,64 @@ def _select_from(self, parent_path, is_dir, exists, scandir, normcase):
return


class _WalkAction:
WALK = object()
YIELD = object()
CLOSE = object()


def _walk(top_down, on_error, follow_symlinks, follow_junctions, use_fd, actions):
while actions:
action, value = actions.pop()
try:
if action is _WalkAction.WALK:
path, dir_fd, entry = value
dirnames = []
filenames = []
try:
if use_fd:
scandir_it, fd = path._scandir_fwalk(
follow_symlinks, actions, dir_fd, entry)
result = path, dirnames, filenames, fd
else:
scandir_it, fd = path._scandir(), None
result = path, dirnames, filenames
except OSError as error:
if not hasattr(error, '_func'):
error._func = os.scandir
error.filename = str(path)
raise error
if not top_down:
actions.append((_WalkAction.YIELD, result))
with scandir_it:
for entry in scandir_it:
try:
if entry.is_dir(follow_symlinks=follow_symlinks) and (
follow_junctions or not entry.is_junction()):
if not top_down:
actions.append((_WalkAction.WALK, (
path._make_child_relpath(entry.name), fd,
entry if use_fd and not follow_symlinks else None)))
dirnames.append(entry.name)
else:
filenames.append(entry.name)
except OSError:
filenames.append(entry.name)
if top_down:
yield result
for dirname in reversed(dirnames):
actions.append((_WalkAction.WALK, (
path._make_child_relpath(dirname), fd, None)))
elif action is _WalkAction.YIELD:
yield value
elif action is _WalkAction.CLOSE:
os.close(value)
else:
raise AssertionError(f"unknown walk action: {action}")
except OSError as error:
if on_error is not None:
on_error(error)

#
# Public API
#
Expand Down Expand Up @@ -372,7 +429,9 @@ def as_uri(self):
# It's a posix path => 'file:///etc/hosts'
prefix = 'file://'
path = str(self)
return prefix + urlquote_from_bytes(os.fsencode(path))

from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path))

@property
def _parts_normcase(self):
Expand Down Expand Up @@ -1205,50 +1264,46 @@ def expanduser(self):

return self

def walk(self, top_down=True, on_error=None, follow_symlinks=False):
def walk(self, top_down=True, on_error=None, follow_symlinks=False, follow_junctions=True):
"""Walk the directory tree from this directory, similar to os.walk()."""
sys.audit("pathlib.Path.walk", self, on_error, follow_symlinks)
paths = [self]

while paths:
path = paths.pop()
if isinstance(path, tuple):
yield path
continue

# We may not have read permission for self, in which case we can't
# get a list of the files the directory contains. os.walk()
# always suppressed the exception in that instance, rather than
# blow up for a minor reason when (say) a thousand readable
# directories are still left to visit. That logic is copied here.
actions = [(_WalkAction.WALK, (self, None, None))]
return _walk(top_down, on_error, follow_symlinks, follow_junctions, False, actions)

if {os.stat, os.open} <= os.supports_dir_fd and {os.stat, os.scandir} <= os.supports_fd:
def _fwalk(self, top_down=True, *, on_error=None, follow_symlinks=False, dir_fd=None):
"""Walk the directory tree from this directory, similar to os.fwalk()."""
sys.audit("pathlib.Path._fwalk", self, on_error, follow_symlinks, dir_fd)
actions = [(_WalkAction.WALK, (self, dir_fd, None))]
try:
scandir_it = path._scandir()
except OSError as error:
if on_error is not None:
on_error(error)
continue

with scandir_it:
dirnames = []
filenames = []
for entry in scandir_it:
try:
is_dir = entry.is_dir(follow_symlinks=follow_symlinks)
except OSError:
# Carried over from os.path.isdir().
is_dir = False

if is_dir:
dirnames.append(entry.name)
else:
filenames.append(entry.name)

if top_down:
yield path, dirnames, filenames
return _walk(top_down, on_error, follow_symlinks, True, True, actions)
finally:
for action, value in reversed(actions):
if action is _WalkAction.CLOSE:
try:
os.close(value)
except OSError:
pass

def _scandir_fwalk(self, follow_symlinks, actions, dir_fd, entry):
name = self if dir_fd is None else self.name
if follow_symlinks:
fd = os.open(name, os.O_RDONLY, dir_fd=dir_fd)
actions.append((_WalkAction.CLOSE, fd))
else:
paths.append((path, dirnames, filenames))

paths += [path._make_child_relpath(d) for d in reversed(dirnames)]
# Note: To guard against symlink races, we use the standard
# lstat()/open()/fstat() trick.
if entry is None:
orig_st = os.stat(name, follow_symlinks=False, dir_fd=dir_fd)
else:
orig_st = entry.stat(follow_symlinks=False)
fd = os.open(name, os.O_RDONLY, dir_fd=dir_fd)
actions.append((_WalkAction.CLOSE, fd))
if not os.path.samestat(orig_st, os.stat(fd)):
error = NotADirectoryError("Cannot walk into a symbolic link")
error._func = os.path.islink
raise error
return os.scandir(fd), fd


class PosixPath(Path, PurePosixPath):
Expand Down
173 changes: 38 additions & 135 deletions Lib/shutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import fnmatch
import collections
import errno
import pathlib
import warnings

try:
Expand Down Expand Up @@ -562,116 +563,49 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
ignore_dangling_symlinks=ignore_dangling_symlinks,
dirs_exist_ok=dirs_exist_ok)

if hasattr(os.stat_result, 'st_file_attributes'):
def _rmtree_islink(path):
try:
st = os.lstat(path)
return (stat.S_ISLNK(st.st_mode) or
(st.st_file_attributes & stat.FILE_ATTRIBUTE_REPARSE_POINT
and st.st_reparse_tag == stat.IO_REPARSE_TAG_MOUNT_POINT))
except OSError:
return False
else:
def _rmtree_islink(path):
return os.path.islink(path)

# version vulnerable to race conditions
def _rmtree_unsafe(path, onexc):
try:
with os.scandir(path) as scandir_it:
entries = list(scandir_it)
except OSError as err:
onexc(os.scandir, path, err)
entries = []
for entry in entries:
fullname = entry.path
try:
is_dir = entry.is_dir(follow_symlinks=False)
except OSError:
is_dir = False

if is_dir and not entry.is_junction():
walker = path.walk(
top_down=False,
follow_symlinks=False,
follow_junctions=False,
on_error=lambda err: onexc(err._func, err.filename, err))
for dirpath, _, filenames in walker:
for filename in filenames:
try:
if entry.is_symlink():
# This can only happen if someone replaces
# a directory with a symlink after the call to
# os.scandir or entry.is_dir above.
raise OSError("Cannot call rmtree on a symbolic link")
dirpath._make_child_relpath(filename).unlink()
except OSError as err:
onexc(os.path.islink, fullname, err)
continue
_rmtree_unsafe(fullname, onexc)
else:
try:
os.unlink(fullname)
except OSError as err:
onexc(os.unlink, fullname, err)
try:
os.rmdir(path)
except OSError as err:
onexc(os.rmdir, path, err)
onexc(os.unlink, err.filename, err)
try:
dirpath.rmdir()
except OSError as err:
onexc(os.rmdir, err.filename, err)

# Version using fd-based APIs to protect against races
def _rmtree_safe_fd(topfd, path, onexc):
try:
with os.scandir(topfd) as scandir_it:
entries = list(scandir_it)
except OSError as err:
err.filename = path
onexc(os.scandir, path, err)
return
for entry in entries:
fullname = os.path.join(path, entry.name)
try:
is_dir = entry.is_dir(follow_symlinks=False)
except OSError:
is_dir = False
else:
if is_dir:
try:
orig_st = entry.stat(follow_symlinks=False)
is_dir = stat.S_ISDIR(orig_st.st_mode)
except OSError as err:
onexc(os.lstat, fullname, err)
continue
if is_dir:
walker = path._fwalk(
top_down=False,
follow_symlinks=False,
on_error=lambda err: onexc(err._func, err.filename, err),
dir_fd=topfd)
for dirpath, dirnames, filenames, fd in walker:
for filename in filenames:
try:
dirfd = os.open(entry.name, os.O_RDONLY, dir_fd=topfd)
dirfd_closed = False
os.unlink(filename, dir_fd=fd)
except OSError as err:
onexc(os.open, fullname, err)
else:
try:
if os.path.samestat(orig_st, os.fstat(dirfd)):
_rmtree_safe_fd(dirfd, fullname, onexc)
try:
os.close(dirfd)
dirfd_closed = True
os.rmdir(entry.name, dir_fd=topfd)
except OSError as err:
onexc(os.rmdir, fullname, err)
else:
try:
# This can only happen if someone replaces
# a directory with a symlink after the call to
# os.scandir or stat.S_ISDIR above.
raise OSError("Cannot call rmtree on a symbolic "
"link")
except OSError as err:
onexc(os.path.islink, fullname, err)
finally:
if not dirfd_closed:
os.close(dirfd)
else:
onexc(os.unlink, str(dirpath._make_child_relpath(filename)), err)
for dirname in dirnames:
try:
os.rmdir(dirname, dir_fd=fd)
except OSError as err:
onexc(os.rmdir, str(dirpath._make_child_relpath(dirname)), err)
if dirpath == path:
try:
os.unlink(entry.name, dir_fd=topfd)
os.rmdir(path, dir_fd=topfd)
except OSError as err:
onexc(os.unlink, fullname, err)
onexc(os.rmdir, str(path), err)

_use_fd_functions = ({os.open, os.stat, os.unlink, os.rmdir} <=
os.supports_dir_fd and
os.scandir in os.supports_fd and
os.stat in os.supports_follow_symlinks)
_use_fd_functions = hasattr(pathlib.Path, '_fwalk') and {os.unlink, os.rmdir} <= os.supports_dir_fd

def rmtree(path, ignore_errors=False, onerror=None, *, onexc=None, dir_fd=None):
"""Recursively delete a directory tree.
Expand Down Expand Up @@ -718,51 +652,20 @@ def onexc(*args):
else:
exc_info = type(exc), exc, exc.__traceback__
return onerror(func, path, exc_info)

if isinstance(path, bytes):
path = os.fsdecode(path)
path = pathlib.Path(path)
if _use_fd_functions:
# While the unsafe rmtree works fine on bytes, the fd based does not.
if isinstance(path, bytes):
path = os.fsdecode(path)
# Note: To guard against symlink races, we use the standard
# lstat()/open()/fstat() trick.
try:
orig_st = os.lstat(path, dir_fd=dir_fd)
except Exception as err:
onexc(os.lstat, path, err)
return
try:
fd = os.open(path, os.O_RDONLY, dir_fd=dir_fd)
fd_closed = False
except Exception as err:
onexc(os.open, path, err)
return
try:
if os.path.samestat(orig_st, os.fstat(fd)):
_rmtree_safe_fd(fd, path, onexc)
try:
os.close(fd)
fd_closed = True
os.rmdir(path, dir_fd=dir_fd)
except OSError as err:
onexc(os.rmdir, path, err)
else:
try:
# symlinks to directories are forbidden, see bug #1669
raise OSError("Cannot call rmtree on a symbolic link")
except OSError as err:
onexc(os.path.islink, path, err)
finally:
if not fd_closed:
os.close(fd)
return _rmtree_safe_fd(dir_fd, path, onexc)
else:
if dir_fd is not None:
barneygale marked this conversation as resolved.
Show resolved Hide resolved
raise NotImplementedError("dir_fd unavailable on this platform")
try:
if _rmtree_islink(path):
if path.is_symlink() or path.is_junction():
# symlinks to directories are forbidden, see bug #1669
raise OSError("Cannot call rmtree on a symbolic link")
except OSError as err:
onexc(os.path.islink, path, err)
onexc(os.path.islink, str(path), err)
# can't continue even if onexc hook returns
return
return _rmtree_unsafe(path, onexc)
Expand Down
Loading