Skip to content

Commit

Permalink
API: New copy / view semantics using Copy-on-Write (pandas-dev#46958)
Browse files Browse the repository at this point in the history
* API: New copy / view semantics using Copy-on-Write

* fix more tests

* Handle CoW in BM.iset

* Handle CoW in xs

* add bunch of todo comments and usage warnings

* Insert None ref in BM.insert

* Ensure to not assume copy_on_write is set in case of ArrayManager

* Handle refs in BM._combine / test CoW in select_dtypes

* handle get_numeric_data for single block manager

* fix test_internals (get_numeric_data now uses CoW)

* handle refs in consolidation

* fix deep=None for ArrayManager

* update copy/view tests from other PR

* clean-up fast_xs workarounds now it returns a SingleBlockManager

* tracks refs in to_frame

* fixup after updata main and column_setitem + iloc inplace setitem changes (pandas-devgh-45333)

* fix inplace fillna + fixup new tests

* address comments + update some todo comments

* Update pandas/core/internals/managers.py

Co-authored-by: Matthew Roeschke <emailformattr@gmail.com>

* fixup linting

* update new copy_view tests to use get_array helper

* add comment to setitem

* switch default to False, ensure CoW copies only happen when enabled + add additional test build with CoW

* update type annotations

* Fix stata issue to avoid SettingWithCopyWarning in read_stata

* update type + option comment

* fixup new rename test

Co-authored-by: Matthew Roeschke <emailformattr@gmail.com>
  • Loading branch information
2 people authored and CloseChoice committed Aug 21, 2022
1 parent c1ff44a commit 0157e90
Show file tree
Hide file tree
Showing 47 changed files with 1,019 additions and 291 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ jobs:
extra_apt: "language-pack-zh-hans"
lang: "zh_CN.utf8"
lc_all: "zh_CN.utf8"
- name: "Copy-on-Write"
env_file: actions-310.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
- name: "Data Manager"
env_file: actions-38.yaml
pattern: "not slow and not network and not single_cpu"
Expand Down Expand Up @@ -84,6 +88,7 @@ jobs:
LC_ALL: ${{ matrix.lc_all || '' }}
PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }}
PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
TEST_ARGS: ${{ matrix.test_args || '' }}
PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
Expand Down
13 changes: 9 additions & 4 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
import weakref

cimport cython
from cpython.slice cimport PySlice_GetIndicesEx
Expand Down Expand Up @@ -674,8 +675,9 @@ cdef class BlockManager:
public list axes
public bint _known_consolidated, _is_consolidated
public ndarray _blknos, _blklocs
public list refs

def __cinit__(self, blocks=None, axes=None, verify_integrity=True):
def __cinit__(self, blocks=None, axes=None, refs=None, verify_integrity=True):
# None as defaults for unpickling GH#42345
if blocks is None:
# This adds 1-2 microseconds to DataFrame(np.array([]))
Expand All @@ -687,6 +689,7 @@ cdef class BlockManager:

self.blocks = blocks
self.axes = axes.copy() # copy to make sure we are not remotely-mutable
self.refs = refs

# Populate known_consolidate, blknos, and blklocs lazily
self._known_consolidated = False
Expand Down Expand Up @@ -795,12 +798,14 @@ cdef class BlockManager:
ndarray blknos, blklocs

nbs = []
nrefs = []
for blk in self.blocks:
nb = blk.getitem_block_index(slobj)
nbs.append(nb)
nrefs.append(weakref.ref(blk))

new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)
mgr = type(self)(tuple(nbs), new_axes, nrefs, verify_integrity=False)

# We can avoid having to rebuild blklocs/blknos
blklocs = self._blklocs
Expand All @@ -813,7 +818,7 @@ cdef class BlockManager:
def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:

if axis == 0:
new_blocks = self._slice_take_blocks_ax0(slobj)
new_blocks, new_refs = self._slice_take_blocks_ax0(slobj)
elif axis == 1:
return self._get_index_slice(slobj)
else:
Expand All @@ -822,4 +827,4 @@ cdef class BlockManager:
new_axes = list(self.axes)
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)

return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
return type(self)(tuple(new_blocks), new_axes, new_refs, verify_integrity=False)
2 changes: 1 addition & 1 deletion pandas/compat/pickle_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def load_newobj(self):
arr = np.array([], dtype="m8[ns]")
obj = cls.__new__(cls, arr, arr.dtype)
elif cls is BlockManager and not args:
obj = cls.__new__(cls, (), [], False)
obj = cls.__new__(cls, (), [], None, False)
else:
obj = cls.__new__(cls, *args)

Expand Down
9 changes: 3 additions & 6 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@
from decimal import Decimal
import operator
import os
from typing import (
Callable,
Literal,
)
from typing import Callable

from dateutil.tz import (
tzlocal,
Expand Down Expand Up @@ -1838,8 +1835,8 @@ def using_array_manager():


@pytest.fixture
def using_copy_on_write() -> Literal[False]:
def using_copy_on_write() -> bool:
"""
Fixture to check if Copy-on-Write is enabled.
"""
return False
return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block"
20 changes: 20 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,26 @@ def use_inf_as_na_cb(key) -> None:
)


# TODO better name?
copy_on_write_doc = """
: bool
Use new copy-view behaviour using Copy-on-Write. Defaults to False,
unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable
(if set to "1" for True, needs to be set before pandas is imported).
"""


with cf.config_prefix("mode"):
cf.register_option(
"copy_on_write",
# Get the default from an environment variable, if set, otherwise defaults
# to False. This environment variable can be set for testing.
os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
copy_on_write_doc,
validator=is_bool,
)


# user warnings
chained_assignment = """
: string
Expand Down
18 changes: 12 additions & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3708,13 +3708,19 @@ def _get_column_array(self, i: int) -> ArrayLike:
"""
Get the values of the i'th column (ndarray or ExtensionArray, as stored
in the Block)
Warning! The returned array is a view but doesn't handle Copy-on-Write,
so this should be used with caution (for read-only purposes).
"""
return self._mgr.iget_values(i)

def _iter_column_arrays(self) -> Iterator[ArrayLike]:
"""
Iterate over the arrays of all columns in order.
This returns the values as stored in the Block (ndarray or ExtensionArray).
Warning! The returned array is a view but doesn't handle Copy-on-Write,
so this should be used with caution (for read-only purposes).
"""
for i in range(len(self.columns)):
yield self._get_column_array(i)
Expand Down Expand Up @@ -5147,7 +5153,7 @@ def set_axis(
"labels",
[
("method", None),
("copy", True),
("copy", None),
("level", None),
("fill_value", np.nan),
("limit", None),
Expand Down Expand Up @@ -5372,7 +5378,7 @@ def rename(
index: Renamer | None = ...,
columns: Renamer | None = ...,
axis: Axis | None = ...,
copy: bool = ...,
copy: bool | None = ...,
inplace: Literal[True],
level: Level = ...,
errors: IgnoreRaise = ...,
Expand All @@ -5387,7 +5393,7 @@ def rename(
index: Renamer | None = ...,
columns: Renamer | None = ...,
axis: Axis | None = ...,
copy: bool = ...,
copy: bool | None = ...,
inplace: Literal[False] = ...,
level: Level = ...,
errors: IgnoreRaise = ...,
Expand All @@ -5402,7 +5408,7 @@ def rename(
index: Renamer | None = ...,
columns: Renamer | None = ...,
axis: Axis | None = ...,
copy: bool = ...,
copy: bool | None = ...,
inplace: bool = ...,
level: Level = ...,
errors: IgnoreRaise = ...,
Expand All @@ -5416,7 +5422,7 @@ def rename(
index: Renamer | None = None,
columns: Renamer | None = None,
axis: Axis | None = None,
copy: bool = True,
copy: bool | None = None,
inplace: bool = False,
level: Level = None,
errors: IgnoreRaise = "ignore",
Expand Down Expand Up @@ -6290,7 +6296,7 @@ class max type
if inplace:
new_obj = self
else:
new_obj = self.copy()
new_obj = self.copy(deep=None)
if allow_duplicates is not lib.no_default:
allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")

Expand Down
16 changes: 10 additions & 6 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,7 @@ def _rename(
index: Renamer | None = None,
columns: Renamer | None = None,
axis: Axis | None = None,
copy: bool_t = True,
copy: bool_t | None = None,
inplace: bool_t = False,
level: Level | None = None,
errors: str = "ignore",
Expand Down Expand Up @@ -4161,6 +4161,12 @@ def _check_setitem_copy(self, t="setting", force=False):
df.iloc[0:5]['group'] = 'a'
"""
if (
config.get_option("mode.copy_on_write")
and config.get_option("mode.data_manager") == "block"
):
return

# return early if the check is not needed
if not (force or self._is_copy):
return
Expand Down Expand Up @@ -5261,7 +5267,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT:
axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
method = missing.clean_reindex_fill_method(kwargs.pop("method", None))
level = kwargs.pop("level", None)
copy = kwargs.pop("copy", True)
copy = kwargs.pop("copy", None)
limit = kwargs.pop("limit", None)
tolerance = kwargs.pop("tolerance", None)
fill_value = kwargs.pop("fill_value", None)
Expand All @@ -5286,9 +5292,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT:
for axis, ax in axes.items()
if ax is not None
):
if copy:
return self.copy()
return self
return self.copy(deep=copy)

# check if we are a multi reindex
if self._needs_reindex_multi(axes, method, level):
Expand Down Expand Up @@ -6265,7 +6269,7 @@ def astype(
return cast(NDFrameT, result)

@final
def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT:
"""
Make a copy of this object's indices and data.
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,11 @@ def copy(self: T, deep=True) -> T:
-------
BlockManager
"""
if deep is None:
# ArrayManager does not yet support CoW, so deep=None always means
# deep=True for now
deep = True

# this preserves the notion of view copying of axes
if deep:
# hit in e.g. tests.io.json.test_pandas
Expand Down Expand Up @@ -591,6 +596,11 @@ def _reindex_indexer(
pandas-indexer with -1's only.
"""
if copy is None:
# ArrayManager does not yet support CoW, so deep=None always means
# deep=True for now
copy = True

if indexer is None:
if new_axis is self._axes[axis] and not copy:
return self
Expand Down
11 changes: 9 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,17 +839,22 @@ def _slice(

return self.values[slicer]

def set_inplace(self, locs, values: ArrayLike) -> None:
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
"""
Modify block values in-place with new item value.
If copy=True, first copy the underlying values in place before modifying
(for Copy-on-Write).
Notes
-----
`set_inplace` never creates a new array or new Block, whereas `setitem`
_may_ create a new array and always creates a new Block.
Caller is responsible for checking values.dtype == self.dtype.
"""
if copy:
self.values = self.values.copy()
self.values[locs] = values

def take_nd(
Expand Down Expand Up @@ -1665,9 +1670,11 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]):
raise IndexError(f"{self} only contains one item")
return self.values

def set_inplace(self, locs, values: ArrayLike) -> None:
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
# When an ndarray, we should have locs.tolist() == [0]
# When a BlockPlacement we should have list(locs) == [0]
if copy:
self.values = self.values.copy()
self.values[:] = values

def _maybe_squeeze_arg(self, arg):
Expand Down
Loading

0 comments on commit 0157e90

Please sign in to comment.